Fix SAA DMI parser to match real DMI.txt format

Replace the guessed pipe/key=value parser with the correct format documented in SAA User Guide 4.8.1:

    [Section] Item Name {SHN} = "value" // comment

The parser handles string values (stripping the surrounding quotes), non-string values (UUID, hex), section headers used as display names, the version line, and // comments. Verified against the SAA 1.5.0 User Guide sample.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Add SAA DMI editor to Tools page
2026-06-18 15:58:02 +03:00 · 2026-06-18 15:50:42 +03:00 · 2026-06-18 14:21:24 +03:00 · 2026-06-18 11:00:02 +03:00 · 2026-06-18 10:11:52 +03:00 · 2026-06-15 16:07:54 +03:00
162 changed files with 18410 additions and 5508 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 .env
 .DS_Store
 dist/
-iso/out/
 build-cache/
+audit/bee
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -2,6 +2,7 @@ package main

 import (
 	"context"
+	"errors"
 	"flag"
 	"fmt"
 	"io"
@@ -63,14 +64,20 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 		return runExport(args[1:], stdout, stderr)
 	case "preflight":
 		return runPreflight(args[1:], stdout, stderr)
+	case "install-to-ram":
+		return runInstallToRAM(args[1:], stdout, stderr)
 	case "support-bundle":
 		return runSupportBundle(args[1:], stdout, stderr)
 	case "web":
 		return runWeb(args[1:], stdout, stderr)
+	case "blackbox":
+		return runBlackbox(args[1:], stdout, stderr)
 	case "sat":
 		return runSAT(args[1:], stdout, stderr)
 	case "benchmark":
 		return runBenchmark(args[1:], stdout, stderr)
+	case "bee-worker":
+		return runBeeWorker(args[1:], stdout, stderr)
 	case "version", "--version", "-version":
 		fmt.Fprintln(stdout, Version)
 		return 0
@@ -85,11 +92,14 @@ func printRootUsage(w io.Writer) {
 	fmt.Fprintln(w, `bee commands:
  bee audit   --runtime auto|local|livecd --output stdout|file:<path>
  bee preflight --output stdout|file:<path>
+  bee install-to-ram
  bee export  --target <device>
  bee support-bundle --output stdout|file:<path>
  bee web     --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
+  bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
  bee sat nvidia|memory|storage|cpu [--duration <seconds>]
  bee benchmark nvidia [--profile standard|stability|overnight]
+  bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
  bee version
  bee help [command]`)
 }
@@ -102,14 +112,20 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 		return runExport([]string{"--help"}, stdout, stdout)
 	case "preflight":
 		return runPreflight([]string{"--help"}, stdout, stdout)
+	case "install-to-ram":
+		return runInstallToRAM([]string{"--help"}, stdout, stdout)
 	case "support-bundle":
 		return runSupportBundle([]string{"--help"}, stdout, stdout)
 	case "web":
 		return runWeb([]string{"--help"}, stdout, stdout)
+	case "blackbox":
+		return runBlackbox([]string{"--help"}, stdout, stdout)
 	case "sat":
 		return runSAT([]string{"--help"}, stdout, stderr)
 	case "benchmark":
 		return runBenchmark([]string{"--help"}, stdout, stderr)
+	case "bee-worker":
+		return runBeeWorker([]string{"--help"}, stdout, stderr)
 	case "version":
 		fmt.Fprintln(stdout, "usage: bee version")
 		return 0
@@ -241,6 +257,32 @@ func runPreflight(args []string, stdout, stderr io.Writer) int {
 	return 0
 }

+func runInstallToRAM(args []string, stdout, stderr io.Writer) int {
+	fs := flag.NewFlagSet("install-to-ram", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+	fs.Usage = func() {
+		fmt.Fprintln(stderr, "usage: bee install-to-ram")
+	}
+	if err := fs.Parse(args); err != nil {
+		if err == flag.ErrHelp {
+			return 0
+		}
+		return 2
+	}
+	if fs.NArg() != 0 {
+		fs.Usage()
+		return 2
+	}
+
+	application := app.New(platform.New())
+	logLine := func(s string) { fmt.Fprintln(stdout, s) }
+	if err := application.RunInstallToRAM(context.Background(), logLine); err != nil {
+		slog.Error("run install-to-ram", "err", err)
+		return 1
+	}
+	return 0
+}
+
 func runSupportBundle(args []string, stdout, stderr io.Writer) int {
 	fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
 	fs.SetOutput(stderr)
@@ -335,6 +377,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
 	return 0
 }

+func runBlackbox(args []string, stdout, stderr io.Writer) int {
+	fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+	exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
+	statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
+	fs.Usage = func() {
+		fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		if err == flag.ErrHelp {
+			return 0
+		}
+		return 2
+	}
+	if fs.NArg() != 0 {
+		fs.Usage()
+		return 2
+	}
+	slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
+	if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
+		slog.Error("run blackbox", "err", err)
+		return 1
+	}
+	return 0
+}
+
 func runSAT(args []string, stdout, stderr io.Writer) int {
 	if len(args) == 0 {
 		fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
@@ -462,6 +531,28 @@ func runBenchmark(args []string, stdout, stderr io.Writer) int {
 	return 0
 }

+func runBeeWorker(args []string, stdout, stderr io.Writer) int {
+	fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+	exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
+	taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
+	fs.Usage = func() {
+		fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		if err == flag.ErrHelp {
+			return 0
+		}
+		return 2
+	}
+	if fs.NArg() != 0 {
+		fs.Usage()
+		return 2
+	}
+	return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
+}
+
 func parseBenchmarkIndexCSV(raw string) ([]int, error) {
 	raw = strings.TrimSpace(raw)
 	if raw == "" {
--- a/audit/go.mod
+++ b/audit/go.mod
@@ -5,22 +5,18 @@ go 1.25.0
 replace reanimator/chart => ../internal/chart

 require (
-	github.com/go-analyze/charts v0.5.26
+	modernc.org/sqlite v1.48.0
 	reanimator/chart v0.0.0-00010101000000-000000000000
 )

 require (
 	github.com/dustin/go-humanize v1.0.1 // indirect
-	github.com/go-analyze/bulk v0.1.3 // indirect
-	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/ncruces/go-strftime v1.0.0 // indirect
 	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
-	golang.org/x/image v0.24.0 // indirect
 	golang.org/x/sys v0.42.0 // indirect
-	modernc.org/libc v1.70.0 // indirect
+	modernc.org/libc v1.72.0 // indirect
 	modernc.org/mathutil v1.7.1 // indirect
 	modernc.org/memory v1.11.0 // indirect
-	modernc.org/sqlite v1.48.0 // indirect
 )
--- a/audit/go.sum
+++ b/audit/go.sum
@@ -1,37 +1,51 @@
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
 github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
-github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
-github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
-github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
-github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
-github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
-github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
+github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
+github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
+github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
 github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
-github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
-github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
-golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
-golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
+golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
+golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
 golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
-gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
-modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
+golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
+golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
+modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
+modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
+modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
+modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
+modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
+modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
+modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
+modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
+modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
+modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
+modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
+modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
+modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
+modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
 modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
 modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
 modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
 modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
+modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
+modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
+modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
+modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
 modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
 modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
+modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
+modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
+modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
+modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
--- a/audit/internal/app/app_format.go
+++ b/audit/internal/app/app_format.go
@@ -0,0 +1,405 @@
+package app
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"bee/audit/internal/collector"
+	"bee/audit/internal/platform"
+	"bee/audit/internal/schema"
+)
+
+func hostnameOr(fallback string) string {
+	hn, err := os.Hostname()
+	if err != nil || strings.TrimSpace(hn) == "" {
+		return fallback
+	}
+	return hn
+}
+
+func sanitizeFilename(v string) string {
+	var out []rune
+	for _, r := range v {
+		switch {
+		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '-', r == '_', r == '.':
+			out = append(out, r)
+		default:
+			out = append(out, '-')
+		}
+	}
+	if len(out) == 0 {
+		return "unknown"
+	}
+	return string(out)
+}
+
+func bodyOr(body, fallback string) string {
+	body = strings.TrimSpace(body)
+	if body == "" {
+		return fallback
+	}
+	return body
+}
+
+func trimPtr(value *string) string {
+	if value == nil {
+		return ""
+	}
+	return strings.TrimSpace(*value)
+}
+
+func joinSortedKeys(values map[string]struct{}) string {
+	if len(values) == 0 {
+		return ""
+	}
+	keys := make([]string, 0, len(values))
+	for key := range values {
+		keys = append(keys, key)
+	}
+	sort.Strings(keys)
+	return strings.Join(keys, "/")
+}
+
+func humanizeMB(totalMB int) string {
+	if totalMB <= 0 {
+		return ""
+	}
+	gb := float64(totalMB) / 1024.0
+	if gb >= 1024.0 {
+		tb := gb / 1024.0
+		return fmt.Sprintf("%.1f TB", tb)
+	}
+	if gb == float64(int64(gb)) {
+		return fmt.Sprintf("%.0f GB", gb)
+	}
+	return fmt.Sprintf("%.1f GB", gb)
+}
+
+func humanizeGB(totalGB int) string {
+	if totalGB <= 0 {
+		return ""
+	}
+	tb := float64(totalGB) / 1024.0
+	if tb >= 1.0 {
+		return fmt.Sprintf("%.1f TB", tb)
+	}
+	return fmt.Sprintf("%d GB", totalGB)
+}
+
+func parseKeyValueSummary(raw string) map[string]string {
+	out := map[string]string{}
+	for _, line := range strings.Split(raw, "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		key, value, ok := strings.Cut(line, "=")
+		if !ok {
+			continue
+		}
+		out[strings.TrimSpace(key)] = strings.TrimSpace(value)
+	}
+	return out
+}
+
+func firstNonEmpty(values ...string) string {
+	for _, value := range values {
+		value = strings.TrimSpace(value)
+		if value != "" {
+			return value
+		}
+	}
+	return ""
+}
+
+func cleanSummaryKey(key string) string {
+	idx := strings.Index(key, "-")
+	if idx <= 0 {
+		return key
+	}
+	prefix := key[:idx]
+	for _, c := range prefix {
+		if c < '0' || c > '9' {
+			return key
+		}
+	}
+	return key[idx+1:]
+}
+
+func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
+	// Exclude Aspeed BMC VGA adapters (not compute GPUs).
+	if dev.VendorID != nil && *dev.VendorID == collector.AspeedVendorID {
+		return false
+	}
+	class := trimPtr(dev.DeviceClass)
+	// AMD Instinct / Radeon compute GPUs always carry ProcessingAccelerator or DisplayController.
+	// Do NOT match AMD vendor alone — CPU chipset PCIe devices share that vendor ID.
+	if class == "VideoController" || class == "DisplayController" || class == "ProcessingAccelerator" {
+		return true
+	}
+	// NVIDIA devices sometimes expose class values outside the standard GPU set.
+	return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
+}
+
+func formatSystemLine(board schema.HardwareBoard) string {
+	model := strings.TrimSpace(strings.Join([]string{
+		trimPtr(board.Manufacturer),
+		trimPtr(board.ProductName),
+	}, " "))
+	serial := strings.TrimSpace(board.SerialNumber)
+	switch {
+	case model != "" && serial != "":
+		return fmt.Sprintf("System: %s | S/N %s", model, serial)
+	case model != "":
+		return "System: " + model
+	case serial != "":
+		return "System S/N: " + serial
+	default:
+		return ""
+	}
+}
+
+func formatCPULine(cpus []schema.HardwareCPU) string {
+	if len(cpus) == 0 {
+		return ""
+	}
+	modelCounts := map[string]int{}
+	unknown := 0
+	for _, cpu := range cpus {
+		model := trimPtr(cpu.Model)
+		if model == "" {
+			unknown++
+			continue
+		}
+		modelCounts[model]++
+	}
+	if len(modelCounts) == 1 && unknown == 0 {
+		for model, count := range modelCounts {
+			return fmt.Sprintf("CPU: %d x %s", count, model)
+		}
+	}
+	parts := make([]string, 0, len(modelCounts)+1)
+	if len(modelCounts) > 0 {
+		keys := make([]string, 0, len(modelCounts))
+		for key := range modelCounts {
+			keys = append(keys, key)
+		}
+		sort.Strings(keys)
+		for _, key := range keys {
+			parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key))
+		}
+	}
+	if unknown > 0 {
+		parts = append(parts, fmt.Sprintf("%d x unknown", unknown))
+	}
+	return "CPU: " + strings.Join(parts, ", ")
+}
+
+func formatMemoryLine(dimms []schema.HardwareMemory) string {
+	totalMB := 0
+	present := 0
+	types := map[string]struct{}{}
+	for _, dimm := range dimms {
+		if dimm.Present != nil && !*dimm.Present {
+			continue
+		}
+		if dimm.SizeMB == nil || *dimm.SizeMB <= 0 {
+			continue
+		}
+		present++
+		totalMB += *dimm.SizeMB
+		if value := trimPtr(dimm.Type); value != "" {
+			types[value] = struct{}{}
+		}
+	}
+	if totalMB == 0 {
+		return ""
+	}
+	typeText := joinSortedKeys(types)
+	line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB))
+	if typeText != "" {
+		line += " " + typeText
+	}
+	if present > 0 {
+		line += fmt.Sprintf(" (%d DIMMs)", present)
+	}
+	return line
+}
+
+func formatStorageLine(disks []schema.HardwareStorage) string {
+	count := 0
+	totalGB := 0
+	for _, disk := range disks {
+		if disk.Present != nil && !*disk.Present {
+			continue
+		}
+		count++
+		if disk.SizeGB != nil && *disk.SizeGB > 0 {
+			totalGB += *disk.SizeGB
+		}
+	}
+	if count == 0 {
+		return ""
+	}
+	line := fmt.Sprintf("Storage: %d drives", count)
+	if totalGB > 0 {
+		line += fmt.Sprintf(" / %s", humanizeGB(totalGB))
+	}
+	return line
+}
+
+func formatGPULine(devices []schema.HardwarePCIeDevice) string {
+	gpus := map[string]int{}
+	for _, dev := range devices {
+		if !isGPUDevice(dev) {
+			continue
+		}
+		name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown")
+		gpus[name]++
+	}
+	if len(gpus) == 0 {
+		return ""
+	}
+	keys := make([]string, 0, len(gpus))
+	for key := range gpus {
+		keys = append(keys, key)
+	}
+	sort.Strings(keys)
+	parts := make([]string, 0, len(keys))
+	for _, key := range keys {
+		parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key))
+	}
+	return "GPU: " + strings.Join(parts, ", ")
+}
+
+func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
+	if list == nil {
+		return ""
+	}
+	ifaces, err := list()
+	if err != nil {
+		return ""
+	}
+	seen := map[string]struct{}{}
+	var ips []string
+	for _, iface := range ifaces {
+		for _, ip := range iface.IPv4 {
+			ip = strings.TrimSpace(ip)
+			if ip == "" {
+				continue
+			}
+			if _, ok := seen[ip]; ok {
+				continue
+			}
+			seen[ip] = struct{}{}
+			ips = append(ips, ip)
+		}
+	}
+	if len(ips) == 0 {
+		return ""
+	}
+	sort.Strings(ips)
+	return "IP: " + strings.Join(ips, ", ")
+}
+
+func formatSATDetail(raw string) string {
+	var b strings.Builder
+	kv := parseKeyValueSummary(raw)
+
+	if t, ok := kv["run_at_utc"]; ok {
+		fmt.Fprintf(&b, "Run: %s\n\n", t)
+	}
+
+	lines := strings.Split(raw, "\n")
+	var stepKeys []string
+	seenStep := map[string]bool{}
+	for _, line := range lines {
+		if idx := strings.Index(line, "_status="); idx >= 0 {
+			key := line[:idx]
+			if !seenStep[key] && key != "overall" {
+				seenStep[key] = true
+				stepKeys = append(stepKeys, key)
+			}
+		}
+	}
+
+	for _, key := range stepKeys {
+		status := kv[key+"_status"]
+		display := cleanSummaryKey(key)
+		switch status {
+		case "OK":
+			fmt.Fprintf(&b, "PASS  %s\n", display)
+		case "FAILED":
+			fmt.Fprintf(&b, "FAIL  %s\n", display)
+		case "UNSUPPORTED":
+			fmt.Fprintf(&b, "SKIP  %s\n", display)
+		default:
+			fmt.Fprintf(&b, "?     %s\n", display)
+		}
+	}
+
+	if overall, ok := kv["overall_status"]; ok {
+		ok2 := kv["job_ok"]
+		failed := kv["job_failed"]
+		fmt.Fprintf(&b, "\nOverall: %s  (ok=%s  failed=%s)", overall, ok2, failed)
+	}
+
+	return strings.TrimSpace(b.String())
+}
+
+func formatSATSummary(label, raw string) string {
+	values := parseKeyValueSummary(raw)
+	var body strings.Builder
+	fmt.Fprintf(&body, "%s:", label)
+	if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" {
+		fmt.Fprintf(&body, " %s", overall)
+	}
+	if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" {
+		fmt.Fprintf(&body, " ok=%s", ok)
+	}
+	if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" {
+		fmt.Fprintf(&body, " failed=%s", failed)
+	}
+	if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" {
+		fmt.Fprintf(&body, " unsupported=%s", unsupported)
+	}
+	if devices := strings.TrimSpace(values["devices"]); devices != "" {
+		fmt.Fprintf(&body, "\nDevices: %s", devices)
+	}
+	return body.String()
+}
+
+func latestSATSummaries() []string {
+	patterns := []struct {
+		label  string
+		prefix string
+	}{
+		{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
+		{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
+		{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
+		{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
+		{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
+		{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
+		{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
+		{label: "Memory SAT", prefix: "memory-"},
+		{label: "Storage SAT", prefix: "storage-"},
+		{label: "CPU SAT", prefix: "cpu-"},
+	}
+	var out []string
+	for _, item := range patterns {
+		matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
+		if err != nil || len(matches) == 0 {
+			continue
+		}
+		sort.Strings(matches)
+		raw, err := os.ReadFile(matches[len(matches)-1])
+		if err != nil {
+			continue
+		}
+		out = append(out, formatSATSummary(item.label, string(raw)))
+	}
+	return out
+}
--- a/audit/internal/app/app_install.go
+++ b/audit/internal/app/app_install.go
@@ -0,0 +1,76 @@
+package app
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) {
+	return a.exports.ListRemovableTargets()
+}
+
+func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) {
+	if _, err := os.Stat(DefaultAuditJSONPath); err != nil {
+		return "", err
+	}
+	filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
+	tmpPath := filepath.Join(os.TempDir(), filename)
+	data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
+	if err != nil {
+		return "", err
+	}
+	if normalized, normErr := ApplySATOverlay(data); normErr == nil {
+		data = normalized
+	}
+	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
+		return "", err
+	}
+	defer os.Remove(tmpPath)
+	return a.exports.ExportFileToTarget(tmpPath, target)
+}
+
+func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
+	path, err := a.ExportLatestAudit(target)
+	body := "Audit export failed."
+	if err == nil {
+		body = "Audit exported."
+	}
+	if err == nil && path != "" {
+		body = "Audit exported to " + path
+	}
+	return ActionResult{Title: "Export audit", Body: body}, err
+}
+
+func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) {
+	archive, err := BuildSupportBundle(DefaultExportDir)
+	if err != nil {
+		return "", err
+	}
+	defer os.Remove(archive)
+	return a.exports.ExportFileToTarget(archive, target)
+}
+
+func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
+	path, err := a.ExportSupportBundle(target)
+	body := "Support bundle export failed."
+	if err == nil {
+		body = "Support bundle exported. USB target unmounted and safe to remove."
+	}
+	if err == nil && path != "" {
+		body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
+	}
+	return ActionResult{Title: "Export support bundle", Body: body}, err
+}
+
+func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
+	return a.installer.ListInstallDisks()
+}
+
+func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
+	return a.installer.InstallToDisk(ctx, device, logFile)
+}
--- a/audit/internal/app/app_network.go
+++ b/audit/internal/app/app_network.go
@@ -0,0 +1,106 @@
+package app
+
+import (
+	"fmt"
+	"strings"
+
+	"bee/audit/internal/platform"
+)
+
+func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
+	return a.network.ListInterfaces()
+}
+
+func (a *App) DefaultRoute() string {
+	return a.network.DefaultRoute()
+}
+
+func (a *App) DHCPOne(iface string) (string, error) {
+	return a.network.DHCPOne(iface)
+}
+
+func (a *App) DHCPOneResult(iface string) (ActionResult, error) {
+	body, err := a.network.DHCPOne(iface)
+	return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err
+}
+
+func (a *App) DHCPAll() (string, error) {
+	return a.network.DHCPAll()
+}
+
+func (a *App) DHCPAllResult() (ActionResult, error) {
+	body, err := a.network.DHCPAll()
+	return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err
+}
+
+func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
+	return a.network.SetStaticIPv4(cfg)
+}
+
+func (a *App) SetInterfaceState(iface string, up bool) error {
+	return a.network.SetInterfaceState(iface, up)
+}
+
+func (a *App) GetInterfaceState(iface string) (bool, error) {
+	return a.network.GetInterfaceState(iface)
+}
+
+func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
+	return a.network.CaptureNetworkSnapshot()
+}
+
+func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
+	return a.network.RestoreNetworkSnapshot(snapshot)
+}
+
+func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
+	body, err := a.network.SetStaticIPv4(cfg)
+	return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
+}
+
+func (a *App) NetworkStatus() (ActionResult, error) {
+	ifaces, err := a.network.ListInterfaces()
+	if err != nil {
+		return ActionResult{Title: "Network status"}, err
+	}
+	if len(ifaces) == 0 {
+		return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil
+	}
+	var body strings.Builder
+	for _, iface := range ifaces {
+		ipv4 := "(no IPv4)"
+		if len(iface.IPv4) > 0 {
+			ipv4 = strings.Join(iface.IPv4, ", ")
+		}
+		fmt.Fprintf(&body, "- %s: state=%s ip=%s\n", iface.Name, iface.State, ipv4)
+	}
+	if gw := a.network.DefaultRoute(); gw != "" {
+		fmt.Fprintf(&body, "\nDefault route: %s\n", gw)
+	}
+	return ActionResult{Title: "Network status", Body: strings.TrimSpace(body.String())}, nil
+}
+
+func (a *App) DefaultStaticIPv4FormFields(iface string) []string {
+	return []string{
+		"",
+		"24",
+		strings.TrimSpace(a.network.DefaultRoute()),
+		"77.88.8.8 77.88.8.1 1.1.1.1 8.8.8.8",
+	}
+}
+
+func (a *App) ParseStaticIPv4Config(iface string, fields []string) platform.StaticIPv4Config {
+	get := func(index int) string {
+		if index >= 0 && index < len(fields) {
+			return strings.TrimSpace(fields[index])
+		}
+		return ""
+	}
+	return platform.StaticIPv4Config{
+		Interface: iface,
+		Address:   get(0),
+		Prefix:    get(1),
+		Gateway:   get(2),
+		DNS:       strings.Fields(get(3)),
+	}
+}
--- a/audit/internal/app/app_packs.go
+++ b/audit/internal/app/app_packs.go
@@ -0,0 +1,370 @@
+package app
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"bee/audit/internal/platform"
+)
+
+// RunNvidiaAcceptancePack runs the NVIDIA acceptance pack through the SAT
+// backend. A blank baseDir is replaced with DefaultSATBaseDir.
+func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaAcceptancePack(dir, logFunc)
+}
+
+// RunNvidiaAcceptancePackResult runs the NVIDIA acceptance pack without a log
+// callback and reports the archive path, when one was returned, in the result
+// body. The run error is returned alongside the result.
+func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
+	archive, err := a.RunNvidiaAcceptancePack(baseDir, nil)
+	result := ActionResult{Title: "NVIDIA SAT", Body: "Archive written."}
+	if archive != "" {
+		result.Body = "Archive written to " + archive
+	}
+	return result, err
+}
+
+// ListNvidiaGPUs returns the NVIDIA GPU list reported by the SAT backend.
+func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
+	gpus, err := a.sat.ListNvidiaGPUs()
+	return gpus, err
+}
+
+// ListNvidiaGPUStatuses returns the per-GPU status list reported by the SAT
+// backend.
+func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
+	statuses, err := a.sat.ListNvidiaGPUStatuses()
+	return statuses, err
+}
+
+// ResetNvidiaGPU asks the SAT backend to reset the GPU at index and returns
+// the trimmed backend output as the result body, together with the backend
+// error.
+func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
+	output, err := a.sat.ResetNvidiaGPU(index)
+	result := ActionResult{
+		Title: fmt.Sprintf("Reset NVIDIA GPU %d", index),
+		Body:  strings.TrimSpace(output),
+	}
+	return result, err
+}
+
+// RunNvidiaAcceptancePackWithOptions runs the NVIDIA acceptance pack with an
+// explicit diagnostic level and GPU selection. A blank baseDir is replaced
+// with DefaultSATBaseDir. The result body names the archive path when the
+// backend returned one; the backend error is returned alongside the result.
+func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	archive, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, dir, diagLevel, gpuIndices, logFunc)
+	result := ActionResult{Title: "NVIDIA DCGM", Body: "Archive written."}
+	if archive != "" {
+		result.Body = "Archive written to " + archive
+	}
+	return result, err
+}
+
+// RunNvidiaTargetedStressValidatePack forwards to the SAT backend, replacing a
+// blank baseDir with DefaultSATBaseDir.
+func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaTargetedStressValidatePack(ctx, dir, durationSec, gpuIndices, logFunc)
+}
+
+// RunNvidiaStressPack is RunNvidiaStressPackCtx with a background context.
+func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
+	ctx := context.Background()
+	return a.RunNvidiaStressPackCtx(ctx, baseDir, opts, logFunc)
+}
+
+// RunNvidiaBenchmark is RunNvidiaBenchmarkCtx with a background context.
+func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	ctx := context.Background()
+	return a.RunNvidiaBenchmarkCtx(ctx, baseDir, opts, logFunc)
+}
+
+// RunNvidiaBenchmarkCtx runs the NVIDIA performance benchmark. A blank baseDir
+// is replaced with DefaultBeeBenchPerfDir. Before the benchmark starts, the
+// server power source is resolved via ensureBenchmarkPowerAutotune (kind
+// "performance") and written into opts.ServerPowerSource. If that resolution
+// fails, its error is returned and the benchmark is not run.
+func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultBeeBenchPerfDir
+	}
+	tuned, err := a.ensureBenchmarkPowerAutotune(ctx, dir, opts, "performance", logFunc)
+	if err != nil {
+		return "", err
+	}
+	opts.ServerPowerSource = tuned.SelectedSource
+	return a.sat.RunNvidiaBenchmark(ctx, dir, opts, logFunc)
+}
+
+// RunNvidiaPowerBenchCtx runs the NVIDIA power benchmark. A blank baseDir is
+// replaced with DefaultBeeBenchPowerDir. The server power source is resolved
+// via ensureBenchmarkPowerAutotune (kind "power-fit") and written into
+// opts.ServerPowerSource. If that resolution fails, its error is returned and
+// the benchmark is not run.
+func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultBeeBenchPowerDir
+	}
+	tuned, err := a.ensureBenchmarkPowerAutotune(ctx, dir, opts, "power-fit", logFunc)
+	if err != nil {
+		return "", err
+	}
+	opts.ServerPowerSource = tuned.SelectedSource
+	return a.sat.RunNvidiaPowerBench(ctx, dir, opts, logFunc)
+}
+
+// RunNvidiaPowerSourceAutotuneCtx runs the power-source autotune through the
+// SAT backend, replacing a blank baseDir with DefaultBeeBenchAutotuneDir.
+func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultBeeBenchAutotuneDir
+	}
+	return a.sat.RunNvidiaPowerSourceAutotune(ctx, dir, opts, benchmarkKind, logFunc)
+}
+
+// LoadBenchmarkPowerAutotune loads the autotune configuration stored at
+// DefaultBeeBenchPowerSourceConfigPath.
+func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
+	cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
+	return cfg, err
+}
+
+func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
+	cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
+	if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
+		if logFunc != nil {
+			logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
+		}
+		return *cfg, nil
+	}
+	if logFunc != nil {
+		logFunc("benchmark autotune: no saved power source config, running autotune first")
+	}
+	autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
+	if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
+		return platform.BenchmarkPowerAutotuneConfig{}, err
+	}
+	cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
+	if err != nil {
+		return platform.BenchmarkPowerAutotuneConfig{}, err
+	}
+	return *cfg, nil
+}
+
+// RunNvidiaOfficialComputePack forwards to the SAT backend, replacing a blank
+// baseDir with DefaultSATBaseDir.
+func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaOfficialComputePack(ctx, dir, durationSec, gpuIndices, staggerSec, logFunc)
+}
+
+// RunNvidiaTargetedPowerPack forwards to the SAT backend, replacing a blank
+// baseDir with DefaultSATBaseDir.
+func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaTargetedPowerPack(ctx, dir, durationSec, gpuIndices, logFunc)
+}
+
+// RunNvidiaPulseTestPack forwards to the SAT backend, replacing a blank
+// baseDir with DefaultSATBaseDir.
+func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaPulseTestPack(ctx, dir, durationSec, gpuIndices, logFunc)
+}
+
+// RunNvidiaBandwidthPack forwards to the SAT backend, replacing a blank
+// baseDir with DefaultSATBaseDir.
+func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaBandwidthPack(ctx, dir, gpuIndices, logFunc)
+}
+
+// RunNvidiaStressPackCtx forwards to the SAT backend's RunNvidiaStressPack,
+// replacing a blank baseDir with DefaultSATBaseDir.
+func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaStressPack(ctx, dir, opts, logFunc)
+}
+
+func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
+}
+
+// RunMemoryAcceptancePackCtx forwards to the SAT backend, replacing a blank
+// baseDir with DefaultSATBaseDir.
+func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunMemoryAcceptancePack(ctx, dir, sizeMB, passes, logFunc)
+}
+
+// RunMemoryAcceptancePackResult runs the memory acceptance pack without a log
+// callback and formats its archive path with satResultBody. The run error is
+// returned alongside the result.
+func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
+	archive, err := a.RunMemoryAcceptancePack(baseDir, nil)
+	result := ActionResult{
+		Title: "Memory SAT",
+		Body:  satResultBody(archive),
+	}
+	return result, err
+}
+
+// RunCPUAcceptancePack is RunCPUAcceptancePackCtx with a background context.
+func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	ctx := context.Background()
+	return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunCPUAcceptancePackCtx forwards to the SAT backend, replacing a blank
+// baseDir with DefaultSATBaseDir.
+func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunCPUAcceptancePack(ctx, dir, durationSec, logFunc)
+}
+
+// RunCPUAcceptancePackResult runs the CPU acceptance pack without a log
+// callback and formats its archive path with satResultBody. The run error is
+// returned alongside the result.
+func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
+	archive, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
+	result := ActionResult{
+		Title: "CPU SAT",
+		Body:  satResultBody(archive),
+	}
+	return result, err
+}
+
+func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
+}
+
+// RunStorageAcceptancePackCtx forwards to the SAT backend, replacing a blank
+// baseDir with DefaultSATBaseDir.
+func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunStorageAcceptancePack(ctx, dir, extended, logFunc)
+}
+
+// RunStorageAcceptancePackResult runs the storage acceptance pack without a
+// log callback and formats its archive path with satResultBody. The run error
+// is returned alongside the result.
+func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
+	archive, err := a.RunStorageAcceptancePack(baseDir, nil)
+	result := ActionResult{
+		Title: "Storage SAT",
+		Body:  satResultBody(archive),
+	}
+	return result, err
+}
+
+// DetectGPUVendor returns the GPU vendor string reported by the SAT backend.
+func (a *App) DetectGPUVendor() string {
+	vendor := a.sat.DetectGPUVendor()
+	return vendor
+}
+
+// ListAMDGPUs returns the AMD GPU list reported by the SAT backend.
+func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
+	gpus, err := a.sat.ListAMDGPUs()
+	return gpus, err
+}
+
+// RunAMDAcceptancePack is RunAMDAcceptancePackCtx with a background context.
+func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	ctx := context.Background()
+	return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
+}
+
+// RunAMDAcceptancePackCtx forwards to the SAT backend, replacing a blank
+// baseDir with DefaultSATBaseDir.
+func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDAcceptancePack(ctx, dir, logFunc)
+}
+
+// RunAMDAcceptancePackResult runs the AMD acceptance pack without a log
+// callback and formats its archive path with satResultBody. The run error is
+// returned alongside the result.
+func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
+	archive, err := a.RunAMDAcceptancePack(baseDir, nil)
+	result := ActionResult{
+		Title: "AMD GPU SAT",
+		Body:  satResultBody(archive),
+	}
+	return result, err
+}
+
+// RunAMDMemIntegrityPackCtx forwards to the SAT backend's
+// RunAMDMemIntegrityPack, replacing a blank baseDir with DefaultSATBaseDir.
+func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDMemIntegrityPack(ctx, dir, logFunc)
+}
+
+// RunAMDMemBandwidthPackCtx forwards to the SAT backend's
+// RunAMDMemBandwidthPack, replacing a blank baseDir with DefaultSATBaseDir.
+func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDMemBandwidthPack(ctx, dir, logFunc)
+}
+
+// RunMemoryStressPack is RunMemoryStressPackCtx with a background context.
+func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	ctx := context.Background()
+	return a.RunMemoryStressPackCtx(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunSATStressPack is RunSATStressPackCtx with a background context.
+func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	ctx := context.Background()
+	return a.RunSATStressPackCtx(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunAMDStressPack is RunAMDStressPackCtx with a background context.
+func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	ctx := context.Background()
+	return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunMemoryStressPackCtx runs the memory stress pack through the SAT backend
+// for durationSec seconds and returns the backend's path and error.
+//
+// A blank baseDir is replaced with DefaultSATBaseDir, matching the other pack
+// runners in this file (for example RunAMDStressPackCtx), so the backend is
+// never handed an empty output directory.
+func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunSATStressPackCtx runs the SAT stress pack through the SAT backend for
+// durationSec seconds and returns the backend's path and error.
+//
+// A blank baseDir is replaced with DefaultSATBaseDir, matching the other pack
+// runners in this file (for example RunAMDStressPackCtx), so the backend is
+// never handed an empty output directory.
+func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunAMDStressPackCtx forwards to the SAT backend, replacing a blank baseDir
+// with DefaultSATBaseDir.
+func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDStressPack(ctx, dir, durationSec, logFunc)
+}
+
+// RunFanStressTest forwards to the SAT backend, replacing a blank baseDir with
+// DefaultSATBaseDir.
+func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunFanStressTest(ctx, dir, opts)
+}
+
+// RunPlatformStress forwards to the SAT backend, replacing a blank baseDir
+// with DefaultSATBaseDir.
+func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunPlatformStress(ctx, dir, opts, logFunc)
+}
+
+// RunNCCLTests forwards to the SAT backend with the selected GPU indices,
+// replacing a blank baseDir with DefaultSATBaseDir.
+func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNCCLTests(ctx, dir, gpuIndices, logFunc)
+}
+
+func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
+	path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
+	body := "Results: " + path
+	if err != nil && err != context.Canceled {
+		body += "\nERROR: " + err.Error()
+	}
+	return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
+}
+
+// RunFanStressTestResult runs the fan stress test with an empty baseDir (so
+// RunFanStressTest applies its default) and formats the outcome with
+// formatFanStressResult. For any error other than context.Canceled the error
+// text is appended to the body. The error is always returned alongside the
+// result.
+func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
+	archive, err := a.RunFanStressTest(ctx, "", opts)
+	report := formatFanStressResult(archive)
+	if !(err == nil || err == context.Canceled) {
+		report = report + "\nERROR: " + err.Error()
+	}
+	result := ActionResult{Title: "GPU Platform Stress Test", Body: report}
+	return result, err
+}
+
+// formatFanStressResult renders the outcome of a fan-stress run. It reads
+// summary.txt from the run directory (archivePath with ".tar.gz" removed),
+// formats it with formatSATDetail and appends an "Analysis" section built from
+// the summary's throttling, maximum temperature and fan response keys.
+//
+// An empty archivePath yields "No output produced."; an unreadable summary
+// yields a plain message naming the archive.
+func formatFanStressResult(archivePath string) string {
+	if archivePath == "" {
+		return "No output produced."
+	}
+	summaryPath := filepath.Join(strings.TrimSuffix(archivePath, ".tar.gz"), "summary.txt")
+	data, readErr := os.ReadFile(summaryPath)
+	if readErr != nil {
+		return "Archive written to " + archivePath
+	}
+	summary := strings.TrimSpace(string(data))
+	fields := parseKeyValueSummary(summary)
+
+	var out strings.Builder
+	out.WriteString(formatSATDetail(summary))
+
+	// Collect the analysis lines; keys missing from the summary are skipped.
+	findings := make([]string, 0, 4)
+	if throttled, found := fields["throttling_detected"]; found {
+		if throttled == "true" {
+			findings = append(findings, "Throttling:   YES  ← throttling detected during load")
+		} else {
+			findings = append(findings, "Throttling:   NO")
+		}
+	}
+	// A reading of "0.0" is treated as "no measurement" and left out.
+	for _, probe := range []struct{ key, label string }{
+		{"max_gpu_temp_c", "Max GPU temp: "},
+		{"max_cpu_temp_c", "Max CPU temp: "},
+	} {
+		if reading, found := fields[probe.key]; found && reading != "0.0" {
+			findings = append(findings, probe.label+reading+"°C")
+		}
+	}
+	if lag, found := fields["fan_response_sec"]; found && lag != "N/A" && lag != "-1.0" {
+		findings = append(findings, "Fan response: "+lag+"s")
+	}
+
+	if len(findings) > 0 {
+		out.WriteString("\n\n=== Analysis ===\n")
+		out.WriteString(strings.Join(findings, "\n") + "\n")
+	}
+	return strings.TrimSpace(out.String())
+}
+
+// satResultBody turns a SAT archive path into a human-readable result body.
+// It reads summary.txt from the run directory (archivePath with ".tar.gz"
+// removed) and formats it with formatSATDetail. An empty path yields
+// "No output produced."; an unreadable summary yields a plain message naming
+// the archive.
+func satResultBody(archivePath string) string {
+	if archivePath == "" {
+		return "No output produced."
+	}
+	summaryPath := filepath.Join(strings.TrimSuffix(archivePath, ".tar.gz"), "summary.txt")
+	data, readErr := os.ReadFile(summaryPath)
+	if readErr != nil {
+		return "Archive written to " + archivePath
+	}
+	summary := strings.TrimSpace(string(data))
+	return formatSATDetail(summary)
+}
--- a/audit/internal/app/app_services.go
+++ b/audit/internal/app/app_services.go
@@ -0,0 +1,67 @@
+package app
+
+import (
+	"fmt"
+	"strings"
+
+	"bee/audit/internal/platform"
+)
+
+// ListBeeServices returns the service names reported by the services backend.
+func (a *App) ListBeeServices() ([]string, error) {
+	names, err := a.services.ListBeeServices()
+	return names, err
+}
+
+// ServiceState returns the state string the services backend reports for name.
+func (a *App) ServiceState(name string) string {
+	state := a.services.ServiceState(name)
+	return state
+}
+
+// ServiceStatus returns the status output the services backend produces for
+// name.
+func (a *App) ServiceStatus(name string) (string, error) {
+	output, err := a.services.ServiceStatus(name)
+	return output, err
+}
+
+// ServiceStatusResult wraps the backend status output for name in an
+// ActionResult, substituting a placeholder body via bodyOr. The backend error
+// is returned alongside the result.
+func (a *App) ServiceStatusResult(name string) (ActionResult, error) {
+	output, err := a.services.ServiceStatus(name)
+	result := ActionResult{
+		Title: "service status: " + name,
+		Body:  bodyOr(output, "No status output."),
+	}
+	return result, err
+}
+
+// ServiceDo performs action on the named service through the services backend
+// and returns the backend's output and error.
+func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) {
+	output, err := a.services.ServiceDo(name, action)
+	return output, err
+}
+
+// ServiceActionResult performs action on the named service and wraps the
+// backend output in an ActionResult, substituting a placeholder body via
+// bodyOr. The backend error is returned alongside the result.
+func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) {
+	output, err := a.services.ServiceDo(name, action)
+	result := ActionResult{
+		Title: fmt.Sprintf("service %s: %s", string(action), name),
+		Body:  bodyOr(output, "Action completed."),
+	}
+	return result, err
+}
+
+// TailFile returns what the tools backend produces for the last lines of path.
+func (a *App) TailFile(path string, lines int) string {
+	tail := a.tools.TailFile(path, lines)
+	return tail
+}
+
+// CheckTools returns the tools backend's status entry for each requested name.
+func (a *App) CheckTools(names []string) []platform.ToolStatus {
+	statuses := a.tools.CheckTools(names)
+	return statuses
+}
+
+// ToolCheckResult renders one "- name: status" line per checked tool, where
+// the status is "OK (<path>)" or "MISSING". With no names, the backend is not
+// consulted and a placeholder body is returned.
+func (a *App) ToolCheckResult(names []string) ActionResult {
+	const title = "Required tools"
+	if len(names) == 0 {
+		return ActionResult{Title: title, Body: "No tools checked."}
+	}
+	statuses := a.tools.CheckTools(names)
+	var report strings.Builder
+	for _, entry := range statuses {
+		verdict := "MISSING"
+		if entry.OK {
+			verdict = "OK (" + entry.Path + ")"
+		}
+		fmt.Fprintf(&report, "- %s: %s\n", entry.Name, verdict)
+	}
+	return ActionResult{Title: title, Body: strings.TrimSpace(report.String())}
+}
+
+// AuditLogTailResult joins the last 40 lines of DefaultAuditLogPath and the
+// last 20 lines of DefaultAuditJSONPath (each trimmed, separated by a blank
+// line). A placeholder body is used when both tails are empty.
+func (a *App) AuditLogTailResult() ActionResult {
+	textTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40))
+	jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20))
+	result := ActionResult{Title: "Audit log tail"}
+	result.Body = strings.TrimSpace(textTail + "\n\n" + jsonTail)
+	if len(result.Body) == 0 {
+		result.Body = "No audit logs found."
+	}
+	return result
+}
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -9,6 +9,7 @@ import (
 	"io"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"

 	"bee/audit/internal/platform"
@@ -123,11 +124,13 @@ type fakeSAT struct {
 	runNvidiaFn               func(string) (string, error)
 	runNvidiaBenchmarkFn      func(string, platform.NvidiaBenchmarkOptions) (string, error)
 	runNvidiaPowerBenchFn     func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaAutotuneFn       func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
 	runNvidiaStressFn         func(string, platform.NvidiaStressOptions) (string, error)
 	runNvidiaComputeFn        func(string, int, []int) (string, error)
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
 	runNvidiaPulseFn          func(string, int, []int) (string, error)
 	runNvidiaBandwidthFn      func(string, []int) (string, error)
+	runNCCLFn                 func(string, []int) (string, error)
 	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
 	runMemoryFn               func(string) (string, error)
 	runStorageFn              func(string) (string, error)
@@ -162,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
 	return f.runNvidiaFn(baseDir)
 }

+// RunNvidiaPowerSourceAutotune calls runNvidiaAutotuneFn when the test set it
+// and otherwise falls back to runNvidiaFn with only the base directory.
+func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
+	if f.runNvidiaAutotuneFn == nil {
+		return f.runNvidiaFn(baseDir)
+	}
+	return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
+}
+
 func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if f.runNvidiaTargetedStressFn != nil {
 		return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
@@ -287,10 +297,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 	return "", nil
 }

-func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
+// RunNCCLTests calls runNCCLFn with the base directory and GPU indices when
+// the test set it; otherwise it returns an empty path and no error.
+func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNCCLFn != nil {
+		return f.runNCCLFn(baseDir, gpuIndices)
+	}
 	return "", nil
 }

+// TestRunNCCLTestsPassesSelectedGPUs checks that App.RunNCCLTests hands the
+// caller's base directory and GPU indices to the SAT backend unchanged and
+// returns the backend's archive path.
+func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
+	t.Parallel()
+
+	const (
+		wantArchive = "/tmp/nccl-tests.tar.gz"
+		wantBaseDir = "/tmp/sat"
+	)
+	var (
+		seenBaseDir string
+		seenGPUs    []int
+	)
+	stub := fakeSAT{
+		runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
+			seenBaseDir = baseDir
+			// Copy so later changes to the caller's slice cannot affect the check.
+			seenGPUs = append([]int(nil), gpuIndices...)
+			return wantArchive, nil
+		},
+	}
+	subject := &App{sat: stub}
+
+	archive, err := subject.RunNCCLTests(context.Background(), wantBaseDir, []int{3, 1}, nil)
+	if err != nil {
+		t.Fatalf("RunNCCLTests error: %v", err)
+	}
+	if archive != wantArchive {
+		t.Fatalf("path=%q want %q", archive, wantArchive)
+	}
+	if seenBaseDir != wantBaseDir {
+		t.Fatalf("baseDir=%q want %q", seenBaseDir, wantBaseDir)
+	}
+	if len(seenGPUs) != 2 || seenGPUs[0] != 3 || seenGPUs[1] != 1 {
+		t.Fatalf("gpuIndices=%v want [3 1]", seenGPUs)
+	}
+}
+
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()

@@ -775,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
 		t.Fatal(err)
 	}
+	if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
+		t.Fatal(err)
+	}
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
 		t.Fatal(err)
 	}
@@ -802,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	tr := tar.NewReader(gzr)
 	var names []string
 	var auditJSON string
+	var manifest string
 	for {
 		hdr, err := tr.Next()
 		if errors.Is(err, io.EOF) {
@@ -818,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 			}
 			auditJSON = string(body)
 		}
+		if strings.HasSuffix(hdr.Name, "/manifest.txt") {
+			body, err := io.ReadAll(tr)
+			if err != nil {
+				t.Fatalf("read manifest entry: %v", err)
+			}
+			manifest = string(body)
+		}
 	}

 	for _, want := range []string{
@@ -861,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
 		t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
 	}
+	if !contains(manifest, "files:") {
+		t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
+	}
+	if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
+		t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
+	}
 }

 func TestMainBanner(t *testing.T) {
--- a/audit/internal/app/atomic_write.go
+++ b/audit/internal/app/atomic_write.go
@@ -2,10 +2,29 @@ package app

 import (
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 )

+// readFileLimited reads path into memory, refusing files larger than maxBytes.
+// Prevents OOM on corrupted or unexpectedly large data files.
+func readFileLimited(path string, maxBytes int64) ([]byte, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	data, err := io.ReadAll(io.LimitReader(f, maxBytes+1))
+	if err != nil {
+		return nil, err
+	}
+	if int64(len(data)) > maxBytes {
+		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
+	}
+	return data, nil
+}
+
 func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
--- a/audit/internal/app/blackbox.go
+++ b/audit/internal/app/blackbox.go
@@ -0,0 +1,779 @@
+package app
+
+import (
+	"bytes"
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+// Blackbox tuning constants.
+//
+//   - blackboxMarkerName: file name of the enrollment marker inside a target's
+//     mountpoint (DisableBlackboxTarget removes mountpoint/blackboxMarkerName).
+//   - blackboxDiscoverInterval: ticker period between reconcile passes in
+//     RunBlackbox.
+//   - blackboxMinFlushPeriod / blackboxMaxFlushPeriod: lower and upper clamp
+//     applied by adjustFlushPeriod; the minimum is also a new worker's initial
+//     period.
+//   - blackboxRecoveryFastCount: number of consecutive fast cycles after which
+//     adjustFlushPeriod shortens the period.
+const (
+	blackboxMarkerName        = ".bee-blackbox"
+	blackboxDiscoverInterval  = 2 * time.Second
+	blackboxMinFlushPeriod    = 1 * time.Second
+	blackboxMaxFlushPeriod    = 30 * time.Second
+	blackboxRecoveryFastCount = 5
+)
+
+// DefaultBlackboxStatePath is the state file location used by RunBlackbox and
+// ReadBlackboxState when the caller passes a blank path.
+var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
+
+// Indirections for command execution and the current time. blackboxNow always
+// reports UTC.
+// NOTE(review): these are presumably variables so tests can replace them —
+// confirm against the test files.
+var (
+	blackboxExecCommand = exec.Command
+	blackboxNow         = func() time.Time { return time.Now().UTC() }
+)
+
+// BlackboxMarker is the enrollment record stored on a target through
+// writeBlackboxMarker and read back with readBlackboxMarker.
+//
+// EnableBlackboxTarget creates new markers with Version 1, a fresh
+// EnrollmentID, CreatedAtUTC formatted as RFC 3339 and Host set to the local
+// hostname (or "unknown").
+type BlackboxMarker struct {
+	Version      int    `json:"version"`
+	EnrollmentID string `json:"enrollment_id"`
+	CreatedAtUTC string `json:"created_at_utc"`
+	Host         string `json:"host,omitempty"`
+}
+
+// BlackboxTargetStatus is one entry of BlackboxState.Targets, serialized as
+// JSON with the field names given in the tags.
+// NOTE(review): the code that fills this struct is outside this chunk; the
+// fields presumably mirror the matching blackboxWorker (status, last sync
+// time, last cycle duration, flush period, last error, mountpoint) — confirm.
+type BlackboxTargetStatus struct {
+	EnrollmentID      string                   `json:"enrollment_id"`
+	Device            string                   `json:"device"`
+	FS                platform.RemovableTarget `json:"fs"`
+	BootFolder        string                   `json:"boot_folder"`
+	Status            string                   `json:"status"`
+	LastSyncAtUTC     string                   `json:"last_sync_at_utc,omitempty"`
+	LastCycleDuration string                   `json:"last_cycle_duration,omitempty"`
+	FlushPeriod       string                   `json:"flush_period"`
+	LastError         string                   `json:"last_error,omitempty"`
+	Mountpoint        string                   `json:"mountpoint,omitempty"`
+}
+
+// BlackboxState is the JSON document ReadBlackboxState decodes from the
+// blackbox state file. Targets holds one BlackboxTargetStatus per target.
+// NOTE(review): the writer of this document is outside this chunk; it is
+// presumably persistState/persistStateLocked — confirm.
+type BlackboxState struct {
+	Status           string                 `json:"status"`
+	BootStartedAtUTC string                 `json:"boot_started_at_utc"`
+	BootFolder       string                 `json:"boot_folder"`
+	UpdatedAtUTC     string                 `json:"updated_at_utc"`
+	Targets          []BlackboxTargetStatus `json:"targets"`
+}
+
+// blackboxRuntime is the shared state of one RunBlackbox invocation.
+//
+// exportDir, statePath, system, bootStarted and bootFolder are set once in
+// RunBlackbox; bootFolder is SupportBundleBaseName(bootStarted). workers maps
+// enrollment ID to the worker serving that target; reconcile and stopAll hold
+// mu while reading or changing it.
+type blackboxRuntime struct {
+	exportDir   string
+	statePath   string
+	system      *platform.System
+	bootStarted time.Time
+	bootFolder  string
+
+	mu      sync.Mutex
+	workers map[string]*blackboxWorker
+}
+
+// discoveredBlackboxTarget is one result of discoverMarkedTargets: a removable
+// target carrying a marker with a non-empty enrollment ID. seenMount is the
+// mountpoint used while probing and mountedByBee reports whether the probe
+// mounted it itself.
+type discoveredBlackboxTarget struct {
+	marker       BlackboxMarker
+	target       platform.RemovableTarget
+	seenMount    string
+	mountedByBee bool
+}
+
+// blackboxWorker owns the sync loop for one enrolled target.
+//
+// runtime and enrollmentID are fixed at construction. update, finishCycle and
+// currentFlushPeriod access the remaining state fields while holding mu.
+// stopCh is closed by stop to request shutdown; stoppedCh is closed when run
+// returns.
+type blackboxWorker struct {
+	runtime      *blackboxRuntime
+	enrollmentID string
+
+	mu           sync.Mutex
+	target       platform.RemovableTarget
+	marker       BlackboxMarker
+	mountpoint   string
+	mountedByBee bool
+	status       string
+	lastSyncAt   time.Time
+	lastDuration time.Duration
+	flushPeriod  time.Duration
+	lastError    string
+	fastCycles   int
+	stopCh       chan struct{}
+	stoppedCh    chan struct{}
+}
+
+// RunBlackbox runs the blackbox service until ctx is cancelled. It writes an
+// initial state, then reconciles the workers against the marked removable
+// targets once immediately and once per blackboxDiscoverInterval.
+//
+// Blank exportDir and statePath fall back to DefaultExportDir and
+// DefaultBlackboxStatePath; a nil system is replaced with platform.New(). If
+// the boot time cannot be determined, the current time is used instead. On
+// cancellation all workers are stopped and ctx.Err() is returned.
+func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
+	if exportDir = strings.TrimSpace(exportDir); exportDir == "" {
+		exportDir = DefaultExportDir
+	}
+	if statePath = strings.TrimSpace(statePath); statePath == "" {
+		statePath = DefaultBlackboxStatePath
+	}
+	if system == nil {
+		system = platform.New()
+	}
+	startedAt, bootErr := bootStartedAtUTC()
+	if bootErr != nil {
+		startedAt = blackboxNow()
+	}
+
+	box := &blackboxRuntime{
+		exportDir:   exportDir,
+		statePath:   statePath,
+		system:      system,
+		bootStarted: startedAt,
+		bootFolder:  SupportBundleBaseName(startedAt),
+		workers:     make(map[string]*blackboxWorker),
+	}
+	// Best effort: a missing state directory surfaces when the state is written.
+	_ = os.MkdirAll(filepath.Dir(statePath), 0755)
+	box.persistState()
+
+	tick := time.NewTicker(blackboxDiscoverInterval)
+	defer tick.Stop()
+	for {
+		box.reconcile()
+		select {
+		case <-tick.C:
+			continue
+		case <-ctx.Done():
+			box.stopAll()
+			return ctx.Err()
+		}
+	}
+}
+
+// ReadBlackboxState loads and decodes the blackbox state file at path. A blank
+// path means DefaultBlackboxStatePath. On a read or decode error a zero-value
+// state is returned together with the error.
+func ReadBlackboxState(path string) (BlackboxState, error) {
+	location := strings.TrimSpace(path)
+	if location == "" {
+		location = DefaultBlackboxStatePath
+	}
+	var state BlackboxState
+	data, err := os.ReadFile(location)
+	if err == nil {
+		err = json.Unmarshal(data, &state)
+	}
+	if err != nil {
+		return BlackboxState{}, err
+	}
+	return state, nil
+}
+
+func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
+	target = sanitizeRemovableTarget(target)
+	if target.Device == "" {
+		return BlackboxMarker{}, fmt.Errorf("device is required")
+	}
+	mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
+	if err != nil {
+		return BlackboxMarker{}, err
+	}
+	defer func() {
+		if mountedByBee {
+			_ = unmountTarget(mountpoint)
+		}
+	}()
+
+	marker, _, err := readBlackboxMarker(mountpoint)
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return BlackboxMarker{}, err
+	}
+	if marker.EnrollmentID == "" {
+		marker = BlackboxMarker{
+			Version:      1,
+			EnrollmentID: newBlackboxEnrollmentID(),
+			CreatedAtUTC: blackboxNow().Format(time.RFC3339),
+			Host:         hostnameOr("unknown"),
+		}
+	}
+	if err := writeBlackboxMarker(mountpoint, marker); err != nil {
+		return BlackboxMarker{}, err
+	}
+	return marker, nil
+}
+
+// DisableBlackboxTarget removes the blackbox marker from the first removable
+// target that matches device or carries enrollmentID. Both arguments are
+// trimmed and at least one must be non-empty.
+//
+// It returns the result of removing the marker file from the matching target,
+// or os.ErrNotExist when no target with a readable marker matches. Targets
+// that cannot be mounted are skipped. A target mounted by this call is
+// unmounted again before moving on.
+func DisableBlackboxTarget(device, enrollmentID string) error {
+	device = strings.TrimSpace(device)
+	enrollmentID = strings.TrimSpace(enrollmentID)
+	if device == "" && enrollmentID == "" {
+		return fmt.Errorf("device or enrollment_id is required")
+	}
+	system := platform.New()
+	targets, err := system.ListRemovableTargets()
+	if err != nil {
+		return err
+	}
+	for _, target := range targets {
+		target = sanitizeRemovableTarget(target)
+		if target.Device == "" {
+			continue
+		}
+		// With no enrollment ID, only the named device can match, so there is
+		// no reason to mount (and then unmount) unrelated removable drives.
+		if enrollmentID == "" && target.Device != device {
+			continue
+		}
+		mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
+		if mountErr != nil {
+			continue
+		}
+		remove := false
+		marker, _, err := readBlackboxMarker(mountpoint)
+		if err == nil {
+			if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
+				remove = true
+			}
+			if device != "" && target.Device == device {
+				remove = true
+			}
+		}
+		if remove {
+			err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
+		}
+		if mountedByBee {
+			// Best-effort unmount; the removal result is what the caller needs.
+			_ = unmountTarget(mountpoint)
+		}
+		if remove {
+			return err
+		}
+	}
+	return os.ErrNotExist
+}
+
+func (rt *blackboxRuntime) reconcile() {
+	discovered, _ := rt.discoverMarkedTargets()
+
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+
+	seen := make(map[string]struct{}, len(discovered))
+	for _, found := range discovered {
+		seen[found.marker.EnrollmentID] = struct{}{}
+		worker, ok := rt.workers[found.marker.EnrollmentID]
+		if !ok {
+			worker = newBlackboxWorker(rt, found)
+			rt.workers[found.marker.EnrollmentID] = worker
+			go worker.run()
+			continue
+		}
+		worker.update(found)
+	}
+	for id, worker := range rt.workers {
+		if _, ok := seen[id]; ok {
+			continue
+		}
+		worker.stop()
+		delete(rt.workers, id)
+	}
+	rt.persistStateLocked()
+}
+
+// stopAll detaches every worker from the runtime, persists the now-empty
+// state, and then stops the workers one by one, waiting for each to exit.
+func (rt *blackboxRuntime) stopAll() {
+	rt.mu.Lock()
+	workers := make([]*blackboxWorker, 0, len(rt.workers))
+	for _, worker := range rt.workers {
+		workers = append(workers, worker)
+	}
+	rt.workers = map[string]*blackboxWorker{}
+	rt.persistStateLocked()
+	rt.mu.Unlock()
+	// The workers are stopped after rt.mu is released: stop blocks until the
+	// worker goroutine has exited.
+	for _, worker := range workers {
+		worker.stop()
+	}
+}
+
+// discoverMarkedTargets scans the removable targets reported by the platform
+// and returns those carrying a readable marker with a non-empty enrollment ID,
+// sorted by enrollment ID.
+//
+// Targets without a device, targets that cannot be mounted, and targets whose
+// marker is missing or unreadable are skipped. Only a failure to list the
+// removable targets is returned as an error.
+func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
+	targets, err := rt.system.ListRemovableTargets()
+	if err != nil {
+		return nil, err
+	}
+	var out []discoveredBlackboxTarget
+	for _, rawTarget := range targets {
+		target := sanitizeRemovableTarget(rawTarget)
+		if target.Device == "" {
+			continue
+		}
+		mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
+		if err != nil {
+			continue
+		}
+		marker, ok, err := readBlackboxMarker(mountpoint)
+		// The probe mount is only needed to read the marker. Release it on
+		// every path so that a marker which reads as present but fails with an
+		// error or has an empty enrollment ID does not leave a mount behind.
+		if mountedByBee {
+			_ = unmountTarget(mountpoint)
+		}
+		if err != nil || !ok || marker.EnrollmentID == "" {
+			continue
+		}
+		out = append(out, discoveredBlackboxTarget{
+			marker:       marker,
+			target:       target,
+			seenMount:    mountpoint,
+			mountedByBee: mountedByBee,
+		})
+	}
+	sort.Slice(out, func(i, j int) bool {
+		return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
+	})
+	return out, nil
+}
+
+// newBlackboxWorker builds a worker for a discovered target. The worker starts
+// in status "running" with the minimum flush period and fresh stop and stopped
+// channels; the caller starts its goroutine.
+func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
+	worker := new(blackboxWorker)
+	worker.runtime = rt
+	worker.enrollmentID = found.marker.EnrollmentID
+	worker.target = found.target
+	worker.marker = found.marker
+	worker.flushPeriod = blackboxMinFlushPeriod
+	worker.status = "running"
+	worker.stopCh = make(chan struct{})
+	worker.stoppedCh = make(chan struct{})
+	return worker
+}
+
+// run is the worker goroutine body. It repeatedly performs a sync cycle,
+// records the outcome with finishCycle, and then waits for the current flush
+// period. A stop request is only observed during that wait, so a cycle in
+// progress always completes first. On stop it calls cleanup and returns,
+// closing stoppedCh so stop can return.
+func (w *blackboxWorker) run() {
+	defer close(w.stoppedCh)
+	for {
+		start := time.Now()
+		err := w.syncCycle()
+		duration := time.Since(start)
+		w.finishCycle(duration, err)
+
+		// Re-read the period every iteration: finishCycle may have changed it.
+		wait := w.currentFlushPeriod()
+		timer := time.NewTimer(wait)
+		select {
+		case <-w.stopCh:
+			timer.Stop()
+			w.cleanup()
+			return
+		case <-timer.C:
+		}
+	}
+}
+
+// update replaces the worker's target and marker with the freshly discovered
+// values while holding the worker lock.
+func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
+	w.mu.Lock()
+	w.target, w.marker = found.target, found.marker
+	w.mu.Unlock()
+}
+
+// stop signals the run loop to exit by closing stopCh (unless it is already
+// closed) and then blocks until run has closed stoppedCh.
+//
+// NOTE(review): the check-then-close on stopCh is not atomic, and this call
+// blocks until run exits — confirm callers serialize stop and only call it on
+// workers whose run loop was started.
+func (w *blackboxWorker) stop() {
+	select {
+	case <-w.stopCh:
+	default:
+		close(w.stopCh)
+	}
+	<-w.stoppedCh
+}
+
+// currentFlushPeriod returns the worker's flush period, read under the worker
+// lock.
+func (w *blackboxWorker) currentFlushPeriod() time.Duration {
+	w.mu.Lock()
+	period := w.flushPeriod
+	w.mu.Unlock()
+	return period
+}
+
+// finishCycle records the outcome of one sync cycle and adapts the flush
+// period, then asks the runtime to persist the overall state.
+//
+// On error the worker becomes "degraded", the error text is stored and the
+// fast-cycle counter is reset. On success the worker becomes "running", the
+// last-sync time is refreshed, and the fast-cycle counter is incremented when
+// the cycle took at most half of the current flush period (otherwise reset).
+func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
+	w.mu.Lock()
+	w.lastDuration = duration
+	if err != nil {
+		w.status = "degraded"
+		w.lastError = err.Error()
+		w.fastCycles = 0
+		w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
+	} else {
+		w.status = "running"
+		w.lastSyncAt = blackboxNow()
+		w.lastError = ""
+		if duration <= w.flushPeriod/2 {
+			w.fastCycles++
+		} else {
+			w.fastCycles = 0
+		}
+		w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
+	}
+	// Release w.mu before persisting: persistState takes the runtime lock and
+	// then locks every registered worker, including this one. Holding w.mu here
+	// would self-deadlock and invert the runtime-then-worker lock order.
+	w.mu.Unlock()
+	w.runtime.persistState()
+}
+
+// adjustFlushPeriod computes the next flush period from the current period and
+// the duration of the cycle that just finished.
+//
+// A non-positive current period is treated as the minimum, and a non-positive
+// duration as equal to the period. If the cycle overran the period, the next
+// period is 1.25x the larger of the two. If the cycle succeeded and at least
+// blackboxRecoveryFastCount fast cycles have accumulated, the next period is
+// 0.9x the current one. The result is clamped to the min/max flush period.
+func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
+	if current <= 0 {
+		current = blackboxMinFlushPeriod
+	}
+	if duration <= 0 {
+		duration = current
+	}
+
+	next := current
+	if duration > current {
+		fromPeriod := time.Duration(float64(current) * 1.25)
+		fromCycle := time.Duration(float64(duration) * 1.25)
+		next = fromPeriod
+		if fromCycle > fromPeriod {
+			next = fromCycle
+		}
+	}
+	if success && fastCycles >= blackboxRecoveryFastCount {
+		next = time.Duration(float64(current) * 0.9)
+	}
+
+	if next < blackboxMinFlushPeriod {
+		next = blackboxMinFlushPeriod
+	}
+	if next > blackboxMaxFlushPeriod {
+		return blackboxMaxFlushPeriod
+	}
+	return next
+}
+
+// syncCycle performs one flush to the target: it makes sure the target is
+// mounted, mirrors the runtime export directory into <boot folder>/export,
+// captures the log snapshots, and finally syncs the filesystem. The first
+// failing step aborts the cycle and its error is returned.
+func (w *blackboxWorker) syncCycle() error {
+	target, marker := w.snapshotTarget()
+	mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
+	if err != nil {
+		return err
+	}
+	w.recordMountpoint(mountpoint, mountedByBee)
+
+	bootRoot := filepath.Join(mountpoint, w.runtime.bootFolder)
+	exportDst := filepath.Join(bootRoot, "export")
+	if err = os.MkdirAll(exportDst, 0755); err != nil {
+		return err
+	}
+	if err = syncDirectoryTree(w.runtime.exportDir, exportDst); err != nil {
+		return err
+	}
+	if err = w.captureSnapshots(bootRoot); err != nil {
+		return err
+	}
+	return syncFilesystem(bootRoot)
+}
+
+// cleanup unmounts the worker's mountpoint, but only when a mountpoint was
+// recorded and bee created that mount. The unmount result is ignored.
+func (w *blackboxWorker) cleanup() {
+	w.mu.Lock()
+	mountpoint, ours := w.mountpoint, w.mountedByBee
+	w.mu.Unlock()
+
+	if !ours || mountpoint == "" {
+		return
+	}
+	_ = unmountTarget(mountpoint)
+}
+
+// snapshotTarget returns copies of the worker's current target and marker,
+// taken under the worker lock.
+func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
+	w.mu.Lock()
+	target, marker := w.target, w.marker
+	w.mu.Unlock()
+	return target, marker
+}
+
+// recordMountpoint stores where the target is mounted and whether bee created
+// that mount, under the worker lock.
+func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
+	w.mu.Lock()
+	w.mountpoint, w.mountedByBee = mountpoint, mountedByBee
+	w.mu.Unlock()
+}
+
+// captureSnapshots writes the log snapshots below root: the combined journal
+// since boot, a per-service journal and status for every entry in
+// supportBundleServices, the dmesg output, and a copy of each optional file in
+// supportBundleOptionalFiles. Optional files that do not exist are skipped;
+// any other failure aborts and is returned.
+func (w *blackboxWorker) captureSnapshots(root string) error {
+	since := w.runtime.bootStarted.Format(time.RFC3339)
+	systemdDir := filepath.Join(root, "systemd")
+
+	combined := filepath.Join(systemdDir, "combined.journal.log")
+	if err := captureCommandAtomic(combined, "journalctl", "--no-pager", "--since", since); err != nil {
+		return err
+	}
+	for _, unit := range supportBundleServices {
+		journalPath := filepath.Join(systemdDir, unit+".journal.log")
+		if err := captureCommandAtomic(journalPath, "journalctl", "--no-pager", "-u", unit, "--since", since); err != nil {
+			return err
+		}
+		statusPath := filepath.Join(systemdDir, unit+".status.txt")
+		if err := captureCommandAtomic(statusPath, "systemctl", "status", unit, "--no-pager"); err != nil {
+			return err
+		}
+	}
+	if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
+		return err
+	}
+	for _, optional := range supportBundleOptionalFiles {
+		err := copyFileIfChanged(optional.src, filepath.Join(root, optional.name))
+		if err == nil || errors.Is(err, os.ErrNotExist) {
+			continue
+		}
+		return err
+	}
+	return nil
+}
+
+// persistState writes the runtime state file while holding the runtime lock.
+// persistStateLocked locks each registered worker in turn, so this must not be
+// called by a goroutine that already holds a worker's mu.
+func (rt *blackboxRuntime) persistState() {
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+	rt.persistStateLocked()
+}
+
+// persistStateLocked builds a BlackboxState snapshot from the registered
+// workers and writes it to rt.statePath. The caller must hold rt.mu; each
+// worker's mu is taken briefly while its fields are copied.
+//
+// The overall status is "disabled" with no workers, "running" otherwise, and
+// "degraded" as soon as any worker is degraded. Targets are sorted by
+// enrollment ID so the file content is stable across map iteration orders.
+func (rt *blackboxRuntime) persistStateLocked() {
+	state := BlackboxState{
+		Status:           "disabled",
+		BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
+		BootFolder:       rt.bootFolder,
+		UpdatedAtUTC:     blackboxNow().Format(time.RFC3339),
+		Targets:          make([]BlackboxTargetStatus, 0, len(rt.workers)),
+	}
+	if len(rt.workers) > 0 {
+		state.Status = "running"
+	}
+	for _, worker := range rt.workers {
+		worker.mu.Lock()
+		targetState := BlackboxTargetStatus{
+			EnrollmentID: worker.enrollmentID,
+			Device:       worker.target.Device,
+			// Report the filesystem type, not the whole target struct.
+			FS:          worker.target.FSType,
+			BootFolder:  rt.bootFolder,
+			Status:      worker.status,
+			FlushPeriod: worker.flushPeriod.String(),
+			LastError:   worker.lastError,
+			Mountpoint:  worker.mountpoint,
+		}
+		if !worker.lastSyncAt.IsZero() {
+			targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
+		}
+		if worker.lastDuration > 0 {
+			targetState.LastCycleDuration = worker.lastDuration.String()
+		}
+		if worker.status == "degraded" {
+			state.Status = "degraded"
+		}
+		worker.mu.Unlock()
+		state.Targets = append(state.Targets, targetState)
+	}
+	sort.Slice(state.Targets, func(i, j int) bool {
+		return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
+	})
+	// Best-effort: a failed state write must not disturb the workers.
+	_ = writeJSONAtomic(rt.statePath, state)
+}
+
+// bootStartedAtUTC returns the boot time from the "btime" line of /proc/stat
+// as a UTC time. It returns an error when the file cannot be read, or a
+// "boot time not found" error when no well-formed btime line is present.
+func bootStartedAtUTC() (time.Time, error) {
+	raw, err := os.ReadFile("/proc/stat")
+	if err != nil {
+		return time.Time{}, err
+	}
+	for _, entry := range strings.Split(string(raw), "\n") {
+		entry = strings.TrimSpace(entry)
+		if strings.HasPrefix(entry, "btime ") {
+			fields := strings.Fields(entry)
+			if len(fields) != 2 {
+				break
+			}
+			// The value is a count of seconds; it is parsed as a duration in
+			// seconds and converted back to whole seconds.
+			elapsed, parseErr := time.ParseDuration(fields[1] + "s")
+			if parseErr != nil {
+				break
+			}
+			return time.Unix(int64(elapsed/time.Second), 0).UTC(), nil
+		}
+	}
+	return time.Time{}, fmt.Errorf("boot time not found")
+}
+
+// newBlackboxEnrollmentID returns a new enrollment ID of the form "bb-" plus
+// 16 hex characters derived from 8 random bytes. If reading random bytes
+// fails, it falls back to "bb-" plus the current Unix time in nanoseconds.
+func newBlackboxEnrollmentID() string {
+	var entropy [8]byte
+	_, err := rand.Read(entropy[:])
+	if err == nil {
+		return "bb-" + hex.EncodeToString(entropy[:])
+	}
+	return fmt.Sprintf("bb-%d", time.Now().UnixNano())
+}
+
+// sanitizeRemovableTarget returns a copy of target with surrounding whitespace
+// trimmed from its Device, FSType, Size, Label, Model and Mountpoint fields.
+func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
+	fields := []*string{
+		&target.Device,
+		&target.FSType,
+		&target.Size,
+		&target.Label,
+		&target.Model,
+		&target.Mountpoint,
+	}
+	for _, field := range fields {
+		*field = strings.TrimSpace(*field)
+	}
+	return target
+}
+
+// ensureMountedTarget returns a writable mountpoint for target.
+//
+// If the target already reports a mountpoint that passes the write probe, that
+// path is returned with mountedByBee=false. Otherwise the device is mounted at
+// /tmp/bee-blackbox-<sanitized suffix>, write-probed, and returned with
+// mountedByBee=true. A mount that fails the write probe is unmounted again and
+// the probe error is returned.
+func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
+	target = sanitizeRemovableTarget(target)
+	if existing := target.Mountpoint; existing != "" && ensureWritableBlackboxMountpoint(existing) == nil {
+		return existing, false, nil
+	}
+
+	mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
+	if mkdirErr := os.MkdirAll(mountpoint, 0755); mkdirErr != nil {
+		return "", false, mkdirErr
+	}
+	output, mountErr := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput()
+	if mountErr != nil {
+		return "", false, formatBlackboxMountTargetError(target, string(output), mountErr)
+	}
+	if probeErr := ensureWritableBlackboxMountpoint(mountpoint); probeErr != nil {
+		_ = unmountTarget(mountpoint)
+		return "", false, probeErr
+	}
+	return mountpoint, true, nil
+}
+
+// unmountTarget runs `sync` (ignoring its result) and then `umount` on
+// mountpoint. On failure the returned error wraps the command error and is
+// prefixed with the trimmed command output when there is any.
+func unmountTarget(mountpoint string) error {
+	_ = blackboxExecCommand("sync").Run()
+	output, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
+	if err == nil {
+		return nil
+	}
+	if detail := strings.TrimSpace(string(output)); detail != "" {
+		return fmt.Errorf("%s: %w", detail, err)
+	}
+	return err
+}
+
+// readBlackboxMarker loads and decodes the marker file at the root of
+// mountpoint. The boolean is true only when the marker was read and decoded.
+// A missing file yields os.ErrNotExist; other read or JSON errors are returned
+// unchanged.
+func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
+	raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
+	switch {
+	case errors.Is(err, os.ErrNotExist):
+		return BlackboxMarker{}, false, os.ErrNotExist
+	case err != nil:
+		return BlackboxMarker{}, false, err
+	}
+
+	var marker BlackboxMarker
+	if decodeErr := json.Unmarshal(raw, &marker); decodeErr != nil {
+		return BlackboxMarker{}, false, decodeErr
+	}
+	return marker, true, nil
+}
+
+// writeBlackboxMarker writes marker as JSON to the marker file at the root of
+// mountpoint. A zero Version is replaced with 1 before writing.
+func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
+	const defaultMarkerVersion = 1
+	markerPath := filepath.Join(mountpoint, blackboxMarkerName)
+	if marker.Version == 0 {
+		marker.Version = defaultMarkerVersion
+	}
+	return writeJSONAtomic(markerPath, marker)
+}
+
+// syncDirectoryTree mirrors srcDir into dstDir: directories are created with
+// the source permission bits, files are copied when their content differs,
+// and afterwards everything under dstDir that was not seen in srcDir is
+// removed.
+func syncDirectoryTree(srcDir, dstDir string) error {
+	seen := map[string]struct{}{}
+	visit := func(path string, d fs.DirEntry, walkErr error) error {
+		if walkErr != nil {
+			return walkErr
+		}
+		rel, err := filepath.Rel(srcDir, path)
+		if err != nil {
+			return err
+		}
+		rel = filepath.Clean(rel)
+		seen[rel] = struct{}{}
+		if rel == "." {
+			return os.MkdirAll(dstDir, 0755)
+		}
+		target := filepath.Join(dstDir, rel)
+		if !d.IsDir() {
+			return copyFileIfChanged(path, target)
+		}
+		info, err := d.Info()
+		if err != nil {
+			return err
+		}
+		return os.MkdirAll(target, info.Mode().Perm())
+	}
+	if err := filepath.WalkDir(srcDir, visit); err != nil {
+		return err
+	}
+	return removeMissingPaths(dstDir, seen)
+}
+
+// removeMissingPaths deletes every entry under dstDir whose path relative to
+// dstDir is not a key of seen. dstDir itself is never removed.
+//
+// Unseen directories are removed recursively and then skipped, so the walk
+// does not try to descend into a directory that no longer exists (which would
+// otherwise surface as a "no such file or directory" walk error).
+func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
+	return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		rel, err := filepath.Rel(dstDir, path)
+		if err != nil {
+			return err
+		}
+		rel = filepath.Clean(rel)
+		if rel == "." {
+			return nil
+		}
+		if _, ok := seen[rel]; ok {
+			return nil
+		}
+		if err := os.RemoveAll(path); err != nil {
+			return err
+		}
+		if d.IsDir() {
+			// The whole subtree is gone; tell WalkDir not to read it.
+			return filepath.SkipDir
+		}
+		return nil
+	})
+}
+
+// copyFileIfChanged copies src to dst unless dst already has identical
+// content. When src is a directory, dst is created as a directory with the
+// same permission bits instead. The file is written through writeFileAtomic
+// with the source's permission bits.
+func copyFileIfChanged(src, dst string) error {
+	info, err := os.Stat(src)
+	if err != nil {
+		return err
+	}
+	mode := info.Mode().Perm()
+	if info.IsDir() {
+		return os.MkdirAll(dst, mode)
+	}
+	content, err := os.ReadFile(src)
+	if err != nil {
+		return err
+	}
+	current, readErr := os.ReadFile(dst)
+	if readErr == nil && bytes.Equal(current, content) {
+		return nil
+	}
+	return writeFileAtomic(dst, content, mode)
+}
+
+// captureCommandAtomic runs the command and writes its combined output to dst
+// with mode 0644. When the command produced no output, the file contains the
+// command error text if it failed, or "no output" otherwise. A command failure
+// by itself is not returned; only the write error is.
+func captureCommandAtomic(dst string, name string, args ...string) error {
+	output, runErr := blackboxExecCommand(name, args...).CombinedOutput()
+	switch {
+	case len(output) > 0:
+		// Keep whatever the command printed, even if it failed.
+	case runErr != nil:
+		output = []byte(runErr.Error() + "\n")
+	default:
+		output = []byte("no output\n")
+	}
+	return writeFileAtomic(dst, output, 0644)
+}
+
+// writeJSONAtomic encodes v as two-space-indented JSON followed by a newline
+// and writes it to path via writeFileAtomic with mode 0644.
+func writeJSONAtomic(path string, v any) error {
+	encoded, err := json.MarshalIndent(v, "", "  ")
+	if err == nil {
+		return writeFileAtomic(path, append(encoded, '\n'), 0644)
+	}
+	return err
+}
+
+// writeFileAtomic replaces the file at path with data by writing a sibling
+// "<path>.tmp" file, syncing it, and renaming it over path. Parent directories
+// are created as needed. If path already holds exactly data, nothing is
+// written. After a successful rename the filesystem is synced.
+//
+// The temporary file is removed when writing or renaming fails, so a failed
+// update does not leave a partial "<path>.tmp" behind.
+func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return err
+	}
+	if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
+		return nil
+	}
+	tmp := path + ".tmp"
+	if err := writeSyncedTempFile(tmp, data, perm); err != nil {
+		return err
+	}
+	if err := os.Rename(tmp, path); err != nil {
+		// Best-effort cleanup; the rename error is what the caller needs.
+		_ = os.Remove(tmp)
+		return err
+	}
+	return syncFilesystem(filepath.Dir(path))
+}
+
+// writeSyncedTempFile creates or truncates tmp, writes data, and fsyncs and
+// closes it. If any step after opening fails, the file is removed again.
+func writeSyncedTempFile(tmp string, data []byte, perm os.FileMode) error {
+	f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
+	if err != nil {
+		return err
+	}
+	if _, err := f.Write(data); err != nil {
+		_ = f.Close()
+		_ = os.Remove(tmp)
+		return err
+	}
+	if err := f.Sync(); err != nil {
+		_ = f.Close()
+		_ = os.Remove(tmp)
+		return err
+	}
+	if err := f.Close(); err != nil {
+		_ = os.Remove(tmp)
+		return err
+	}
+	return nil
+}
+
+// syncFilesystem runs the `sync` command and returns its error. The path
+// argument is currently unused: the command is invoked without arguments.
+func syncFilesystem(path string) error {
+	return blackboxExecCommand("sync").Run()
+}
+
+// ensureWritableBlackboxMountpoint checks that files can be created in
+// mountpoint by creating, closing and removing a temporary probe file. A
+// creation failure is reported as "target filesystem is not writable"; close
+// and remove failures are returned unchanged.
+func ensureWritableBlackboxMountpoint(mountpoint string) error {
+	probe, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
+	if err != nil {
+		return fmt.Errorf("target filesystem is not writable: %w", err)
+	}
+	probePath := probe.Name()
+	if err := probe.Close(); err != nil {
+		_ = os.Remove(probePath)
+		return err
+	}
+	return os.Remove(probePath)
+}
+
+// formatBlackboxMountTargetError turns a failed mount into a readable error
+// that wraps err. An exFAT target whose mount output reports an unknown exfat
+// filesystem type gets a dedicated "exFAT support is missing" message;
+// otherwise the trimmed mount output is used as a prefix, or err is returned
+// as-is when there was no output.
+func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
+	detail := strings.TrimSpace(raw)
+	isExfat := strings.ToLower(strings.TrimSpace(target.FSType)) == "exfat"
+	switch {
+	case isExfat && strings.Contains(strings.ToLower(detail), "unknown filesystem type 'exfat'"):
+		return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
+	case detail == "":
+		return err
+	default:
+		return fmt.Errorf("%s: %w", detail, err)
+	}
+}
--- a/audit/internal/app/blackbox_test.go
+++ b/audit/internal/app/blackbox_test.go
@@ -0,0 +1,52 @@
+package app
+
+import (
+	"path/filepath"
+	"testing"
+	"time"
+)
+
+// TestAdjustFlushPeriodGrowsOnSlowCycle checks that a failed cycle that took
+// longer than the flush period yields a longer period.
+func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
+	const period = 2 * time.Second
+	next := adjustFlushPeriod(period, 4*time.Second, false, 0)
+	if next > period {
+		return
+	}
+	t.Fatalf("adjustFlushPeriod=%s want > %s", next, period)
+}
+
+// TestAdjustFlushPeriodShrinksAfterFastCycles checks that enough consecutive
+// fast successful cycles shorten the period without going below the minimum.
+func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
+	const period = 10 * time.Second
+	next := adjustFlushPeriod(period, 2*time.Second, true, blackboxRecoveryFastCount)
+	switch {
+	case next >= period:
+		t.Fatalf("adjustFlushPeriod=%s want < %s", next, period)
+	case next < blackboxMinFlushPeriod:
+		t.Fatalf("adjustFlushPeriod=%s below min %s", next, blackboxMinFlushPeriod)
+	}
+}
+
+// TestReadBlackboxState round-trips a state document through writeJSONAtomic
+// and ReadBlackboxState and checks that the key fields survive.
+func TestReadBlackboxState(t *testing.T) {
+	statePath := filepath.Join(t.TempDir(), "blackbox-state.json")
+	target := BlackboxTargetStatus{
+		EnrollmentID: "bb-1",
+		Device:       "/dev/sdb1",
+		Status:       "running",
+		FlushPeriod:  "1s",
+	}
+	want := BlackboxState{
+		Status:           "running",
+		BootStartedAtUTC: "2026-04-24T00:00:00Z",
+		BootFolder:       "boot-folder",
+		UpdatedAtUTC:     "2026-04-24T00:00:01Z",
+		Targets:          []BlackboxTargetStatus{target},
+	}
+	if err := writeJSONAtomic(statePath, want); err != nil {
+		t.Fatalf("writeJSONAtomic: %v", err)
+	}
+
+	got, err := ReadBlackboxState(statePath)
+	if err != nil {
+		t.Fatalf("ReadBlackboxState: %v", err)
+	}
+	// Cases are evaluated left to right, so Targets[0] is only read once the
+	// length check has passed.
+	switch {
+	case got.Status != want.Status,
+		got.BootFolder != want.BootFolder,
+		len(got.Targets) != 1,
+		got.Targets[0].EnrollmentID != "bb-1":
+		t.Fatalf("state=%+v", got)
+	}
+}
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return nil, err
 	}
-	data, err := os.ReadFile(path)
+	data, err := readFileLimited(path, 10<<20)
 	if err != nil && !os.IsNotExist(err) {
 		return nil, err
 	}
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -3,10 +3,11 @@ package app
 import (
 	"os"
 	"path/filepath"
-	"strconv"
 	"sort"
+	"strconv"
 	"strings"

+	"bee/audit/internal/collector"
 	"bee/audit/internal/schema"
 )

@@ -313,17 +314,20 @@ func statusSeverity(status string) int {
 }

 func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
-	if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
-		if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
-			return false
-		}
+	if dev.DeviceClass == nil {
+		return false
+	}
+	class := strings.TrimSpace(*dev.DeviceClass)
+	isGPUClass := strings.Contains(class, "Controller") || strings.Contains(class, "Accelerator") ||
+		strings.Contains(class, "Display") || strings.Contains(class, "Video")
+	if !isGPUClass {
+		return false
 	}
-	manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
 	switch vendor {
 	case "amd":
-		return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
+		return dev.VendorID != nil && *dev.VendorID == collector.AMDVendorID
 	case "nvidia":
-		return strings.Contains(manufacturer, "nvidia")
+		return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
 	default:
 		return false
 	}
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -5,6 +5,7 @@ import (
 	"path/filepath"
 	"testing"

+	"bee/audit/internal/collector"
 	"bee/audit/internal/schema"
 )

@@ -46,10 +47,12 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {

 	class := "DisplayController"
 	manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
+	amdVendorID := collector.AMDVendorID
 	snap := schema.HardwareSnapshot{
 		PCIeDevices: []schema.HardwarePCIeDevice{{
 			DeviceClass:  &class,
 			Manufacturer: &manufacturer,
+			VendorID:     &amdVendorID,
 		}},
 	}

--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -2,6 +2,7 @@ package app

 import (
 	"archive/tar"
+	"bee/audit/internal/platform"
 	"compress/gzip"
 	"fmt"
 	"io"
@@ -14,6 +15,7 @@ import (
 )

 var supportBundleServices = []string{
+	"bee-blackbox.service",
 	"bee-audit.service",
 	"bee-web.service",
 	"bee-network.service",
@@ -22,6 +24,10 @@ var supportBundleServices = []string{
 	"bee-selfheal.service",
 	"bee-selfheal.timer",
 	"bee-sshsetup.service",
+	"display-manager.service",
+	"lightdm.service",
+	"nvidia-dcgm.service",
+	"nvidia-fabricmanager.service",
 }

 var supportBundleCommands = []struct {
@@ -40,14 +46,167 @@ var supportBundleCommands = []struct {
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
 	{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
+	{name: "system/dmesg-gui-video-input.txt", cmd: []string{"sh", "-c", `
+if command -v dmesg >/dev/null 2>&1; then
+  dmesg | grep -iE 'nvidia|drm|fb|framebuffer|vesa|efi|lightdm|Xorg|input|hid|usb|keyboard|mouse|virtual keyboard|virtual mouse|ami|aspeed|ast' || echo "no GUI/video/input kernel messages found"
+else
+  echo "dmesg not found"
+fi
+`}},
 	{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
 if command -v dmesg >/dev/null 2>&1; then
  dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
 else
  echo "dmesg not found"
 fi
+`}},
+	{name: "system/loginctl-sessions.txt", cmd: []string{"sh", "-c", `
+if command -v loginctl >/dev/null 2>&1; then
+  loginctl list-sessions 2>&1 || true
+else
+  echo "loginctl not found"
+fi
+`}},
+	{name: "system/loginctl-seats.txt", cmd: []string{"sh", "-c", `
+if command -v loginctl >/dev/null 2>&1; then
+  loginctl list-seats 2>&1 || true
+  echo
+  for seat in $(loginctl list-seats --no-legend 2>/dev/null | awk '{print $1}'); do
+    echo "=== $seat ==="
+    loginctl seat-status "$seat" 2>&1 || true
+    echo
+  done
+else
+  echo "loginctl not found"
+fi
+`}},
+	{name: "system/ps-gui.txt", cmd: []string{"sh", "-c", `
+ps -ef | grep -iE 'lightdm|Xorg|X$|openbox|chromium|chrome|xinit|xsession' | grep -v grep || echo "no GUI processes found"
+`}},
+	{name: "system/lspci-video-vv.txt", cmd: []string{"sh", "-c", `
+if ! command -v lspci >/dev/null 2>&1; then
+  echo "lspci not found"
+  exit 0
+fi
+found=0
+for dev in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ {print $1}'); do
+  found=1
+  echo "=== $dev ==="
+  lspci -s "$dev" -vv 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no display-class PCI devices found"
+fi
+`}},
+	{name: "system/proc-fb.txt", cmd: []string{"cat", "/proc/fb"}},
+	{name: "system/drm-cards.txt", cmd: []string{"sh", "-c", `
+if [ -d /sys/class/drm ]; then
+  for path in /sys/class/drm/card*; do
+    [ -e "$path" ] || continue
+    card=$(basename "$path")
+    echo "=== $card ==="
+    for f in status enabled dpms modes; do
+      [ -r "$path/$f" ] && printf "  %-8s %s\n" "$f" "$(cat "$path/$f" 2>/dev/null)"
+    done
+    device=$(readlink -f "$path/device" 2>/dev/null || true)
+    [ -n "$device" ] && echo "  device   ${device##*/}"
+    echo
+  done
+else
+  echo "/sys/class/drm not present"
+fi
+`}},
+	{name: "system/input-devices.txt", cmd: []string{"sh", "-c", `
+if [ -r /proc/bus/input/devices ]; then
+  cat /proc/bus/input/devices
+else
+  echo "/proc/bus/input/devices not readable"
+fi
+`}},
+	{name: "system/udevadm-input.txt", cmd: []string{"sh", "-c", `
+if ! command -v udevadm >/dev/null 2>&1; then
+  echo "udevadm not found"
+  exit 0
+fi
+found=0
+for dev in /dev/input/event*; do
+  [ -e "$dev" ] || continue
+  found=1
+  echo "=== $dev ==="
+  udevadm info --query=all --name="$dev" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no /dev/input/event* devices found"
+fi
+`}},
+	{name: "system/xinput-list.txt", cmd: []string{"sh", "-c", `
+if command -v xinput >/dev/null 2>&1; then
+  DISPLAY=:0 xinput --list 2>&1 || true
+else
+  echo "xinput not found"
+fi
+`}},
+	{name: "system/libinput-list-devices.txt", cmd: []string{"sh", "-c", `
+if command -v libinput >/dev/null 2>&1; then
+  libinput list-devices 2>&1 || true
+else
+  echo "libinput not found"
+fi
+`}},
+	{name: "system/systemctl-gui-units.txt", cmd: []string{"sh", "-c", `
+if ! command -v systemctl >/dev/null 2>&1; then
+  echo "systemctl not found"
+  exit 0
+fi
+echo "=== unit files ==="
+systemctl list-unit-files --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
+echo
+echo "=== active units ==="
+systemctl list-units --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
+echo
+echo "=== failed units ==="
+systemctl --failed --no-pager 2>&1 | grep -iE 'lightdm|display-manager|Xorg' || echo "no failed GUI units"
 `}},
 	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
+	{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
+if command -v nvidia-smi >/dev/null 2>&1; then
+  nvidia-smi topo -m 2>&1 || true
+else
+  echo "nvidia-smi not found"
+fi
+`}},
+	{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
+if ! command -v systemctl >/dev/null 2>&1; then
+  echo "systemctl not found"
+  exit 0
+fi
+echo "=== unit files ==="
+systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
+echo
+echo "=== active units ==="
+systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
+echo
+echo "=== failed units ==="
+systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
+`}},
+	{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
+for candidate in \
+  /usr/bin/nvidia-fabricmanager \
+  /usr/bin/nv-fabricmanager \
+  /usr/bin/nvidia-fabricmanagerd \
+  /usr/bin/nvlsm; do
+  if [ -e "$candidate" ]; then
+    echo "=== $candidate ==="
+    ls -l "$candidate" 2>&1 || true
+    echo
+  fi
+done
+if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
+  echo "no fabric manager binaries found"
+fi
+`}},
 	{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
 if ! command -v lspci >/dev/null 2>&1; then
  echo "lspci not found"
@@ -195,6 +354,17 @@ var supportBundleOptionalFiles = []struct {
 }{
 	{name: "system/kern.log", src: "/var/log/kern.log"},
 	{name: "system/syslog.txt", src: "/var/log/syslog"},
+	{name: "system/Xorg.0.log", src: "/var/log/Xorg.0.log"},
+	{name: "system/Xorg.0.log.old", src: "/var/log/Xorg.0.log.old"},
+	{name: "system/lightdm/lightdm.log", src: "/var/log/lightdm/lightdm.log"},
+	{name: "system/lightdm/x-0.log", src: "/var/log/lightdm/x-0.log"},
+	{name: "system/lightdm/x-0-greeter.log", src: "/var/log/lightdm/x-0-greeter.log"},
+	{name: "system/home-bee-xsession-errors.log", src: "/home/bee/.xsession-errors"},
+	{name: "system/home-bee-chromium-debug.log", src: "/tmp/bee-chrome/chrome_debug.log"},
+	{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
+	{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
+	{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
+	{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
 }

 const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
@@ -212,11 +382,6 @@ func BuildSupportBundle(exportDir string) (string, error) {
 	}

 	now := time.Now().UTC()
-	date := now.Format("2006-01-02")
-	tod := now.Format("150405")
-	ver := bundleVersion()
-	model := serverModelForBundle()
-	sn := serverSerialForBundle()

 	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
 	if err := os.MkdirAll(stageRoot, 0755); err != nil {
@@ -250,7 +415,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}

-	archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
+	archiveName := SupportBundleBaseName(now) + ".tar.gz"
 	archivePath := filepath.Join(os.TempDir(), archiveName)
 	if err := createSupportTarGz(archivePath, stageRoot); err != nil {
 		return "", err
@@ -258,6 +423,16 @@ func BuildSupportBundle(exportDir string) (string, error) {
 	return archivePath, nil
 }

+// SupportBundleBaseName returns the support bundle file name, without
+// extension, for the given time (converted to UTC). The layout is
+// "<date> (BEE-SP v<version>) <model> <serial> <HHMMSS>".
+func SupportBundleBaseName(at time.Time) string {
+	utc := at.UTC()
+	return fmt.Sprintf(
+		"%s (BEE-SP v%s) %s %s %s",
+		utc.Format("2006-01-02"),
+		bundleVersion(),
+		serverModelForBundle(),
+		serverSerialForBundle(),
+		utc.Format("150405"),
+	)
+}
+
 func LatestSupportBundlePath() (string, error) {
 	return latestSupportBundlePath(os.TempDir())
 }
@@ -381,6 +556,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
 	fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
 	fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
 	fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
+	if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
+		fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
+		fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
+		if strings.TrimSpace(cfg.Reason) != "" {
+			fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
+		}
+	}
 	fmt.Fprintf(&body, "\nfiles:\n")

 	var files []string
--- a/audit/internal/collector/amdgpu.go
+++ b/audit/internal/collector/amdgpu.go
@@ -84,11 +84,10 @@ func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
 }

 func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
-	if dev.Manufacturer == nil || dev.DeviceClass == nil {
+	if dev.DeviceClass == nil {
 		return false
 	}
-	manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
-	return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
+	return dev.VendorID != nil && *dev.VendorID == AMDVendorID && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
 }

 func queryAMDGPUs() (map[string]amdGPUInfo, error) {
--- a/audit/internal/collector/board.go
+++ b/audit/internal/collector/board.go
@@ -3,6 +3,7 @@ package collector
 import (
 	"bee/audit/internal/schema"
 	"bufio"
+	"context"
 	"log/slog"
 	"os"
 	"os/exec"
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
 	return string(out), nil
 }

-var execIpmitool = func(args ...string) (string, error) {
-	out, err := exec.Command("ipmitool", args...).Output()
-	if err != nil {
-		return "", err
-	}
-	return string(out), nil
-}
-
 // collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
 // plus the BIOS firmware entry. Any failure is logged and returns zero values.
 func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {

 // collectBMCFirmware collects BMC firmware version via ipmitool mc info.
 // Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
-func collectBMCFirmware() []schema.HardwareFirmwareRecord {
+func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
 	if _, err := exec.LookPath("ipmitool"); err != nil {
 		return nil
 	}
 	if _, err := os.Stat("/dev/ipmi0"); err != nil {
 		return nil
 	}
-	out, err := execIpmitool("mc", "info")
+	profile := selectIPMIProfile(manufacturer)
+	ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
+	defer cancel()
+	cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
+	raw, err := cmd.Output()
 	if err != nil {
 		slog.Info("bmc: ipmitool mc info unavailable", "err", err)
 		return nil
 	}
-	version := parseBMCFirmwareRevision(out)
+	version := parseBMCFirmwareRevision(string(raw))
 	if version == "" {
 		return nil
 	}
--- a/audit/internal/collector/collector.go
+++ b/audit/internal/collector/collector.go
@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
 	board, biosFW := collectBoard()
 	snap.Board = board
 	snap.Firmware = append(snap.Firmware, biosFW...)
-	snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
+	snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)

 	snap.CPUs = collectCPUs()

@@ -34,17 +34,20 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
 	}
 	snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
 	snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
+	bestEffortRescanHotplugStorage()
 	snap.Storage = collectStorage()
 	snap.PCIeDevices = collectPCIe()
 	snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
+	snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
 	snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
 	snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
-	snap.PowerSupplies = collectPSUs()
+	snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
+	snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
 	snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
 	snap.Sensors = buildSensorsFromDoc(sensorDoc)
 	finalizeSnapshot(&snap, collectedAt)
--- a/audit/internal/collector/ipmi_profile.go
+++ b/audit/internal/collector/ipmi_profile.go
@@ -0,0 +1,92 @@
+package collector
+
+// Package-level IPMI tuning profiles.
+//
+// Each profile is matched by board manufacturer (already known before PSU
+// collection runs). The profile drives two things:
+//   - Per-command timeouts — prevents infinite hangs on slow BMCs.
+//   - FRU early-exit — streaming parser stops reading once all PSU entries
+//     are found, avoiding the tail of non-PSU FRU records.
+//
+// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
+
+import (
+	"strings"
+	"time"
+)
+
+// ipmiProfile holds tuning parameters for one or more board manufacturers.
+type ipmiProfile struct {
+	// name is shown in log messages.
+	name string
+	// manufacturers is a list of lowercase substrings matched against the
+	// board manufacturer string from dmidecode type 1.
+	manufacturers []string
+	// fruTimeout is the hard deadline for the entire `ipmitool fru print`
+	// command. Zero means no timeout (not recommended).
+	fruTimeout time.Duration
+	// sdrTimeout is the hard deadline for `ipmitool sdr`.
+	sdrTimeout time.Duration
+	// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
+	mcInfoTimeout time.Duration
+	// fruEarlyExit instructs the streaming FRU parser to stop reading
+	// after it has found at least one PSU entry and the current block is
+	// complete. Useful on servers with many non-PSU FRU devices.
+	fruEarlyExit bool
+}
+
+// ipmiProfiles is the ordered list of profiles. First match wins.
+var ipmiProfiles = []ipmiProfile{
+	{
+		// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
+		// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
+		// full `fru print` scan takes ~108 s on a loaded BMC. Enable early
+		// exit so collection stops once PSU records are found.
+		name:          "lenovo",
+		manufacturers: []string{"lenovo"},
+		fruTimeout:    90 * time.Second,
+		sdrTimeout:    45 * time.Second,
+		mcInfoTimeout: 15 * time.Second,
+		fruEarlyExit:  true,
+	},
+	{
+		// HPE iLO-based servers (ProLiant DL/ML/BL).
+		name:          "hpe",
+		manufacturers: []string{"hp", "hewlett packard"},
+		fruTimeout:    60 * time.Second,
+		sdrTimeout:    30 * time.Second,
+		mcInfoTimeout: 10 * time.Second,
+		fruEarlyExit:  false,
+	},
+	{
+		// Dell iDRAC-based servers.
+		name:          "dell",
+		manufacturers: []string{"dell"},
+		fruTimeout:    60 * time.Second,
+		sdrTimeout:    30 * time.Second,
+		mcInfoTimeout: 10 * time.Second,
+		fruEarlyExit:  false,
+	},
+}
+
+// defaultIPMIProfile is used when no vendor profile matches.
+var defaultIPMIProfile = ipmiProfile{
+	name:          "default",
+	fruTimeout:    60 * time.Second,
+	sdrTimeout:    30 * time.Second,
+	mcInfoTimeout: 10 * time.Second,
+	fruEarlyExit:  false,
+}
+
+// selectIPMIProfile returns the profile for the given board manufacturer.
+func selectIPMIProfile(manufacturer string) ipmiProfile {
+	mfgLower := strings.ToLower(strings.TrimSpace(manufacturer))
+	for _, p := range ipmiProfiles {
+		for _, m := range p.manufacturers {
+			if strings.Contains(mfgLower, m) {
+				return p
+			}
+		}
+	}
+	return defaultIPMIProfile
+}
--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -11,7 +11,6 @@ import (
 	"time"
 )

-const mellanoxVendorID = 0x15b3
 const nicProbeTimeout = 2 * time.Second

 var (
@@ -80,16 +79,7 @@ func enrichPCIeWithMellanox(devs []schema.HardwarePCIeDevice) []schema.HardwareP
 }

 func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool {
-	if dev.VendorID != nil && *dev.VendorID == mellanoxVendorID {
-		return true
-	}
-	if dev.Manufacturer != nil {
-		m := strings.ToLower(*dev.Manufacturer)
-		if strings.Contains(m, "mellanox") || strings.Contains(m, "nvidia networking") {
-			return true
-		}
-	}
-	return false
+	return dev.VendorID != nil && *dev.VendorID == MellanoxVendorID
 }

 func queryMellanoxFromMstflint(bdf string) (firmware, serial string) {
--- a/audit/internal/collector/nic_mellanox_test.go
+++ b/audit/internal/collector/nic_mellanox_test.go
@@ -55,7 +55,7 @@ func TestEnrichPCIeWithMellanox_mstflint(t *testing.T) {
 	}
 	netIfacesByBDF = func(string) []string { return nil }

-	vendorID := mellanoxVendorID
+	vendorID := MellanoxVendorID
 	bdf := "0000:18:00.0"
 	manufacturer := "Mellanox Technologies"
 	devs := []schema.HardwarePCIeDevice{{
@@ -99,7 +99,7 @@ func TestEnrichPCIeWithMellanox_fallbackEthtool(t *testing.T) {
 		return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil
 	}

-	vendorID := mellanoxVendorID
+	vendorID := MellanoxVendorID
 	bdf := "0000:18:00.0"
 	manufacturer := "NVIDIA Networking"
 	devs := []schema.HardwarePCIeDevice{{
--- a/audit/internal/collector/nvidia.go
+++ b/audit/internal/collector/nvidia.go
@@ -10,8 +10,6 @@ import (
 	"strings"
 )

-const nvidiaVendorID = 0x10de
-
 type nvidiaGPUInfo struct {
 	Index              int
 	BDF                string
@@ -240,13 +238,7 @@ func normalizePCIeBDF(bdf string) string {
 }

 func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
-	if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
-		return true
-	}
-	if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
-		return true
-	}
-	return false
+	return dev.VendorID != nil && *dev.VendorID == NvidiaVendorID
 }

 func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
--- a/audit/internal/collector/nvidia_test.go
+++ b/audit/internal/collector/nvidia_test.go
@@ -57,7 +57,7 @@ func TestNormalizePCIeBDF(t *testing.T) {
 }

 func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
-	vendorID := nvidiaVendorID
+	vendorID := NvidiaVendorID
 	bdf := "0000:65:00.0"
 	manufacturer := "NVIDIA Corporation"
 	status := "OK"
@@ -104,7 +104,7 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
 }

 func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
-	vendorID := nvidiaVendorID
+	vendorID := NvidiaVendorID
 	bdf := "0000:17:00.0"
 	manufacturer := "NVIDIA Corporation"
 	devices := []schema.HardwarePCIeDevice{
--- a/audit/internal/collector/pci_vendors.go
+++ b/audit/internal/collector/pci_vendors.go
@@ -0,0 +1,11 @@
+package collector
+
+// PCI vendor IDs for hardware classification.
+// Source: https://pcisig.com / https://pci-ids.ucw.cz/
+const (
+	NvidiaVendorID   = 0x10de
+	AMDVendorID      = 0x1002
+	AspeedVendorID   = 0x1a03
+	MellanoxVendorID = 0x15b3
+	IntelVendorID    = 0x8086
+)
--- a/audit/internal/collector/pcie.go
+++ b/audit/internal/collector/pcie.go
@@ -4,7 +4,9 @@ import (
 	"bee/audit/internal/schema"
 	"fmt"
 	"log/slog"
+	"os"
 	"os/exec"
+	"path/filepath"
 	"strconv"
 	"strings"
 )
@@ -124,35 +126,39 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
 	dev.Status = &status

 	// Slot is the BDF: "0000:00:02.0"
-	if bdf := fields["Slot"]; bdf != "" {
-		dev.Slot = &bdf
-		dev.BDF = &bdf
+	bdfStr := fields["Slot"]
+	if bdfStr != "" {
+		dev.Slot = &bdfStr
+		dev.BDF = &bdfStr
 		// parse vendor_id and device_id from sysfs
-		vendorID, deviceID := readPCIIDs(bdf)
+		vendorID, deviceID := readPCIIDs(bdfStr)
 		if vendorID != 0 {
 			dev.VendorID = &vendorID
 		}
 		if deviceID != 0 {
 			dev.DeviceID = &deviceID
 		}
-		if numaNode, ok := readPCINumaNode(bdf); ok {
+		if numaNode, ok := readPCINumaNode(bdfStr); ok {
 			dev.NUMANode = &numaNode
 		} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
 			dev.NUMANode = &numaNode
 		}
-		if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
+		if group, ok := readPCIIOMMUGroup(bdfStr); ok {
+			dev.IOMMUGroup = &group
+		}
+		if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok {
 			dev.LinkWidth = &width
 		}
-		if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
+		if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok {
 			dev.MaxLinkWidth = &width
 		}
-		if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
+		if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok {
 			linkSpeed := normalizePCILinkSpeed(speed)
 			if linkSpeed != "" {
 				dev.LinkSpeed = &linkSpeed
 			}
 		}
-		if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
+		if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok {
 			linkSpeed := normalizePCILinkSpeed(speed)
 			if linkSpeed != "" {
 				dev.MaxLinkSpeed = &linkSpeed
@@ -173,12 +179,35 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {

 	// SVendor/SDevice available but not in schema — skip

-	// Warn if PCIe link is running below its maximum negotiated speed.
+	// Detect NVLink bridge mezzanine cards (CPU→HGX internal link).
+	// These are Mellanox x2 devices with no host net interfaces and a DeviceName
+	// containing "NVLINK". The targeted lspci call is only executed for the small
+	// number of narrow-link Mellanox cards that pass the cheap pre-filter.
+	if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) {
+		markNVLinkBridge(&dev)
+	}
+
+	// Warn (or Critical for NVLink bridges) if PCIe link is running below max.
 	applyPCIeLinkSpeedWarning(&dev)

 	return dev
 }

+// readPCIIOMMUGroup resolves the IOMMU group number for a BDF via the
+// iommu_group symlink in sysfs: .../devices/<bdf>/iommu_group -> .../kernel/iommu_groups/<N>
+func readPCIIOMMUGroup(bdf string) (int, bool) {
+	link := "/sys/bus/pci/devices/" + bdf + "/iommu_group"
+	target, err := os.Readlink(link)
+	if err != nil {
+		return 0, false
+	}
+	n, err := strconv.Atoi(filepath.Base(target))
+	if err != nil {
+		return 0, false
+	}
+	return n, true
+}
+
 // readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
 func readPCIIDs(bdf string) (vendorID, deviceID int) {
 	base := "/sys/bus/pci/devices/" + bdf
@@ -245,17 +274,37 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
 	return value, true
 }

-// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
-// speed is below the maximum negotiated speed supported by both ends.
+// applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is
+// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
+// get Critical because they are fixed internal connectors that must always train
+// to max speed — any downgrade signals a hardware fault.
+//
+// Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and
+// their link state has no operational impact. This covers management endpoints
+// (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never
+// activates but that lspci still reports with link stats.
 func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
 	if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
 		return
 	}
-	if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
+	if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
+		return
+	}
+	if dev.BDF != nil {
+		if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 {
+			return
+		}
+	}
+	desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
+	dev.ErrorDescription = &desc
+
+	isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge"
+	if isNVLinkBridge {
+		crit := statusCritical
+		dev.Status = &crit
+	} else {
 		warn := statusWarning
 		dev.Status = &warn
-		desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
-		dev.ErrorDescription = &desc
 	}
 }

--- a/audit/internal/collector/pcie_nvlink_bridge.go
+++ b/audit/internal/collector/pcie_nvlink_bridge.go
@@ -0,0 +1,206 @@
+package collector
+
+import (
+	"bee/audit/internal/schema"
+	"log/slog"
+	"os/exec"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
+
+// isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like
+// NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces.
+// These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems.
+func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
+	if !isMellanoxDevice(dev) {
+		return false
+	}
+	if dev.LinkWidth == nil || *dev.LinkWidth > 2 {
+		return false
+	}
+	if len(netIfacesByBDF(bdf)) > 0 {
+		return false
+	}
+	return true
+}
+
+// confirmNVLinkBridgeDeviceName checks if the lspci DeviceName for bdf contains
+// "NVLINK". This is a targeted single-device call, only executed for candidates
+// already pre-filtered by isNVLinkBridgeCandidate.
+func confirmNVLinkBridgeDeviceName(bdf string) bool {
+	out, err := exec.Command("lspci", "-s", bdf, "-v").Output()
+	if err != nil {
+		return false
+	}
+	for _, line := range strings.Split(string(out), "\n") {
+		if strings.Contains(strings.ToUpper(strings.TrimSpace(line)), "NVLINK") {
+			return true
+		}
+	}
+	return false
+}
+
+// markNVLinkBridge overwrites device_class and adds telemetry flags on a detected
+// NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the
+// correct severity (Critical) is applied.
+func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
+	class := "NVLinkBridge"
+	dev.DeviceClass = &class
+	if dev.Telemetry == nil {
+		dev.Telemetry = map[string]any{}
+	}
+	dev.Telemetry["nvlink_bridge"] = true
+}
+
+// enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with
+// the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it
+// adds nvlink_topo_all_active, nvlink_topo_min_links and nvlink_topo_gpu_count to
+// telemetry; Critical bridges get nvlink_fabric_affected if the fabric is degraded.
+func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
+	hasBridge := false
+	for _, d := range devs {
+		if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" {
+			hasBridge = true
+			break
+		}
+	}
+	if !hasBridge {
+		return devs
+	}
+
+	topo, err := queryNVIDIANVLinkTopo()
+	if err != nil {
+		slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
+		return devs
+	}
+
+	for i := range devs {
+		if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" {
+			continue
+		}
+		if devs[i].Telemetry == nil {
+			devs[i].Telemetry = map[string]any{}
+		}
+		devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive
+		devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
+		devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount
+
+		// If the bridge is already Critical (degraded PCIe link) AND the fabric is
+		// also degraded (missing NVLink connections), flag the fabric as affected.
+		if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive {
+			devs[i].Telemetry["nvlink_fabric_affected"] = true
+		}
+	}
+
+	slog.Info("nvlink-bridge: topo cross-reference applied",
+		"gpu_count", topo.GPUCount,
+		"all_active", topo.AllActive,
+		"min_links", topo.MinNVLinks,
+	)
+	return devs
+}
+
+// nvlinkTopoResult summarises the GPU NVLink connectivity matrix.
+type nvlinkTopoResult struct {
+	GPUCount   int
+	AllActive  bool // true if every GPU pair has at least one NVLink bond
+	MinNVLinks int  // minimum NVLink bonds seen across any GPU pair (0 = some pair disconnected)
+}
+
+// queryNVIDIANVLinkTopo runs nvidia-smi topo -m and parses the NVLink matrix.
+func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
+	out, err := exec.Command("nvidia-smi", "topo", "-m").Output()
+	if err != nil {
+		return nvlinkTopoResult{}, err
+	}
+	return parseNVIDIATopologyMatrix(string(out)), nil
+}
+
+// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
+// nvidia-smi topo -m matrix.
+//
+// Format (abbreviated):
+//
+//	         GPU0  GPU1 ... NIC0 NIC1
+//	GPU0      X   NV18 ... NODE NODE
+//	GPU1     NV18   X  ... NODE NODE
+//	NIC0     NODE  NODE...   X   PIX
+//
+// The header row starts with "GPU0"; its columns may include non-GPU entries
+// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
+// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
+func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
+	lines := strings.Split(raw, "\n")
+
+	// Locate the header line and record which column indices are GPU columns.
+	headerIdx := -1
+	var gpuColIndices []int // 0-based indices within fields (excluding the row label)
+	var gpuCount int
+	for i, line := range lines {
+		trimmed := strings.TrimSpace(line)
+		if strings.HasPrefix(trimmed, "GPU0") {
+			parts := strings.Fields(trimmed)
+			for j, col := range parts {
+				if strings.HasPrefix(col, "GPU") {
+					gpuColIndices = append(gpuColIndices, j)
+				}
+			}
+			gpuCount = len(gpuColIndices)
+			if gpuCount >= 2 {
+				headerIdx = i
+			}
+			break
+		}
+	}
+	if headerIdx < 0 || gpuCount == 0 {
+		return nvlinkTopoResult{}
+	}
+
+	minLinks := -1 // -1 = no NV pair seen yet
+	allActive := true
+
+	for _, line := range lines[headerIdx+1:] {
+		trimmed := strings.TrimSpace(line)
+		if !strings.HasPrefix(trimmed, "GPU") {
+			continue
+		}
+		cells := strings.Fields(trimmed)
+		// cells[0] is the row label (e.g. "GPU0"); cells[1..] are column values.
+		// gpuColIndices are 0-based within the header fields, so they map to
+		// cells[idx+1] in the data rows (shift by 1 for the row label).
+		for _, colIdx := range gpuColIndices {
+			dataIdx := colIdx + 1
+			if dataIdx >= len(cells) {
+				continue
+			}
+			cell := cells[dataIdx]
+			m := nv5re.FindStringSubmatch(cell)
+			if len(m) != 2 {
+				continue
+			}
+			n, err := strconv.Atoi(m[1])
+			if err != nil {
+				continue
+			}
+			if n == 0 {
+				allActive = false
+			}
+			if minLinks < 0 || n < minLinks {
+				minLinks = n
+			}
+		}
+	}
+
+	if minLinks < 0 {
+		minLinks = 0
+	}
+
+	return nvlinkTopoResult{
+		GPUCount:   gpuCount,
+		AllActive:  allActive && minLinks > 0,
+		MinNVLinks: minLinks,
+	}
+}
--- a/audit/internal/collector/pcie_nvlink_bridge_test.go
+++ b/audit/internal/collector/pcie_nvlink_bridge_test.go
@@ -0,0 +1,124 @@
+package collector
+
+import (
+	"bee/audit/internal/schema"
+	"testing"
+)
+
+func TestParseNVIDIATopologyMatrix(t *testing.T) {
+	t.Parallel()
+
+	// Real-world B200 HGX output: 8 GPUs, all pairs connected via NV18.
+	input := `	GPU0	GPU1	GPU2	GPU3	GPU4	GPU5	GPU6	GPU7	NIC0	NIC1
+GPU0	 X 	NV18	NV18	NV18	NV18	NV18	NV18	NV18	NODE	NODE
+GPU1	NV18	 X 	NV18	NV18	NV18	NV18	NV18	NV18	NODE	NODE
+GPU2	NV18	NV18	 X 	NV18	NV18	NV18	NV18	NV18	NODE	NODE
+GPU3	NV18	NV18	NV18	 X 	NV18	NV18	NV18	NV18	NODE	NODE
+GPU4	NV18	NV18	NV18	NV18	 X 	NV18	NV18	NV18	SYS	SYS
+GPU5	NV18	NV18	NV18	NV18	NV18	 X 	NV18	NV18	SYS	SYS
+GPU6	NV18	NV18	NV18	NV18	NV18	NV18	 X 	NV18	SYS	SYS
+GPU7	NV18	NV18	NV18	NV18	NV18	NV18	NV18	 X 	SYS	SYS
+NIC0	NODE	NODE	NODE	NODE	SYS	SYS	SYS	SYS	 X 	PIX
+`
+	got := parseNVIDIATopologyMatrix(input)
+
+	if got.GPUCount != 8 {
+		t.Fatalf("GPUCount=%d want 8", got.GPUCount)
+	}
+	if !got.AllActive {
+		t.Fatalf("AllActive=false want true")
+	}
+	if got.MinNVLinks != 18 {
+		t.Fatalf("MinNVLinks=%d want 18", got.MinNVLinks)
+	}
+}
+
+func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
+	t.Parallel()
+
+	// GPU1-GPU3 pair shows NV12 (reduced) instead of NV18.
+	input := `	GPU0	GPU1	GPU2	GPU3
+GPU0	 X 	NV18	NV18	NV18
+GPU1	NV18	 X 	NV18	NV12
+GPU2	NV18	NV18	 X 	NV18
+GPU3	NV18	NV12	NV18	 X
+`
+	got := parseNVIDIATopologyMatrix(input)
+
+	if got.MinNVLinks != 12 {
+		t.Fatalf("MinNVLinks=%d want 12", got.MinNVLinks)
+	}
+	if !got.AllActive {
+		t.Fatalf("AllActive=false want true (12 links is still active)")
+	}
+}
+
+func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
+	t.Parallel()
+
+	// GPU0-GPU1 pair fully disconnected (NV0).
+	input := `	GPU0	GPU1
+GPU0	 X 	NV0
+GPU1	NV0	 X
+`
+	got := parseNVIDIATopologyMatrix(input)
+
+	if got.AllActive {
+		t.Fatalf("AllActive=true want false (NV0 means no links)")
+	}
+	if got.MinNVLinks != 0 {
+		t.Fatalf("MinNVLinks=%d want 0", got.MinNVLinks)
+	}
+}
+
+func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
+	t.Parallel()
+
+	got := parseNVIDIATopologyMatrix("no gpus here")
+	if got.GPUCount != 0 {
+		t.Fatalf("GPUCount=%d want 0", got.GPUCount)
+	}
+}
+
+func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
+	t.Parallel()
+
+	bridgeClass := "NVLinkBridge"
+	linkSpeed := "Gen3"
+	maxLinkSpeed := "Gen4"
+	dev := schema.HardwarePCIeDevice{}
+	dev.DeviceClass = &bridgeClass
+	dev.LinkSpeed = &linkSpeed
+	dev.MaxLinkSpeed = &maxLinkSpeed
+	s := statusOK
+	dev.Status = &s
+
+	applyPCIeLinkSpeedWarning(&dev)
+
+	if dev.Status == nil || *dev.Status != statusCritical {
+		t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
+	}
+	if dev.ErrorDescription == nil {
+		t.Fatal("ErrorDescription nil, want degradation message")
+	}
+}
+
+func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
+	t.Parallel()
+
+	regularClass := "NetworkController"
+	linkSpeed := "Gen3"
+	maxLinkSpeed := "Gen4"
+	dev := schema.HardwarePCIeDevice{}
+	dev.DeviceClass = &regularClass
+	dev.LinkSpeed = &linkSpeed
+	dev.MaxLinkSpeed = &maxLinkSpeed
+	s := statusOK
+	dev.Status = &s
+
+	applyPCIeLinkSpeedWarning(&dev)
+
+	if dev.Status == nil || *dev.Status != statusWarning {
+		t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
+	}
+}
--- a/audit/internal/collector/psu.go
+++ b/audit/internal/collector/psu.go
@@ -2,6 +2,8 @@ package collector

 import (
 	"bee/audit/internal/schema"
+	"bufio"
+	"context"
 	"log/slog"
 	"os/exec"
 	"regexp"
@@ -10,16 +12,29 @@ import (
 	"strings"
 )

-func collectPSUs() []schema.HardwarePowerSupply {
+func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
+	profile := selectIPMIProfile(manufacturer)
+
 	var psus []schema.HardwarePowerSupply
-	if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
-		psus = parseFRU(string(out))
+	fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
+	defer fruCancel()
+
+	if profile.fruEarlyExit {
+		psus = collectFRUEarlyExit(fruCtx)
 	} else {
-		slog.Info("psu: fru unavailable", "err", err)
+		cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
+		if out, err := cmd.Output(); err == nil {
+			psus = parseFRU(string(out))
+		} else {
+			slog.Info("psu: fru unavailable", "err", err)
+		}
 	}

 	sdrData := map[int]psuSDR{}
-	if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
+	sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
+	defer sdrCancel()
+	cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
+	if sdrOut, err := cmd.Output(); err == nil {
 		sdrData = parsePSUSDR(string(sdrOut))
 		if len(psus) == 0 {
 			psus = synthesizePSUsFromSDR(sdrData)
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
 		slog.Info("psu: ipmitool unavailable, skipping", "err", err)
 		return nil
 	}
-	slog.Info("psu: collected", "count", len(psus))
+	slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
+	return psus
+}
+
+// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
+// as soon as it has found all PSU blocks and the next block is not a PSU.
+// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
+func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
+	cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
+	pipe, err := cmd.StdoutPipe()
+	if err != nil {
+		slog.Info("psu: fru pipe unavailable", "err", err)
+		return nil
+	}
+	if err := cmd.Start(); err != nil {
+		slog.Info("psu: fru start failed", "err", err)
+		return nil
+	}
+
+	var psus []schema.HardwarePowerSupply
+	var currentBlock strings.Builder
+	slot := 0
+	psuFound := false
+	stoppedEarly := false
+
+	scanner := bufio.NewScanner(pipe)
+	for scanner.Scan() {
+		line := scanner.Text()
+
+		if strings.HasPrefix(line, "FRU Device Description") {
+			if currentBlock.Len() > 0 {
+				if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
+					psus = append(psus, psu)
+					psuFound = true
+					slot++
+				}
+				currentBlock.Reset()
+			}
+			// Stop once we've collected PSUs and hit a non-PSU block header.
+			if psuFound && !isPSUHeader(strings.ToLower(line)) {
+				stoppedEarly = true
+				break
+			}
+		}
+		currentBlock.WriteString(line)
+		currentBlock.WriteByte('\n')
+	}
+
+	if !stoppedEarly && currentBlock.Len() > 0 {
+		if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
+			psus = append(psus, psu)
+		}
+	}
+
+	// Kill the process immediately on early exit rather than waiting for context timeout.
+	if cmd.Process != nil {
+		cmd.Process.Kill() //nolint:errcheck
+	}
+	cmd.Wait() //nolint:errcheck
+	slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
 	return psus
 }

@@ -160,11 +234,57 @@ type psuSDR struct {
 }

 var psuSlotPatterns = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
+	// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
+	// does not fire after the digit; match explicitly with underscore terminator.
+	regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
+	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),                    // PSU1, PS1, ps 2
+	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),                      // PS 6, PS6
+	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),                     // PWS1
+	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
+	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),                     // Bay 1
+	// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
+	// Must be last: "power supply N" is already caught by the pattern above.
+	regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
+}
+
+// isPSUInputPower matches AC-input power sensor names across vendors:
+//   MSI:     PSU1_POWER_IN, PSU1_PIN
+//   MLT:     PSU1_PIN
+//   xFusion: (matched via default fallback — no explicit keyword)
+//   HPE:     PS1 Input Power, PS1 Input Watts
+func isPSUInputPower(name string) bool {
+	return strings.Contains(name, "input power") ||
+		strings.Contains(name, "input watts") ||
+		strings.Contains(name, "_pin") ||
+		strings.Contains(name, " pin") ||
+		strings.Contains(name, "_power_in") ||
+		strings.Contains(name, "power_in")
+}
+
+// isPSUOutputPower matches DC-output power sensor names across vendors:
+//   MSI:     PSU1_POWER_OUT
+//   MLT:     PSU1_POUT
+//   xFusion: PS1 POut
+func isPSUOutputPower(name string) bool {
+	return strings.Contains(name, "output power") ||
+		strings.Contains(name, "output watts") ||
+		strings.Contains(name, "_pout") ||
+		strings.Contains(name, " pout") ||
+		strings.Contains(name, "_power_out") ||
+		strings.Contains(name, "power_out") ||
+		strings.Contains(name, "power supply bay") ||
+		strings.Contains(name, "psu bay")
+}
+
+// parseBoundedFloat parses a numeric value from an SDR value field and
+// validates it is within (0, max]. Returns nil for zero, negative, or
+// out-of-range values — these indicate missing/off/fault sensor readings.
+func parseBoundedFloat(raw string, max float64) *float64 {
+	v := parseFloatPtr(raw)
+	if v == nil || *v <= 0 || *v > max {
+		return nil
+	}
+	return v
 }

 func parsePSUSDR(raw string) map[int]psuSDR {
@@ -194,24 +314,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {

 		lowerName := strings.ToLower(name)
 		switch {
-		case strings.Contains(lowerName, "input power"):
-			entry.inputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "output power"):
-			entry.outputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
-			entry.outputPowerW = parseFloatPtr(value)
+		case isPSUInputPower(lowerName):
+			entry.inputPowerW = parseBoundedFloat(value, 6000)
+		case isPSUOutputPower(lowerName):
+			entry.outputPowerW = parseBoundedFloat(value, 6000)
 		case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
 			entry.inputVoltage = parseFloatPtr(value)
 		case strings.Contains(lowerName, "temp"):
 			entry.temperatureC = parseFloatPtr(value)
 		case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
 			entry.healthPct = parsePercentPtr(value)
+		default:
+			// Generic PSU power reading: sensor matched a slot pattern but carries
+			// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
+			// AC input if the value looks like wattage and no better data is set yet.
+			if entry.inputPowerW == nil {
+				entry.inputPowerW = parseBoundedFloat(value, 6000)
+			}
 		}
 		out[slot] = entry
 	}
 	return out
 }

+// PSUSlotPower holds SDR power readings for one PSU slot.
+// Slot key used by PSUSlotsFromSDR is the 0-based index string,
+// matching HardwarePowerSupply.Slot in the audit schema.
+type PSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`
+	OutputW *float64 `json:"output_w,omitempty"`
+	Status  string   `json:"status,omitempty"`
+}
+
+// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
+// using the same battle-tested slot patterns as the hardware audit collector.
+// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
+// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
+func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
+	sdr := parsePSUSDR(sdrOutput)
+	if len(sdr) == 0 {
+		return nil
+	}
+	out := make(map[string]PSUSlotPower, len(sdr))
+	for slot, entry := range sdr {
+		key := strconv.Itoa(slot - 1) // audit uses 0-based slot
+		out[key] = PSUSlotPower{
+			InputW:  entry.inputPowerW,
+			OutputW: entry.outputPowerW,
+			Status:  entry.status,
+		}
+	}
+	return out
+}
+
 func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
 	if len(sdr) == 0 {
 		return nil
--- a/audit/internal/collector/psu_sdr_test.go
+++ b/audit/internal/collector/psu_sdr_test.go
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
 		{name: "PWS1 Status", want: 1},
 		{name: "Power Supply Bay 8", want: 8},
 		{name: "PS 6 Input Power", want: 6},
+		// MSI underscore format — \b does not fire between digit and '_'
+		{name: "PSU1_POWER_IN", want: 1},
+		{name: "PSU2_POWER_OUT", want: 2},
+		{name: "PSU4_STATUS", want: 4},
 	}

 	for _, tt := range tests {
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
 	}
 }

+func TestParsePSUSDRMSIFormat(t *testing.T) {
+	t.Parallel()
+	raw := `
+PSU1_STATUS      | F1h | ok
+PSU1_POWER_OUT   | 928 Watts | ok
+PSU1_POWER_IN    | 976 Watts | ok
+PSU2_STATUS      | F2h | ok
+PSU2_POWER_OUT   | 944 Watts | ok
+PSU2_POWER_IN    | 992 Watts | ok
+`
+	got := parsePSUSDR(raw)
+	if len(got) != 2 {
+		t.Fatalf("len(got)=%d want 2", len(got))
+	}
+	if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
+		t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
+	}
+	if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
+		t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
+	}
+	if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
+		t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
+	}
+}
+
 func TestSynthesizePSUsFromSDR(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/collector/raid.go
+++ b/audit/internal/collector/raid.go
@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
 	return arrays
 }

+// collectVROCLicense runs `mdadm --detail-platform` through raidToolQuery and returns the parsed License value.
+// It returns nil when pcie has no VROC controller, when the tool call fails, or when no license line is found.
+func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
+	if !hasVROCController(pcie) {
+		return nil
+	}
+	out, err := raidToolQuery("mdadm", "--detail-platform")
+	if err != nil {
+		slog.Info("vroc: mdadm --detail-platform unavailable", "err", err) // best effort: logged at info level, not propagated
+		return nil
+	}
+	return parseMDAdmPlatformLicense(string(out))
+}
+
+func parseMDAdmPlatformLicense(raw string) *string { // returns the first non-empty value of a line starting with "license", lower-cased; nil if none
+	for _, line := range strings.Split(raw, "\n") {
+		trimmed := strings.TrimSpace(line)
+		if !strings.HasPrefix(strings.ToLower(trimmed), "license") { // case-insensitive; any key that merely begins with "license" matches
+			continue
+		}
+		if idx := strings.Index(trimmed, ":"); idx >= 0 { // the value is everything after the first colon
+			val := strings.TrimSpace(trimmed[idx+1:])
+			if val != "" { // an empty value does not end the search; later lines are still considered
+				v := strings.ToLower(val)
+				return &v
+			}
+		}
+	}
+	return nil
+}
+
 func queryDeviceSerial(devPath string) string {
 	if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
 		var ctrl nvmeIDCtrl
--- a/audit/internal/collector/sensors.go
+++ b/audit/internal/collector/sensors.go
@@ -58,7 +58,6 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {

 	for _, chip := range chips {
 		features := doc[chip]
-		location := sensorLocation(chip)

 		keys := make([]string, 0, len(features))
 		for key := range features {
@@ -80,25 +79,25 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
 			}
 			switch classifySensorFeature(feature) {
 			case "fan":
-				item := buildFanSensor(name, location, feature)
+				item := buildFanSensor(name, feature)
 				if item == nil || duplicateSensor(seen, "fan", item.Name) {
 					continue
 				}
 				result.Fans = append(result.Fans, *item)
 			case "temp":
-				item := buildTempSensor(name, location, feature)
+				item := buildTempSensor(name, feature)
 				if item == nil || duplicateSensor(seen, "temp", item.Name) {
 					continue
 				}
 				result.Temperatures = append(result.Temperatures, *item)
 			case "power":
-				item := buildPowerSensor(name, location, feature)
+				item := buildPowerSensor(name, feature)
 				if item == nil || duplicateSensor(seen, "power", item.Name) {
 					continue
 				}
 				result.Power = append(result.Power, *item)
 			default:
-				item := buildOtherSensor(name, location, feature)
+				item := buildOtherSensor(name, feature)
 				if item == nil || duplicateSensor(seen, "other", item.Name) {
 					continue
 				}
@@ -128,14 +127,6 @@ func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
 	return false
 }

-func sensorLocation(chip string) *string {
-	chip = strings.TrimSpace(chip)
-	if chip == "" {
-		return nil
-	}
-	return &chip
-}
-
 func classifySensorFeature(feature map[string]any) string {
 	for key := range feature {
 		switch {
@@ -154,24 +145,24 @@ func classifySensorFeature(feature map[string]any) string {
 	return "other"
 }

-func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
+func buildFanSensor(name string, feature map[string]any) *schema.HardwareFanSensor {
 	rpm, ok := firstFeatureInt(feature, "_input")
 	if !ok {
 		return nil
 	}
-	item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
+	item := &schema.HardwareFanSensor{Name: name, RPM: &rpm}
 	if status := sensorStatusFromFeature(feature); status != nil {
 		item.Status = status
 	}
 	return item
 }

-func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
+func buildTempSensor(name string, feature map[string]any) *schema.HardwareTemperatureSensor {
 	celsius, ok := firstFeatureFloat(feature, "_input")
 	if !ok {
 		return nil
 	}
-	item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
+	item := &schema.HardwareTemperatureSensor{Name: name, Celsius: &celsius}
 	if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
 		item.ThresholdWarningCelsius = &warning
 	}
@@ -186,8 +177,8 @@ func buildTempSensor(name string, location *string, feature map[string]any) *sch
 	return item
 }

-func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
-	item := &schema.HardwarePowerSensor{Name: name, Location: location}
+func buildPowerSensor(name string, feature map[string]any) *schema.HardwarePowerSensor {
+	item := &schema.HardwarePowerSensor{Name: name}
 	if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
 		item.PowerW = &v
 	}
@@ -206,12 +197,12 @@ func buildPowerSensor(name string, location *string, feature map[string]any) *sc
 	return item
 }

-func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
+func buildOtherSensor(name string, feature map[string]any) *schema.HardwareOtherSensor {
 	value, unit, ok := firstGenericSensorValue(feature)
 	if !ok {
 		return nil
 	}
-	item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
+	item := &schema.HardwareOtherSensor{Name: name, Value: &value}
 	if unit != "" {
 		item.Unit = &unit
 	}
--- a/audit/internal/collector/storage.go
+++ b/audit/internal/collector/storage.go
@@ -4,12 +4,70 @@ import (
 	"bee/audit/internal/schema"
 	"encoding/json"
 	"log/slog"
+	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"strconv"
 	"strings"
 )

+var ( // sysfs paths and the write/exec/glob hooks are variables so tests can redirect them; the regexes are compiled once at package init
+	pciRescanPath      = "/sys/bus/pci/rescan"
+	scsiHostScanGlob   = "/sys/class/scsi_host/host*/scan"
+	hotplugWriteFile   = os.WriteFile
+	hotplugExecCommand = exec.Command
+	hotplugGlob        = filepath.Glob
+	nvmeLBAFCompactRE  = regexp.MustCompile(`(?im)^\s*lbaf\s+\d+\s*:\s*ms:(\d+)\s+lbads:(\d+).*?\(in use\)\s*$`)
+	nvmeLBAFVerboseRE  = regexp.MustCompile(`(?im)^\s*LBA Format\s+\d+\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*?\(in use\)\s*$`)
+	sgReadcapBlockRE   = regexp.MustCompile(`(?im)logical block length\s*=\s*(\d+)\s+bytes`)
+	sgReadcapProtRE    = regexp.MustCompile(`(?im)prot_en\s*=\s*1`)
+)
+
+func bestEffortRescanHotplugStorage() { // triggers a PCI rescan and per-host SCSI scans, then runs udevadm settle; every failure is only logged
+	if err := hotplugWriteFile(pciRescanPath, []byte("1\n"), 0644); err != nil {
+		slog.Info("storage: pci rescan skipped", "path", pciRescanPath, "err", err)
+	} else {
+		slog.Info("storage: triggered pci rescan for hotplug discovery")
+	}
+
+	hostPaths, err := hotplugGlob(scsiHostScanGlob)
+	if err != nil {
+		slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
+	} else {
+		for _, path := range hostPaths {
+			// SAS HBAs (e.g. smartpqi) block indefinitely in sas_user_scan when
+			// written to — SAS topology is discovered by the driver itself.
+			// Detect via two methods: (1) sas_host class registration, and
+			// (2) driver proc_name — smartpqi uses scsi_transport_sas but does
+			// not register a sas_host object, so (1) alone misses it.
+			host := filepath.Base(filepath.Dir(path)) // ".../host3/scan" -> "host3"
+			if _, err := os.Stat("/sys/class/sas_host/" + host); err == nil { // NOTE(review): fixed /sys path, not derived from scsiHostScanGlob, so tests cannot redirect it
+				slog.Info("storage: scsi host scan skipped (SAS host)", "path", path)
+				continue
+			}
+			if procName, err := os.ReadFile("/sys/class/scsi_host/" + host + "/proc_name"); err == nil { // an unreadable proc_name falls through to the scan write
+				switch strings.TrimSpace(string(procName)) {
+				case "smartpqi", "hpsa":
+					slog.Info("storage: scsi host scan skipped (SAS transport driver)",
+						"path", path, "driver", strings.TrimSpace(string(procName)))
+					continue
+				}
+			}
+			if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
+				slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
+				continue // one failing host does not stop the remaining hosts from being scanned
+			}
+			slog.Info("storage: triggered scsi host scan", "path", path)
+		}
+	}
+
+	out, err := hotplugExecCommand("udevadm", "settle", "--timeout=10").CombinedOutput() // runs even when the rescans above were skipped
+	if err != nil {
+		slog.Info("storage: udev settle after hotplug rescan failed", "err", err, "output", strings.TrimSpace(string(out)))
+	}
+}
+
 func collectStorage() []schema.HardwareStorage {
 	devs := discoverStorageDevices()
 	result := make([]schema.HardwareStorage, 0, len(devs))
@@ -26,15 +84,41 @@ func collectStorage() []schema.HardwareStorage {
 	return result
 }

+// jsonInt64 accepts both a bare JSON number and a JSON-quoted number string.
+// lsblk -J emits LOG-SEC / PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12)
+// but older versions emit them as strings. This type handles both.
+type jsonInt64 int64
+
+func (j *jsonInt64) UnmarshalJSON(data []byte) error { // always returns nil, so an odd value never fails the enclosing document
+	// bare number: 512 (JSON null also succeeds here as a no-op decode, so *j becomes 0)
+	var n int64
+	if err := json.Unmarshal(data, &n); err == nil {
+		*j = jsonInt64(n)
+		return nil
+	}
+	// quoted string: "512" (surrounding whitespace inside the quotes is tolerated)
+	var s string
+	if err := json.Unmarshal(data, &s); err == nil {
+		n, err := strconv.ParseInt(strings.TrimSpace(s), 10, 64)
+		if err == nil {
+			*j = jsonInt64(n)
+		}
+		return nil // a non-numeric string is ignored and *j keeps its previous value
+	}
+	return nil // anything else (fraction, bool, object, array) — *j keeps its previous value
+}
+
 // lsblkDevice is a minimal lsblk JSON record.
 type lsblkDevice struct {
-	Name   string `json:"name"`
-	Type   string `json:"type"`
-	Size   string `json:"size"`
-	Serial string `json:"serial"`
-	Model  string `json:"model"`
-	Tran   string `json:"tran"`
-	Hctl   string `json:"hctl"`
+	Name   string   `json:"name"`
+	Type   string   `json:"type"`
+	Size   string   `json:"size"`
+	Serial string   `json:"serial"`
+	Model  string   `json:"model"`
+	Tran   string   `json:"tran"`
+	Hctl   string   `json:"hctl"`
+	LogSec jsonInt64 `json:"log-sec"`
+	PhySec jsonInt64 `json:"phy-sec"`
 }

 type lsblkRoot struct {
@@ -101,7 +185,7 @@ func isVirtualHDiskModel(model string) bool {

 func lsblkDevices() []lsblkDevice {
 	out, err := exec.Command("lsblk", "-J", "-d",
-		"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
+		"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL,LOG-SEC,PHY-SEC").Output()
 	if err != nil {
 		slog.Warn("storage: lsblk failed", "err", err)
 		return nil
@@ -208,6 +292,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
 	present := true
 	s := schema.HardwareStorage{Present: &present}
 	s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
+	applyStorageBlockGeometry(&s, dev)

 	tran := strings.ToLower(dev.Tran)
 	devPath := "/dev/" + dev.Name
@@ -250,6 +335,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
 	}

 	var info smartctlInfo
+	var raw map[string]any
+	_ = json.Unmarshal(out, &raw)
 	if err := json.Unmarshal(out, &info); err == nil {
 		if v := cleanDMIValue(info.ModelName); v != "" {
 			s.Model = &v
@@ -302,8 +389,11 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
 				value := float64(attr.Raw.Value)
 				s.LifeRemainingPct = &value
 			case 241:
-				value := attr.Raw.Value
+				value := smartLBAsToBytes(attr.Raw.Value)
 				s.WrittenBytes = &value
+			case 242:
+				value := smartLBAsToBytes(attr.Raw.Value)
+				s.ReadBytes = &value
 			case 197:
 				pending = attr.Raw.Value
 				s.CurrentPendingSectors = &pending
@@ -321,6 +411,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
 			offlineUncorrectable: uncorrectable,
 			lifeRemainingPct:     lifeRemaining,
 		}
+		applySCSISmartctlTelemetry(&s, raw, &status)
+		applySCSIProtectionBlockGeometry(&s, devPath)
 		setStorageHealthStatus(&s, status)
 		return s
 	}
@@ -332,20 +424,23 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
 }

 // nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
+// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"),
+// so all numeric fields use jsonInt64 which accepts both bare numbers and
+// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose.
 type nvmeSmartLog struct {
-	CriticalWarning  int   `json:"critical_warning"`
-	PercentageUsed   int   `json:"percentage_used"`
-	AvailableSpare   int   `json:"available_spare"`
-	SpareThreshold   int   `json:"spare_thresh"`
-	Temperature      int64 `json:"temperature"`
-	PowerOnHours     int64 `json:"power_on_hours"`
-	PowerCycles      int64 `json:"power_cycles"`
-	UnsafeShutdowns  int64 `json:"unsafe_shutdowns"`
-	DataUnitsRead    int64 `json:"data_units_read"`
-	DataUnitsWritten int64 `json:"data_units_written"`
-	ControllerBusy   int64 `json:"controller_busy_time"`
-	MediaErrors      int64 `json:"media_errors"`
-	NumErrLogEntries int64 `json:"num_err_log_entries"`
+	CriticalWarning  jsonInt64 `json:"critical_warning"`
+	PercentageUsed   jsonInt64 `json:"percent_used"`
+	AvailableSpare   jsonInt64 `json:"avail_spare"`
+	SpareThreshold   jsonInt64 `json:"spare_thresh"`
+	Temperature      jsonInt64 `json:"temperature"`
+	PowerOnHours     jsonInt64 `json:"power_on_hours"`
+	PowerCycles      jsonInt64 `json:"power_cycles"`
+	UnsafeShutdowns  jsonInt64 `json:"unsafe_shutdowns"`
+	DataUnitsRead    jsonInt64 `json:"data_units_read"`
+	DataUnitsWritten jsonInt64 `json:"data_units_written"`
+	ControllerBusy   jsonInt64 `json:"controller_busy_time"`
+	MediaErrors      jsonInt64 `json:"media_errors"`
+	NumErrLogEntries jsonInt64 `json:"num_err_log_entries"`
 }

 // nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
@@ -368,6 +463,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
 		Interface:               &iface,
 		Telemetry:               map[string]any{"linux_device": "/dev/" + dev.Name},
 	}
+	applyStorageBlockGeometry(&s, dev)

 	devPath := "/dev/" + dev.Name
 	if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
@@ -402,19 +498,23 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
 			}
 		}
 	}
+	applyNVMeBlockGeometry(&s, devPath)

 	// smart-log: wear telemetry
 	if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
 		var log nvmeSmartLog
 		if json.Unmarshal(out, &log) == nil {
 			if log.PowerOnHours > 0 {
-				s.PowerOnHours = &log.PowerOnHours
+				v := int64(log.PowerOnHours)
+				s.PowerOnHours = &v
 			}
 			if log.PowerCycles > 0 {
-				s.PowerCycles = &log.PowerCycles
+				v := int64(log.PowerCycles)
+				s.PowerCycles = &v
 			}
 			if log.UnsafeShutdowns > 0 {
-				s.UnsafeShutdowns = &log.UnsafeShutdowns
+				v := int64(log.UnsafeShutdowns)
+				s.UnsafeShutdowns = &v
 			}
 			if log.PercentageUsed > 0 {
 				v := float64(log.PercentageUsed)
@@ -423,11 +523,11 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
 				s.LifeRemainingPct = &remaining
 			}
 			if log.DataUnitsWritten > 0 {
-				v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
+				v := nvmeDataUnitsToBytes(int64(log.DataUnitsWritten))
 				s.WrittenBytes = &v
 			}
 			if log.DataUnitsRead > 0 {
-				v := nvmeDataUnitsToBytes(log.DataUnitsRead)
+				v := nvmeDataUnitsToBytes(int64(log.DataUnitsRead))
 				s.ReadBytes = &v
 			}
 			if log.AvailableSpare > 0 {
@@ -435,23 +535,25 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
 				s.AvailableSparePct = &v
 			}
 			if log.MediaErrors > 0 {
-				s.MediaErrors = &log.MediaErrors
+				v := int64(log.MediaErrors)
+				s.MediaErrors = &v
 			}
 			if log.NumErrLogEntries > 0 {
-				s.ErrorLogEntries = &log.NumErrLogEntries
+				v := int64(log.NumErrLogEntries)
+				s.ErrorLogEntries = &v
 			}
 			if log.Temperature > 0 {
 				v := float64(log.Temperature - 273)
 				s.TemperatureC = &v
 			}
 			setStorageHealthStatus(&s, storageHealthStatus{
-				criticalWarning: log.CriticalWarning,
+				criticalWarning: int(log.CriticalWarning),
 				percentageUsed:  int64(log.PercentageUsed),
 				availableSpare:  int64(log.AvailableSpare),
 				spareThreshold:  int64(log.SpareThreshold),
-				unsafeShutdowns: log.UnsafeShutdowns,
-				mediaErrors:     log.MediaErrors,
-				errorLogEntries: log.NumErrLogEntries,
+				unsafeShutdowns: int64(log.UnsafeShutdowns),
+				mediaErrors:     int64(log.MediaErrors),
+				errorLogEntries: int64(log.NumErrLogEntries),
 			})
 			return s
 		}
@@ -477,6 +579,251 @@ func nvmeDataUnitsToBytes(units int64) int64 {
 	return units * 512000
 }

+func smartLBAsToBytes(lbas int64) int64 { // converts a SMART LBA counter to bytes; non-positive input yields 0
+	if lbas <= 0 {
+		return 0
+	}
+	return lbas * 512 // assumes the counter is in 512-byte units — NOTE(review): not checked against the drive's block size; confirm per vendor
+}
+
+func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, status *storageHealthStatus) { // fills still-unset fields of s from smartctl's generically decoded JSON
+	if s == nil || len(raw) == 0 {
+		return
+	}
+	if v, ok := firstInt64(raw, // candidates are tried in order; the first one that resolves to an integer wins
+		"path:power_on_time.hours",
+		"path:accumulated_power_on_time.hours",
+		"path:power_on_time.hour",
+		"path:accumulated_power_on_time.hour",
+	); ok && v > 0 && s.PowerOnHours == nil {
+		s.PowerOnHours = &v
+	}
+	if v, ok := firstInt64(raw,
+		"path:power_cycle_count",
+		"path:start_stop_cycle_count",
+		"path:accumulated_start_stop_cycles",
+	); ok && v > 0 && s.PowerCycles == nil {
+		s.PowerCycles = &v
+	}
+	if v, ok := firstInt64(raw,
+		"path:scsi_grown_defect_list",
+		"path:grown_defect_list",
+	); ok && v > 0 && s.ReallocatedSectors == nil { // the grown defect count is reported through the reallocated-sectors field
+		s.ReallocatedSectors = &v
+		if status != nil && status.reallocatedSectors == 0 { // also feed the health evaluation unless it already has a count
+			status.reallocatedSectors = v
+		}
+	}
+	if v, ok := firstInt64(raw,
+		"path:percentage_used_endurance_indicator",
+		"path:scsi_percentage_used_endurance_indicator",
+	); ok && v > 0 {
+		if s.LifeUsedPct == nil {
+			fv := float64(v)
+			s.LifeUsedPct = &fv
+		}
+		if s.LifeRemainingPct == nil && v <= 100 { // above 100 the remainder would be negative, so none is derived
+			remaining := float64(100 - v)
+			s.LifeRemainingPct = &remaining
+			if status != nil && status.lifeRemainingPct == 0 {
+				status.lifeRemainingPct = int64(remaining)
+			}
+		}
+	}
+	blockSize, hasBlockSize := firstInt64(raw,
+		"path:logical_block_size",
+		"path:block_size",
+		"path:user_capacity.block_size",
+	)
+	if hasBlockSize && blockSize > 0 { // the byte counters below are only derived when smartctl reported a block size
+		if s.LogicalBlockSizeBytes == nil {
+			s.LogicalBlockSizeBytes = &blockSize
+		}
+		if s.MetadataBytesPerBlock == nil {
+			zero := int64(0) // default to no per-block metadata unless a value was already recorded
+			s.MetadataBytesPerBlock = &zero
+		}
+		if s.Telemetry == nil {
+			s.Telemetry = map[string]any{}
+		}
+		s.Telemetry["logical_block_size_bytes"] = *s.LogicalBlockSizeBytes // mirrors what s now holds, which may predate this call
+		s.Telemetry["metadata_bytes_per_block"] = *s.MetadataBytesPerBlock
+		s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
+		if v, ok := firstInt64(raw,
+			"path:logical_blocks_written",
+			"path:total_lbas_written",
+		); ok && v > 0 && s.WrittenBytes == nil {
+			bytes := v * blockSize // uses smartctl's block size even if s already carried a different one
+			s.WrittenBytes = &bytes
+		}
+		if v, ok := firstInt64(raw,
+			"path:logical_blocks_read",
+			"path:total_lbas_read",
+		); ok && v > 0 && s.ReadBytes == nil {
+			bytes := v * blockSize
+			s.ReadBytes = &bytes
+		}
+	}
+}
+
+func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) { // seeds block-size fields and telemetry on s from lsblk's LOG-SEC / PHY-SEC
+	if s == nil {
+		return
+	}
+	logical := int64(dev.LogSec)
+	physical := int64(dev.PhySec)
+	if logical <= 0 && physical <= 0 { // lsblk reported neither size: leave s untouched
+		return
+	}
+	if s.Telemetry == nil {
+		s.Telemetry = map[string]any{}
+	}
+	if logical > 0 {
+		s.LogicalBlockSizeBytes = &logical
+		s.Telemetry["logical_block_size_bytes"] = logical
+		if s.MetadataBytesPerBlock == nil {
+			zero := int64(0) // lsblk carries no metadata size, so 0 is assumed until something else sets it
+			s.MetadataBytesPerBlock = &zero
+			s.Telemetry["metadata_bytes_per_block"] = zero
+		}
+	}
+	if physical > 0 {
+		s.PhysicalBlockSizeBytes = &physical
+		s.Telemetry["physical_block_size_bytes"] = physical
+	}
+	if s.LogicalBlockSizeBytes != nil && s.MetadataBytesPerBlock != nil { // may also hold values that were set before this call
+		s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
+	}
+}
+
+func applyNVMeBlockGeometry(s *schema.HardwareStorage, devPath string) { // overrides s's block geometry with the in-use LBA format from `nvme id-ns -H`
+	if s == nil || strings.TrimSpace(devPath) == "" {
+		return
+	}
+	out, err := exec.Command("nvme", "id-ns", devPath, "-H").CombinedOutput() // stdout and stderr are parsed together
+	if err != nil {
+		return // best effort: a missing tool or failing command leaves s unchanged, without logging
+	}
+	dataBytes, metadataBytes, ok := parseNVMeBlockFormat(string(out))
+	if !ok {
+		return
+	}
+	setStorageBlockGeometry(s, dataBytes, metadataBytes)
+}
+
+func applySCSIProtectionBlockGeometry(s *schema.HardwareStorage, devPath string) { // overrides s's block geometry with what `sg_readcap -l` reports
+	if s == nil || strings.TrimSpace(devPath) == "" {
+		return
+	}
+	out, err := exec.Command("sg_readcap", "-l", devPath).CombinedOutput() // stdout and stderr are parsed together
+	if err != nil {
+		return // best effort: a missing tool or failing command leaves s unchanged, without logging
+	}
+	dataBytes, metadataBytes, ok := parseSCSIBlockFormat(string(out))
+	if !ok {
+		return
+	}
+	setStorageBlockGeometry(s, dataBytes, metadataBytes)
+}
+
+func setStorageBlockGeometry(s *schema.HardwareStorage, dataBytes, metadataBytes int64) { // unconditionally overwrites the logical/metadata sizes and their telemetry mirrors
+	if s == nil || dataBytes <= 0 || metadataBytes < 0 { // zero metadata is valid; a zero or negative data size is not
+		return
+	}
+	if s.Telemetry == nil {
+		s.Telemetry = map[string]any{}
+	}
+	s.LogicalBlockSizeBytes = &dataBytes
+	s.MetadataBytesPerBlock = &metadataBytes
+	s.Telemetry["logical_block_size_bytes"] = dataBytes
+	s.Telemetry["metadata_bytes_per_block"] = metadataBytes
+	s.Telemetry["block_format"] = formatBlockFormat(dataBytes, metadataBytes)
+}
+
+func formatBlockFormat(dataBytes, metadataBytes int64) string { // renders "<data>+<metadata>" in decimal, e.g. 512 and 8 -> "512+8"
+	return strconv.FormatInt(dataBytes, 10) + "+" + strconv.FormatInt(metadataBytes, 10)
+}
+
+func parseNVMeBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) { // extracts data and metadata sizes of the LBA format marked "(in use)"
+	if m := nvmeLBAFCompactRE.FindStringSubmatch(raw); len(m) == 3 { // compact form: "lbaf N : ms:<bytes> lbads:<log2 size> ... (in use)"
+		ms, errMS := strconv.ParseInt(m[1], 10, 64)
+		lbads, errLBADS := strconv.ParseInt(m[2], 10, 64)
+		if errMS == nil && errLBADS == nil && lbads >= 0 && lbads < 63 { // the bound keeps the shift below inside a positive int64
+			return 1 << lbads, ms, true // lbads is an exponent: data size is 2^lbads bytes
+		}
+	}
+	if m := nvmeLBAFVerboseRE.FindStringSubmatch(raw); len(m) == 3 { // verbose form: "LBA Format N : Metadata Size: X bytes - Data Size: Y bytes ... (in use)"
+		ms, errMS := strconv.ParseInt(m[1], 10, 64)
+		ds, errDS := strconv.ParseInt(m[2], 10, 64)
+		if errMS == nil && errDS == nil && ds > 0 {
+			return ds, ms, true
+		}
+	}
+	return 0, 0, false // neither layout had a usable "(in use)" line
+}
+
+func parseSCSIBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) { // reads the logical block length and protection flag from sg_readcap text
+	m := sgReadcapBlockRE.FindStringSubmatch(raw)
+	if len(m) != 2 { // no "Logical block length=<n> bytes" entry: nothing can be reported
+		return 0, 0, false
+	}
+	blockBytes, err := strconv.ParseInt(m[1], 10, 64)
+	if err != nil || blockBytes <= 0 {
+		return 0, 0, false
+	}
+	if sgReadcapProtRE.MatchString(raw) { // matches "prot_en=1" anywhere in the output
+		return blockBytes, 8, true // fixed 8 metadata bytes when protection is on — NOTE(review): presumably the protection-information size; confirm
+	}
+	return blockBytes, 0, true
+}
+
+func firstInt64(root map[string]any, candidates ...string) (int64, bool) { // returns the first candidate path that resolves to an integer under root
+	for _, candidate := range candidates {
+		if !strings.HasPrefix(candidate, "path:") { // candidates without the "path:" prefix are silently skipped
+			continue
+		}
+		path := strings.TrimPrefix(candidate, "path:")
+		if v, ok := nestedInt64(root, strings.Split(path, ".")); ok { // dots separate nested object keys, e.g. "power_on_time.hours"
+			return v, true
+		}
+	}
+	return 0, false
+}
+
+func nestedInt64(root map[string]any, path []string) (int64, bool) { // walks nested maps along path and converts the leaf to int64
+	var current any = root
+	for _, key := range path {
+		obj, ok := current.(map[string]any)
+		if !ok { // an intermediate value is not an object: the path cannot be followed
+			return 0, false
+		}
+		current, ok = obj[key]
+		if !ok {
+			return 0, false
+		}
+	}
+	switch v := current.(type) {
+	case float64: // the type encoding/json uses for numbers decoded into any
+		return int64(v), true // any fractional part is truncated
+	case float32:
+		return int64(v), true
+	case int:
+		return int64(v), true
+	case int64:
+		return v, true
+	case int32:
+		return int64(v), true
+	case json.Number:
+		n, err := v.Int64()
+		return n, err == nil
+	case string: // numeric strings are accepted after trimming surrounding whitespace
+		n, err := strconv.ParseInt(strings.TrimSpace(v), 10, 64)
+		return n, err == nil
+	default: // nil, bool, nested object or array: not an integer
+		return 0, false
+	}
+}
+
 type storageHealthStatus struct {
 	hasOverall           bool
 	overallPassed        bool
--- a/audit/internal/collector/storage_block_format_test.go
+++ b/audit/internal/collector/storage_block_format_test.go
@@ -0,0 +1,69 @@
+package collector
+
+import "testing"
+
+func TestParseNVMeBlockFormatCompact(t *testing.T) { // compact "lbaf N : ms:.. lbads:.." layout: only the "(in use)" line counts
+	t.Parallel()
+
+	raw := `
+lbaf  0 : ms:0   lbads:9  rp:0x2 (in use)
+lbaf  1 : ms:8   lbads:9  rp:0x1
+`
+	dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
+	if !ok {
+		t.Fatal("parseNVMeBlockFormat returned ok=false")
+	}
+	if dataBytes != 512 || metadataBytes != 0 { // lbads:9 -> 2^9 = 512 data bytes; ms:0 -> no metadata
+		t.Fatalf("got %d+%d want 512+0", dataBytes, metadataBytes)
+	}
+}
+
+func TestParseNVMeBlockFormatVerbose(t *testing.T) { // verbose "LBA Format N : Metadata Size ... Data Size ..." layout
+	t.Parallel()
+
+	raw := `
+LBA Format 0 : Metadata Size: 8 bytes - Data Size: 512 bytes - Relative Performance: 0 Better (in use)
+LBA Format 1 : Metadata Size: 0 bytes - Data Size: 4096 bytes - Relative Performance: 1 Best
+`
+	dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
+	if !ok {
+		t.Fatal("parseNVMeBlockFormat returned ok=false")
+	}
+	if dataBytes != 512 || metadataBytes != 8 { // format 0 is the one in use; format 1 must be ignored
+		t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
+	}
+}
+
+func TestParseSCSIBlockFormatWithProtection(t *testing.T) { // prot_en=1 must add 8 metadata bytes to the reported block length
+	t.Parallel()
+
+	raw := `
+Read Capacity results:
+   Protection: prot_en=1, p_type=1, p_i_exponent=0
+   Logical block length=512 bytes
+`
+	dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
+	if !ok {
+		t.Fatal("parseSCSIBlockFormat returned ok=false")
+	}
+	if dataBytes != 512 || metadataBytes != 8 {
+		t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
+	}
+}
+
+func TestParseSCSIBlockFormatWithoutProtection(t *testing.T) { // prot_en=0 must report zero metadata bytes
+	t.Parallel()
+
+	raw := `
+Read Capacity results:
+   Protection: prot_en=0, p_type=0, p_i_exponent=0
+   Logical block length=4096 bytes
+`
+	dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
+	if !ok {
+		t.Fatal("parseSCSIBlockFormat returned ok=false")
+	}
+	if dataBytes != 4096 || metadataBytes != 0 {
+		t.Fatalf("got %d+%d want 4096+0", dataBytes, metadataBytes)
+	}
+}
--- a/audit/internal/collector/storage_discovery_test.go
+++ b/audit/internal/collector/storage_discovery_test.go
@@ -1,6 +1,13 @@
 package collector

-import "testing"
+import (
+	"encoding/json"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+)

 func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
 	t.Parallel()
@@ -31,3 +38,130 @@ func TestParseStorageBytes(t *testing.T) {
 		t.Fatalf("parseStorageBytes invalid=%d want 0", got)
 	}
 }
+
+func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
+	t.Parallel()
+
+	// util-linux ≥ 2.37 emits LOG-SEC / PHY-SEC as bare JSON numbers.
+	// Older versions emit quoted strings. Both must parse without error
+	// so that the entire lsblkDevices() call does not return nil on Debian 12.
+	cases := []struct {
+		json string
+		want int64
+	}{
+		{`512`, 512},
+		{`4096`, 4096},
+		{`"512"`, 512},
+		{`"4096"`, 4096},
+		{`null`, 0}, // null must be accepted and leave the value at zero
+	}
+	for _, tc := range cases {
+		var v jsonInt64 // fresh zero value per case, so results cannot leak between cases
+		if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
+			t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
+		}
+		if int64(v) != tc.want {
+			t.Fatalf("UnmarshalJSON(%s)=%d want %d", tc.json, int64(v), tc.want)
+		}
+	}
+
+	// Simulate the exact JSON shape that triggered the bug on Debian 12:
+	// a whole lsblk document whose log-sec / phy-sec fields are bare integers.
+	input := []byte(`{
+		"blockdevices": [
+			{"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096},
+			{"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096}
+		]
+	}`)
+	var root lsblkRoot
+	if err := json.Unmarshal(input, &root); err != nil {
+		t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err)
+	}
+	if len(root.Blockdevices) != 2 {
+		t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices))
+	}
+	if int64(root.Blockdevices[0].LogSec) != 512 {
+		t.Fatalf("LogSec=%d want 512", root.Blockdevices[0].LogSec)
+	}
+	if int64(root.Blockdevices[0].PhySec) != 4096 {
+		t.Fatalf("PhySec=%d want 4096", root.Blockdevices[0].PhySec)
+	}
+}
+
+func TestBestEffortRescanHotplugStorage(t *testing.T) {
+	// Deliberately not parallel: the test rewires package-level hooks and PATH.
+
+	tmp := t.TempDir()
+	rescanPath := filepath.Join(tmp, "pci-rescan")
+	scanDir := filepath.Join(tmp, "scsi_host")
+	host0Path := filepath.Join(scanDir, "host0", "scan")
+	host1Path := filepath.Join(scanDir, "host1", "scan")
+	argsPath := filepath.Join(tmp, "udevadm-args")
+	toolPath := filepath.Join(tmp, "udevadm")
+	if err := os.MkdirAll(filepath.Dir(host0Path), 0755); err != nil {
+		t.Fatalf("mkdir host0: %v", err)
+	}
+	if err := os.MkdirAll(filepath.Dir(host1Path), 0755); err != nil {
+		t.Fatalf("mkdir host1: %v", err)
+	}
+	if err := os.WriteFile(host0Path, nil, 0644); err != nil {
+		t.Fatalf("touch host0 scan: %v", err)
+	}
+	if err := os.WriteFile(host1Path, nil, 0644); err != nil {
+		t.Fatalf("touch host1 scan: %v", err)
+	}
+	script := "#!/bin/sh\nprintf '%s' \"$*\" > \"" + argsPath + "\"\n" // stub udevadm that records its arguments
+	if err := os.WriteFile(toolPath, []byte(script), 0755); err != nil {
+		t.Fatalf("write udevadm stub: %v", err)
+	}
+
+	// Put the stub directory first on PATH so exec.Command("udevadm") resolves
+	// to the script above. t.Setenv restores the previous value when the test
+	// ends and panics if the test (or an ancestor) is parallel, which keeps
+	// this process-wide mutation from racing with other tests.
+	t.Setenv("PATH", tmp+string(os.PathListSeparator)+os.Getenv("PATH"))
+
+	oldRescanPath := pciRescanPath
+	oldSCSIGlob := scsiHostScanGlob
+	oldWriteFile := hotplugWriteFile
+	oldExecCommand := hotplugExecCommand
+	oldGlob := hotplugGlob
+	pciRescanPath = rescanPath
+	scsiHostScanGlob = filepath.Join(scanDir, "host*", "scan")
+	hotplugWriteFile = os.WriteFile
+	hotplugExecCommand = exec.Command
+	hotplugGlob = filepath.Glob
+	defer func() { // put the package-level hooks back for whichever test runs next
+		pciRescanPath = oldRescanPath
+		scsiHostScanGlob = oldSCSIGlob
+		hotplugWriteFile = oldWriteFile
+		hotplugExecCommand = oldExecCommand
+		hotplugGlob = oldGlob
+	}()
+
+	bestEffortRescanHotplugStorage() // NOTE(review): its SAS-skip checks still read the real /sys, so a machine whose host0/host1 are SAS hosts would fail the assertions below
+
+	raw, err := os.ReadFile(rescanPath)
+	if err != nil {
+		t.Fatalf("read rescan file: %v", err)
+	}
+	if string(raw) != "1\n" {
+		t.Fatalf("rescan payload=%q want %q", string(raw), "1\n")
+	}
+	for _, path := range []string{host0Path, host1Path} {
+		raw, err := os.ReadFile(path)
+		if err != nil {
+			t.Fatalf("read scsi scan file %s: %v", path, err)
+		}
+		if string(raw) != "- - -\n" {
+			t.Fatalf("scsi scan payload at %s =%q want %q", path, string(raw), "- - -\n")
+		}
+	}
+
+	args, err := os.ReadFile(argsPath)
+	if err != nil {
+		t.Fatalf("read udevadm args: %v", err)
+	}
+	if got := strings.TrimSpace(string(args)); got != "settle --timeout=10" {
+		t.Fatalf("udevadm args=%q want %q", got, "settle --timeout=10")
+	}
+}
--- a/audit/internal/collector/storage_health_test.go
+++ b/audit/internal/collector/storage_health_test.go
@@ -1,11 +1,65 @@
 package collector

 import (
+	"encoding/json"
 	"testing"

 	"bee/audit/internal/schema"
 )

+// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
+// counters are quoted strings and field names differ from NVMe spec prose)
+// is correctly parsed into nvmeSmartLog.
+func TestNVMeSmartLogUnmarshal(t *testing.T) {
+	t.Parallel()
+
+	// Real nvme-cli output: counters are JSON strings, spare is "avail_spare",
+	// percentage used is "percent_used".
+	raw := `{
+		"critical_warning": 0,
+		"temperature": 310,
+		"avail_spare": 100,
+		"spare_thresh": 5,
+		"percent_used": 0,
+		"data_units_read": "10925415",
+		"data_units_written": "8497672",
+		"controller_busy_time": "305",
+		"power_cycles": "53",
+		"power_on_hours": "49",
+		"unsafe_shutdowns": "22",
+		"media_errors": "0",
+		"num_err_log_entries": "0"
+	}`
+	var log nvmeSmartLog
+	if err := json.Unmarshal([]byte(raw), &log); err != nil { // quoted counters must not abort decoding of the whole record
+		t.Fatalf("json.Unmarshal failed: %v", err)
+	}
+	if log.PowerOnHours != 49 { // arrived as the quoted string "49"
+		t.Errorf("PowerOnHours=%d want 49", log.PowerOnHours)
+	}
+	if log.PowerCycles != 53 {
+		t.Errorf("PowerCycles=%d want 53", log.PowerCycles)
+	}
+	if log.AvailableSpare != 100 { // read from the "avail_spare" key
+		t.Errorf("AvailableSpare=%d want 100", log.AvailableSpare)
+	}
+	if log.SpareThreshold != 5 {
+		t.Errorf("SpareThreshold=%d want 5", log.SpareThreshold)
+	}
+	if log.PercentageUsed != 0 { // read from the "percent_used" key
+		t.Errorf("PercentageUsed=%d want 0", log.PercentageUsed)
+	}
+	if log.Temperature != 310 { // arrived as a bare JSON number and is stored as-is
+		t.Errorf("Temperature=%d want 310", log.Temperature)
+	}
+	if log.MediaErrors != 0 {
+		t.Errorf("MediaErrors=%d want 0", log.MediaErrors)
+	}
+	if log.UnsafeShutdowns != 22 {
+		t.Errorf("UnsafeShutdowns=%d want 22", log.UnsafeShutdowns)
+	}
+}
+
 func TestSetStorageHealthStatus(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/collector/storage_scsi_test.go
+++ b/audit/internal/collector/storage_scsi_test.go
@@ -0,0 +1,101 @@
+package collector
+
+import (
+	"testing"
+
+	"bee/audit/internal/schema"
+)
+
+func TestApplySCSISmartctlTelemetry(t *testing.T) {
+	t.Parallel()
+	// Checks that the values in the raw telemetry map are copied into the disk record and the health status.
+	raw := map[string]any{
+		"power_on_time": map[string]any{
+			"hours": float64(32123),
+		},
+		"accumulated_start_stop_cycles":       float64(17),
+		"scsi_grown_defect_list":              float64(4),
+		"percentage_used_endurance_indicator": float64(12),
+		"logical_block_size":                  float64(4096),
+		"logical_blocks_written":              float64(1000),
+		"logical_blocks_read":                 float64(2000),
+	}
+
+	var disk schema.HardwareStorage
+	status := storageHealthStatus{}
+	applySCSISmartctlTelemetry(&disk, raw, &status)
+
+	if disk.PowerOnHours == nil || *disk.PowerOnHours != 32123 {
+		t.Fatalf("power_on_hours=%v want 32123", disk.PowerOnHours)
+	}
+	if disk.PowerCycles == nil || *disk.PowerCycles != 17 {
+		t.Fatalf("power_cycles=%v want 17", disk.PowerCycles)
+	}
+	if disk.ReallocatedSectors == nil || *disk.ReallocatedSectors != 4 {
+		t.Fatalf("reallocated=%v want 4", disk.ReallocatedSectors)
+	}
+	if disk.WrittenBytes == nil || *disk.WrittenBytes != 4096000 { // 1000 blocks * 4096-byte block size
+		t.Fatalf("written_bytes=%v want 4096000", disk.WrittenBytes)
+	}
+	if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 { // 2000 blocks * 4096-byte block size
+		t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes)
+	}
+	if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 4096 {
+		t.Fatalf("logical_block_size_bytes=%v want 4096", disk.LogicalBlockSizeBytes)
+	}
+	if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
+		t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
+	}
+	if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 {
+		t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct)
+	}
+	if disk.LifeRemainingPct == nil || *disk.LifeRemainingPct != 88 { // 100 minus the 12 percent used
+		t.Fatalf("life_remaining_pct=%v want 88", disk.LifeRemainingPct)
+	}
+	if status.reallocatedSectors != 4 {
+		t.Fatalf("status.reallocated=%d want 4", status.reallocatedSectors)
+	}
+	if status.lifeRemainingPct != 88 {
+		t.Fatalf("status.life_remaining_pct=%d want 88", status.lifeRemainingPct)
+	}
+}
+
+func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T) {
+	t.Parallel()
+	// PowerOnHours, WrittenBytes and LifeRemainingPct are pre-set; the call must leave them as they are.
+	powerOnHours := int64(10)
+	writtenBytes := int64(20)
+	lifeRemaining := 30.0
+	disk := schema.HardwareStorage{
+		PowerOnHours:     &powerOnHours,
+		WrittenBytes:     &writtenBytes,
+		LifeRemainingPct: &lifeRemaining,
+	}
+	raw := map[string]any{
+		"power_on_time":                       map[string]any{"hours": float64(999)},
+		"logical_block_size":                  float64(512),
+		"logical_blocks_written":              float64(999),
+		"percentage_used_endurance_indicator": float64(50),
+	}
+
+	applySCSISmartctlTelemetry(&disk, raw, nil) // a nil status is passed here; only the disk record is inspected
+
+	if *disk.PowerOnHours != 10 {
+		t.Fatalf("power_on_hours overwritten: got %d want 10", *disk.PowerOnHours)
+	}
+	if *disk.WrittenBytes != 20 {
+		t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
+	}
+	if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 512 { // unset fields are still filled in
+		t.Fatalf("logical_block_size_bytes=%v want 512", disk.LogicalBlockSizeBytes)
+	}
+	if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
+		t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
+	}
+	if *disk.LifeRemainingPct != 30 {
+		t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
+	}
+	if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 50 {
+		t.Fatalf("life_used_pct=%v want 50", disk.LifeUsedPct)
+	}
+}
--- a/audit/internal/collector/storage_telemetry_test.go
+++ b/audit/internal/collector/storage_telemetry_test.go
@@ -0,0 +1,25 @@
+package collector
+
+import "testing"
+
+// TestSmartLBAsToBytes checks the LBA-count to byte conversion; the expected values use 512 bytes per LBA.
+func TestSmartLBAsToBytes(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name       string
+		lbas, want int64
+	}{
+		{name: "zero", lbas: 0, want: 0},
+		{name: "single lba", lbas: 1, want: 512},
+		{name: "multiple lbas", lbas: 2048, want: 1048576},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := smartLBAsToBytes(tc.lbas); got != tc.want {
+				t.Fatalf("smartLBAsToBytes(%d)=%d want %d", tc.lbas, got, tc.want)
+			}
+		})
+	}
+}
--- a/audit/internal/collector/vroc_test.go
+++ b/audit/internal/collector/vroc_test.go
@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
 	}
 }

+func TestParseMDAdmPlatformLicense(t *testing.T) { // covers Premium, Standard and a missing License line
+	premium := `Platform : Intel(R) Virtual RAID on CPU
+Version  : 1.3.0.1138
+RAID Levels : raid0 raid1 raid5 raid10
+Total Disks : 4
+License  : Premium
+`
+	got := parseMDAdmPlatformLicense(premium)
+	if got == nil || *got != "premium" { // the parsed value is expected in lower case
+		t.Fatalf("expected 'premium', got %v", got)
+	}
+
+	standard := `Platform : Intel(R) Virtual RAID on CPU
+License  : Standard
+`
+	got = parseMDAdmPlatformLicense(standard)
+	if got == nil || *got != "standard" {
+		t.Fatalf("expected 'standard', got %v", got)
+	}
+
+	noLicense := `Platform : Intel(R) Virtual RAID on CPU
+Version  : 1.0.0
+`
+	got = parseMDAdmPlatformLicense(noLicense)
+	if got != nil { // without a License line the parser must return nil
+		t.Fatalf("expected nil, got %v", *got)
+	}
+}
+
 func TestHasVROCController(t *testing.T) {
 	intel := vendorIntel
 	model := "Volume Management Device NVMe RAID Controller"
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_power_autotune.go
+++ b/audit/internal/platform/benchmark_power_autotune.go
@@ -0,0 +1,735 @@
+package platform
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"math"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+)
+
+const (
+	benchmarkPowerAutotuneVersion         = 1  // written into saved configs whose Version is unset
+	benchmarkPowerAutotuneIdleSec         = 60 // idle sampling window, seconds
+	benchmarkPowerAutotuneLoadSec         = 90 // load sampling window, seconds
+	benchmarkPowerAutotuneSampleInterval  = 3  // seconds between telemetry samples
+	defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
+)
+
+// BenchmarkPowerSourceConfigPath returns the autotune config file located in the parent directory of baseDir, or the built-in default path when baseDir is blank.
+func BenchmarkPowerSourceConfigPath(baseDir string) string {
+	if dir := strings.TrimSpace(baseDir); dir != "" {
+		return filepath.Join(filepath.Dir(dir), "power-source-autotune.json")
+	}
+	return defaultBenchmarkPowerSourceConfigPath
+}
+
+// LoadBenchmarkPowerAutotuneConfig reads and decodes the autotune config at path.
+// Read and decode errors are returned as-is; a blank selected_source is rejected.
+func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
+	cfg := new(BenchmarkPowerAutotuneConfig)
+	if raw, err := os.ReadFile(path); err != nil {
+		return nil, err
+	} else if err = json.Unmarshal(raw, cfg); err != nil {
+		return nil, err
+	}
+	if strings.TrimSpace(cfg.SelectedSource) == "" {
+		return nil, fmt.Errorf("autotune config missing selected_source")
+	}
+	return cfg, nil
+}
+
+// SaveBenchmarkPowerAutotuneConfig writes cfg as indented JSON to path via a "<path>.tmp" file followed by a rename.
+func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
+	if strings.TrimSpace(path) == "" {
+		return fmt.Errorf("empty autotune config path")
+	}
+	if cfg.Version <= 0 {
+		cfg.Version = benchmarkPowerAutotuneVersion
+	}
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return err
+	}
+	encoded, err := json.MarshalIndent(cfg, "", "  ")
+	if err != nil {
+		return err
+	}
+	if err = os.WriteFile(path+".tmp", encoded, 0644); err != nil {
+		return err
+	}
+	return os.Rename(path+".tmp", path)
+}
+
+func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
+	return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir)) // exportDir is used as the base directory for locating the config file
+}
+
+func ResetBenchmarkPowerAutotuneConfig(path string) error {
+	if strings.TrimSpace(path) == "" {
+		return fmt.Errorf("empty autotune config path")
+	}
+	if err := os.Remove(path); !os.IsNotExist(err) {
+		return err // nil when the file was removed, otherwise the real removal failure
+	}
+	return nil // the file was already absent, which counts as reset
+}
+
+// normalizeBenchmarkPowerSource maps source (trimmed, case-insensitive) onto a known source name:
+// the SDR PSU input source is kept, anything else becomes DCMI.
+func normalizeBenchmarkPowerSource(source string) string {
+	if strings.TrimSpace(strings.ToLower(source)) == BenchmarkPowerSourceSDRPSUInput {
+		return BenchmarkPowerSourceSDRPSUInput
+	}
+	return BenchmarkPowerSourceDCMI
+}
+
+func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
+	cfg, err := LoadSystemPowerSourceConfig(exportDir)
+	if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" { // a saved autotune choice wins
+		selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
+		return SystemPowerSourceDecision{
+			Configured:      true,
+			SelectedSource:  selected,
+			EffectiveSource: selected,
+			Mode:            "autotuned",
+			Reason:          strings.TrimSpace(cfg.Reason),
+			ConfiguredAt:    cfg.UpdatedAt,
+		}
+	}
+	// No usable saved config: probe the sources once; SDR PSU input is preferred when it reports a positive value.
+	sources := sampleBenchmarkPowerSources()
+	if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
+		return SystemPowerSourceDecision{
+			Configured:      false,
+			EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
+			Mode:            "fallback",
+			Reason:          "autotune config not found; using temporary fallback source sdr_psu_input",
+		}
+	}
+	return SystemPowerSourceDecision{
+		Configured:      false,
+		EffectiveSource: BenchmarkPowerSourceDCMI,
+		Mode:            "fallback",
+		Reason:          "autotune config not found; using temporary fallback source dcmi",
+	}
+}
+
+func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
+	decision := ResolveSystemPowerDecision(exportDir) // saved autotune choice, or a probed fallback
+	if decision.EffectiveSource != "" {
+		if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
+			return value, decision, nil
+		} else if decision.Configured { // configured source gave no positive reading: try the other one
+			fallback := BenchmarkPowerSourceDCMI
+			if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
+				fallback = BenchmarkPowerSourceSDRPSUInput
+			}
+			decision.Mode = "degraded"
+			if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
+				decision.Reason, decision.EffectiveSource = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback), fallback
+				return fallbackValue, decision, nil
+			}
+			decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
+			if err != nil { // err is nil for a zero reading; then the generic error below is returned, never (0, nil)
+				return 0, decision, err
+			}
+		}
+	}
+	return 0, decision, fmt.Errorf("system power source unavailable")
+}
+
+// queryBenchmarkPowerSourceW reads the current value of the given power source.
+// Every source other than the SDR PSU input one is served by the DCMI query.
+func queryBenchmarkPowerSourceW(source string) (float64, error) {
+	if normalizeBenchmarkPowerSource(source) != BenchmarkPowerSourceSDRPSUInput {
+		return queryIPMIServerPowerW()
+	}
+	// SDR path: only a positive PSU input reading counts as available.
+	if watts := sampleIPMISDRPowerSensors().PSUInW; watts > 0 {
+		return watts, nil
+	}
+	return 0, fmt.Errorf("sdr psu input unavailable")
+}
+
+// sampleBenchmarkPowerSources queries DCMI and then SDR PSU input once each and returns
+// the positive readings keyed by source name; failed or non-positive sources are left out.
+func sampleBenchmarkPowerSources() map[string]float64 {
+	out := map[string]float64{}
+	for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
+		if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
+			out[source] = w
+		}
+	}
+	return out
+}
+
+// sampleBenchmarkPowerSourceSeries samples source for durationSec seconds and returns the mean reading.
+// The bool is false when the duration is not positive or no reading was collected.
+func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
+	if durationSec > 0 {
+		if readings := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec); len(readings) > 0 {
+			return benchmarkMean(readings), true
+		}
+	}
+	return 0, false
+}
+
+func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
+	if durationSec <= 0 {
+		return nil
+	}
+	stopCh := make(chan struct{})
+	doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
+	select { // block until the context ends or the sampling window elapses, whichever comes first
+	case <-ctx.Done():
+	case <-time.After(time.Duration(durationSec) * time.Second):
+	}
+	close(stopCh)
+	return <-doneCh // the sampler hands over whatever it gathered, also after cancellation
+}
+
+func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
+	if intervalSec <= 0 { // a non-positive interval falls back to the autotune default
+		intervalSec = benchmarkPowerAutotuneSampleInterval
+	}
+	ch := make(chan []float64, 1) // buffered: the single result send does not wait for the receiver
+	go func() {
+		defer close(ch)
+		var samples []float64
+		record := func() {
+			if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 { // failed or non-positive readings are dropped
+				samples = append(samples, w)
+			}
+		}
+		record() // take one sample immediately, before the first tick
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- samples
+				return
+			case <-ticker.C:
+				record()
+			}
+		}
+	}()
+	return ch
+}
+
+type benchmarkPowerAutotuneSample struct {
+	ElapsedSec     float64            // seconds since the sampling phase started
+	GPUAvgUsagePct float64            // mean usage across the sampled GPUs
+	CPUUsagePct    float64            // CPU load reading taken with the sample
+	GPUSumPowerW   float64            // sum of the sampled GPUs' power readings
+	Sources        map[string]float64 // power readings keyed by power source name
+}
+
+func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
+	if durationSec <= 0 {
+		return nil
+	}
+	var out []benchmarkPowerAutotuneSample
+	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
+	start := time.Now()
+	for { // one telemetry row per iteration until the window ends or ctx is cancelled
+		if ctx.Err() != nil {
+			return out
+		}
+		row := benchmarkPowerAutotuneSample{
+			ElapsedSec:  time.Since(start).Seconds(),
+			CPUUsagePct: sampleCPULoadPct(),
+			Sources:     sampleBenchmarkPowerSources(),
+		}
+		if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 { // on failure the GPU fields stay zero
+			var usageSum float64
+			for _, gpu := range gpuRows {
+				row.GPUSumPowerW += gpu.PowerW
+				usageSum += gpu.UsagePct
+			}
+			row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
+		}
+		out = append(out, row)
+		logBenchmarkPowerAutotuneSample(phase, row, logFunc)
+		if time.Now().After(deadline) { // checked after sampling: a row is recorded before the window can end
+			return out
+		}
+		select {
+		case <-ctx.Done():
+			return out
+		case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
+		}
+	}
+}
+
+// logBenchmarkPowerAutotuneSample writes one log line for a telemetry sample: elapsed
+// time, GPU and CPU load, summed GPU power and the reading of each power source.
+// It does nothing when logFunc is nil.
+func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
+	if logFunc == nil {
+		return
+	}
+	parts := make([]string, 0, 2)
+	for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
+		// A missing key reads as zero, so absent and non-positive readings both print n/a.
+		reading := fmt.Sprintf("%s=n/a", source)
+		if watts := sample.Sources[source]; watts > 0 {
+			reading = fmt.Sprintf("%s=%.0fW", source, watts)
+		}
+		parts = append(parts, reading)
+	}
+	logFunc(fmt.Sprintf(
+		"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
+		phase, sample.ElapsedSec, sample.GPUAvgUsagePct, sample.GPUSumPowerW, sample.CPUUsagePct,
+		strings.Join(parts, " "),
+	))
+}
+
+// logBenchmarkPowerAutotunePhaseSummary logs one summary line for a sampling phase: sample count,
+// mean and p95 GPU/CPU load, mean GPU power and the mean positive reading of each power source.
+// It does nothing when logFunc is nil or there are no samples.
+func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
+	if logFunc == nil || len(samples) == 0 {
+		return
+	}
+	var gpuUsage, cpuUsage, gpuPower []float64
+	perSource := map[string][]float64{}
+	for _, sample := range samples {
+		gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
+		cpuUsage = append(cpuUsage, sample.CPUUsagePct)
+		gpuPower = append(gpuPower, sample.GPUSumPowerW)
+		for source, watts := range sample.Sources {
+			if watts > 0 {
+				perSource[source] = append(perSource[source], watts)
+			}
+		}
+	}
+	parts := make([]string, 0, 2)
+	for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
+		avg := fmt.Sprintf("%s_avg=n/a", source)
+		if values := perSource[source]; len(values) > 0 {
+			avg = fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values))
+		}
+		parts = append(parts, avg)
+	}
+	logFunc(fmt.Sprintf(
+		"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
+		phase,
+		len(samples),
+		benchmarkMean(gpuUsage),
+		benchmarkPercentile(gpuUsage, 95),
+		benchmarkMean(gpuPower),
+		benchmarkMean(cpuUsage),
+		benchmarkPercentile(cpuUsage, 95),
+		strings.Join(parts, " "),
+	))
+}
+
+// logBenchmarkPowerAutotuneSelection logs one line per candidate source (marking the selected
+// one with " SELECTED") plus its selection notes when present; a nil logFunc disables it.
+func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
+	if logFunc == nil {
+		return
+	}
+	for _, candidate := range candidates {
+		if !candidate.Available {
+			logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
+			continue
+		}
+		marker := ""
+		if candidate.Source == selectedSource {
+			marker = " SELECTED"
+		}
+		logFunc(fmt.Sprintf(
+			"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
+			candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW,
+			gpuDelta, candidate.RelativeError, candidate.Confidence*100, marker,
+		))
+		if strings.TrimSpace(candidate.SelectionNotes) != "" {
+			logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
+		}
+	}
+}
+
+func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
+	result := &BenchmarkPowerAutotuneValidation{}
+	if len(samples) == 0 {
+		result.Reason = "no idle telemetry samples collected"
+		return result
+	}
+	var gpuUsage []float64
+	var cpuUsage []float64
+	for _, sample := range samples {
+		gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
+		if sample.CPUUsagePct > 0 { // zero CPU readings are left out of the CPU statistics
+			cpuUsage = append(cpuUsage, sample.CPUUsagePct)
+		}
+	}
+	result.GPUSamples = len(gpuUsage)
+	result.CPUSamples = len(cpuUsage)
+	result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10 // all four statistics are rounded to one decimal
+	result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
+	result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
+	result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
+	switch { // the first exceeded threshold sets Reason; Valid is set only when none is exceeded
+	case result.GPUAvgUsagePct > 5:
+		result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
+	case result.GPUP95UsagePct > 10:
+		result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
+	case result.CPUAvgUsagePct > 20:
+		result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
+	case result.CPUP95UsagePct > 35:
+		result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
+	default:
+		result.Valid = true
+	}
+	return result
+}
+
+func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
+	idleBySource := map[string][]float64{}
+	loadBySource := map[string][]float64{}
+	var idleGPU []float64
+	var loadGPU []float64
+	for _, sample := range idle {
+		idleGPU = append(idleGPU, sample.GPUSumPowerW)
+		for source, value := range sample.Sources {
+			if value > 0 {
+				idleBySource[source] = append(idleBySource[source], value)
+			}
+		}
+	}
+	for _, sample := range load {
+		loadGPU = append(loadGPU, sample.GPUSumPowerW)
+		for source, value := range sample.Sources {
+			if value > 0 {
+				loadBySource[source] = append(loadBySource[source], value)
+			}
+		}
+	}
+	idleGPUAvg := benchmarkMean(idleGPU)
+	loadGPUAvg := benchmarkMean(loadGPU)
+	gpuDelta := loadGPUAvg - idleGPUAvg
+	if gpuDelta <= 0 { // load did not exceed idle: compare against the absolute load average instead
+		gpuDelta = loadGPUAvg
+	}
+	// Both sources are always scored; only those with samples in both phases and a positive delta compete.
+	candidates := []BenchmarkPowerAutotuneCandidate{
+		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
+		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
+	}
+	available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
+	for _, candidate := range candidates {
+		if candidate.Available && candidate.DeltaW > 0 {
+			available = append(available, candidate)
+		}
+	}
+	if len(available) == 0 {
+		return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
+	}
+	sort.Slice(available, func(i, j int) bool {
+		if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 { // near-tie in relative error: SDR PSU input goes first
+			if available[i].Source != available[j].Source {
+				return available[i].Source == BenchmarkPowerSourceSDRPSUInput
+			}
+		}
+		if available[i].RelativeError != available[j].RelativeError {
+			return available[i].RelativeError < available[j].RelativeError
+		}
+		return available[i].Samples > available[j].Samples // last resort: more samples first
+	})
+	selected := available[0]
+	for idx := range candidates {
+		if candidates[idx].Source == selected.Source {
+			candidates[idx].Selected = true
+			candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
+		}
+	}
+	return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
+}
+
+// buildBenchmarkPowerAutotuneCandidate scores one power source from its idle and load readings.
+// A source is available only with readings in both phases; its load-minus-idle delta is then
+// compared against gpuDelta (when positive) to derive the relative error and confidence.
+func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
+	candidate := BenchmarkPowerAutotuneCandidate{Source: source, Samples: minInt(len(idle), len(load))}
+	if candidate.Samples == 0 {
+		return candidate // at least one phase has no readings: Available stays false
+	}
+	candidate.Available = true
+	idleAvg, loadAvg := benchmarkMean(idle), benchmarkMean(load)
+	candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW = idleAvg, loadAvg, loadAvg-idleAvg
+	// Without a positive GPU delta there is nothing to compare against; error and confidence stay zero.
+	if gpuDelta > 0 {
+		candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
+		candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
+	}
+	return candidate
+}
+
+func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
+	var b strings.Builder // output is plain key=value text, one field per line
+	fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
+	fmt.Fprintf(&b, "status=%s\n", result.Status)
+	fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
+	fmt.Fprintf(&b, "profile=%s\n", result.Profile)
+	fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
+	fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
+	fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
+	if result.SelectedSource != "" { // omitted entirely when no source was selected
+		fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
+	}
+	if result.IdleValidation != nil {
+		fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
+		fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
+		fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
+		fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
+		fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
+		if result.IdleValidation.Reason != "" {
+			fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
+		}
+	}
+	for _, candidate := range result.Candidates {
+		fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
+		if candidate.Available { // numeric fields are emitted only for available candidates
+			fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
+			fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
+			fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
+			fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
+		}
+	}
+	return b.String()
+}
+
+func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
+	var b strings.Builder // output is a Markdown document
+	b.WriteString("# Bee Bench Power Source Autotune\n\n")
+	fmt.Fprintf(&b, "**Status:** %s  \n", result.Status) // the two spaces before \n are a Markdown hard line break
+	fmt.Fprintf(&b, "**Benchmark kind:** %s  \n", result.BenchmarkKind)
+	fmt.Fprintf(&b, "**Profile:** %s  \n", result.Profile)
+	fmt.Fprintf(&b, "**Idle window:** %ds  \n", result.IdleDurationSec)
+	fmt.Fprintf(&b, "**Load window:** %ds  \n", result.LoadDurationSec)
+	fmt.Fprintf(&b, "**Sample interval:** %ds  \n", result.SampleIntervalSec)
+	if result.SelectedSource != "" {
+		fmt.Fprintf(&b, "**Selected source:** `%s`  \n", result.SelectedSource)
+	}
+	b.WriteString("\n")
+	if result.IdleValidation != nil {
+		b.WriteString("## Idle Validation\n\n")
+		fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
+		fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
+		fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
+		fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
+		fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
+		if result.IdleValidation.Reason != "" {
+			fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
+		}
+		b.WriteString("\n")
+	}
+	if len(result.Candidates) > 0 {
+		b.WriteString("## Candidates\n\n")
+		b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
+		b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
+		for _, candidate := range result.Candidates {
+			if !candidate.Available { // unavailable sources get a placeholder row
+				fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
+				continue
+			}
+			selected := "no"
+			if candidate.Selected {
+				selected = "yes"
+			}
+			fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
+				candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
+		}
+		b.WriteString("\n")
+	}
+	for _, note := range result.Notes { // notes are appended as a plain bullet list without a heading
+		fmt.Fprintf(&b, "- %s\n", note)
+	}
+	return b.String()
+}
+
+// benchmarkAutotuneLoadCommand builds the load-stage command line for the requested benchmark kind
+// and returns it with the normalized kind name ("power-fit" or "performance").
+func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
+	allDevices := joinIndexList(gpuIndices)
+	switch strings.TrimSpace(strings.ToLower(kind)) {
+	case "power-fit", "power", "nvidia-bench-power":
+		if cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices); err == nil {
+			return cmd, "power-fit"
+		}
+		return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
+	}
+	burn := []string{
+		"bee-gpu-burn",
+		"--seconds", fmt.Sprintf("%d", durationSec),
+		"--devices", allDevices,
+	}
+	if sizeMB > 0 {
+		burn = append(burn, "--size-mb", fmt.Sprintf("%d", sizeMB))
+	}
+	return burn, "performance"
+}
+
+func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	if logFunc == nil {
+		logFunc = func(string) {}
+	}
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = "/var/log/bee-bench/autotune"
+	}
+	if err := os.MkdirAll(baseDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
+	}
+	selected, err := resolveNvidiaGPUSelection(nil, nil) // NOTE(review): nil, nil presumably selects all detected GPUs — confirm
+	if err != nil {
+		return "", err
+	}
+	if len(selected) == 0 {
+		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
+	}
+	ts := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "autotune-"+ts)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
+	}
+	verboseLog := filepath.Join(runDir, "verbose.log")
+	hostname, _ := os.Hostname()
+	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
+	result := BenchmarkPowerAutotuneResult{
+		GeneratedAt:       time.Now().UTC(),
+		Hostname:          hostname,
+		ServerModel:       readServerModel(),
+		BenchmarkKind:     normalizedKind,
+		Profile:           opts.Profile,
+		Status:            "FAILED", // flipped to "OK" only after a source has been selected
+		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
+		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
+		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
+	}
+	// Stage 1: sample the idle system and stop here unless the idle validation passes.
+	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
+	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
+	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
+	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
+	if result.IdleValidation == nil || !result.IdleValidation.Valid {
+		if result.IdleValidation != nil {
+			result.IdleValidationError = result.IdleValidation.Reason
+			logFunc(result.IdleValidation.Reason)
+		}
+		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
+		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+			return "", err
+		}
+		return runDir, fmt.Errorf("%s", result.IdleValidationError)
+	}
+	// Stage 2: run the load command while a goroutine samples telemetry for the load window.
+	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
+	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
+	go func() {
+		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
+	}()
+	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
+	_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644) // best effort: a failed log copy does not fail the run
+	loadSamples := <-loadSamplesCh
+	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
+	if runErr != nil {
+		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
+		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+			return "", err
+		}
+		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
+	}
+	// Stage 3: pick the source, persist the config and write the run artifacts.
+	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
+	result.Candidates = candidates
+	result.GPUPowerIdleW = idleGPUAvg
+	result.GPUPowerLoadW = loadGPUAvg
+	if chooseErr != nil {
+		result.Notes = append(result.Notes, chooseErr.Error())
+		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+			return "", err
+		}
+		return runDir, chooseErr
+	}
+	gpuDelta := loadGPUAvg - idleGPUAvg
+	if gpuDelta <= 0 { // same adjustment as in the selection step, so the logged delta matches it
+		gpuDelta = loadGPUAvg
+	}
+	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
+	result.SelectedSource = selectedSource
+	result.Status = "OK"
+	var confidence float64
+	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
+	for _, candidate := range candidates {
+		if candidate.Selected {
+			confidence = candidate.Confidence
+			if strings.TrimSpace(candidate.SelectionNotes) != "" {
+				selectionReason = candidate.SelectionNotes
+			}
+			break
+		}
+	}
+	cfg := BenchmarkPowerAutotuneConfig{
+		Version:           benchmarkPowerAutotuneVersion,
+		UpdatedAt:         time.Now().UTC(),
+		SelectedSource:    selectedSource,
+		BenchmarkKind:     normalizedKind,
+		Profile:           opts.Profile,
+		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
+		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
+		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
+		Confidence:        confidence,
+		Reason:            selectionReason,
+	}
+	result.Config = &cfg
+	configPath := BenchmarkPowerSourceConfigPath(baseDir)
+	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
+		result.Status = "FAILED" // the selection itself succeeded, but it could not be persisted
+		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
+		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
+			return "", writeErr
+		}
+		return runDir, err
+	}
+	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
+	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
+	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+		return "", err
+	}
+	return runDir, nil
+}
+
+func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error { // writes result.json, summary.txt and report.md into runDir
+	resultJSON, err := json.MarshalIndent(result, "", "  ")
+	if err != nil {
+		return fmt.Errorf("marshal autotune result: %w", err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil { // the first failing write aborts the rest
+		return fmt.Errorf("write autotune result.json: %w", err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
+		return fmt.Errorf("write autotune summary.txt: %w", err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
+		return fmt.Errorf("write autotune report.md: %w", err)
+	}
+	return nil
+}
+
+func minInt(a, b int) int { // returns the smaller of the two operands
+	if b < a {
+		return b
+	}
+	return a
+}
+
+var _ = exec.ErrNotFound // only reference to os/exec in this file; it keeps the import from being reported as unused
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	if result.ScalabilityScore > 0 {
 		fmt.Fprintf(&b, "**Scalability score:** %.1f%%  \n", result.ScalabilityScore)
 	}
+	if result.PlatformPowerScore > 0 {
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n", result.PlatformPowerScore)
+	}
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
 	b.WriteString("\n")

@@ -81,69 +84,164 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")
 	}

-	// ── Methodology ───────────────────────────────────────────────────────────
-	b.WriteString("## Methodology\n\n")
-	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
-	b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
-	b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
-	b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
-	b.WriteString("**Compute score** is derived from two phases:\n\n")
-	b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
-	b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
-	b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
-	b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
-	b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
-	b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
-	b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
-	b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
-	b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
-	b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
+	// ── Balanced Scorecard ────────────────────────────────────────────────────
+	b.WriteString("## Balanced Scorecard\n\n")

-	// ── Scorecard table ───────────────────────────────────────────────────────
-	b.WriteString("## Scorecard\n\n")
-	b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
-	b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
-	for _, gpu := range result.GPUs {
-		name := strings.TrimSpace(gpu.Name)
-		if name == "" {
-			name = "Unknown GPU"
+	// Perspective 1: Compatibility — hard stops
+	b.WriteString("### 1. Compatibility\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			thermalThrottle := "-"
+			if gpu.Scores.ThermalThrottlePct > 0 {
+				thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+			}
+			fanAtThrottle := "-"
+			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
+				fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+			}
+			ecc := "-"
+			if gpu.ECC.Uncorrected > 0 {
+				ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
+			}
+			compatStatus := "✓ OK"
+			if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
+				compatStatus = "⛔ HARD STOP"
+			}
+			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
 		}
-		interconnect := "-"
-		if gpu.Scores.InterconnectScore > 0 {
-			interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
-		}
-		topsPerSM := "-"
-		if gpu.Scores.TOPSPerSMPerGHz > 0 {
-			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
-		}
-		synthetic := "-"
-		if gpu.Scores.SyntheticScore > 0 {
-			synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
-		}
-		mixed := "-"
-		if gpu.Scores.MixedScore > 0 {
-			mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
-		}
-		mixedEff := "-"
-		if gpu.Scores.MixedEfficiency > 0 {
-			mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
-		}
-		fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
-			gpu.Index, name,
-			gpu.Status,
-			gpu.Scores.CompositeScore,
-			gpu.Scores.ComputeScore,
-			synthetic,
-			mixed,
-			mixedEff,
-			topsPerSM,
-			gpu.Scores.PowerSustainScore,
-			gpu.Scores.ThermalSustainScore,
-			gpu.Scores.StabilityScore,
-			interconnect,
-		)
+		b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
+		b.WriteString("\n")
+	}
+
+	// Perspective 2: Thermal headroom
+	b.WriteString("### 2. Thermal Headroom\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			shutdownTemp := gpu.ShutdownTempC
+			if shutdownTemp <= 0 {
+				shutdownTemp = 90
+			}
+			slowdownTemp := gpu.SlowdownTempC
+			if slowdownTemp <= 0 {
+				slowdownTemp = 80
+			}
+			headroom := gpu.Scores.TempHeadroomC
+			thermalStatus := "✓ OK"
+			switch {
+			case headroom < 10:
+				thermalStatus = "⛔ CRITICAL"
+			case gpu.Steady.P95TempC >= slowdownTemp:
+				thermalStatus = "⚠ WARNING"
+			}
+			throttlePct := "-"
+			if gpu.Scores.ThermalThrottlePct > 0 {
+				throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
+				fmt.Sprintf("%.0f°C", slowdownTemp),
+				fmt.Sprintf("%.0f°C", shutdownTemp),
+				fmt.Sprintf("%.1f°C", headroom),
+				throttlePct,
+				thermalStatus,
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
+		b.WriteString("\n")
+	}
+
+	// Perspective 3: Power delivery
+	b.WriteString("### 3. Power Delivery\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			powerCap := "-"
+			if gpu.Scores.PowerCapThrottlePct > 0 {
+				powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
+			}
+			fanDuty := "-"
+			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
+				fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+			}
+			powerStatus := "✓ OK"
+			if gpu.Scores.PowerCapThrottlePct > 5 {
+				powerStatus = "⚠ POWER LIMITED"
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				powerCap,
+				fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
+				fanDuty,
+				powerStatus,
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
+		b.WriteString("\n")
+	}
+
+	// Perspective 4: Performance
+	b.WriteString("### 4. Performance\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			synthetic := "-"
+			if gpu.Scores.SyntheticScore > 0 {
+				synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
+			}
+			mixed := "-"
+			if gpu.Scores.MixedScore > 0 {
+				mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
+			}
+			mixedEff := "-"
+			if gpu.Scores.MixedEfficiency > 0 {
+				mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
+			}
+			topsPerSM := "-"
+			if gpu.Scores.TOPSPerSMPerGHz > 0 {
+				topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
+				synthetic, mixed, mixedEff, topsPerSM,
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
+		if len(result.PerformanceRampSteps) > 0 {
+			fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
+		}
+		b.WriteString("\n")
+	}
+
+	// Perspective 5: Anomaly flags
+	b.WriteString("### 5. Anomalies\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			eccCorr := "-"
+			if gpu.ECC.Corrected > 0 {
+				eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
+			}
+			syncBoost := "-"
+			if gpu.Scores.SyncBoostThrottlePct > 0 {
+				syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
+			}
+			powerVar := "OK"
+			if gpu.Scores.PowerSustainScore < 70 {
+				powerVar = "⚠ unstable"
+			}
+			thermalVar := "OK"
+			if gpu.Scores.ThermalSustainScore < 70 {
+				thermalVar = "⚠ unstable"
+			}
+			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
+		b.WriteString("\n")
 	}
-	b.WriteString("\n")

 	// ── Per GPU detail ────────────────────────────────────────────────────────
 	b.WriteString("## Per-GPU Details\n\n")
@@ -171,13 +269,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
 		}
 		if gpu.PowerLimitDerated {
-			fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
+			fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
 		}
 		if gpu.CalibratedPeakPowerW > 0 {
 			if gpu.CalibratedPeakTempC > 0 {
-				fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
 			} else {
-				fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
 			}
 		}
 		if gpu.LockedGraphicsClockMHz > 0 {
@@ -186,19 +284,27 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")

 		// Steady-state telemetry
-		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
-		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
-		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
-		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
-		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
-		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
-		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
-		b.WriteString("\n")
+		if benchmarkTelemetryAvailable(gpu.Steady) {
+			fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+			b.WriteString(fmtMDTable(
+				[]string{"", "Avg", "P95"},
+				[][]string{
+					{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
+					{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
+					{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
+					{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
+					{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
+				},
+			))
+			b.WriteString("\n")
+		} else {
+			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
+		}

 		// Per-precision stability phases.
 		if len(gpu.PrecisionSteady) > 0 {
 			b.WriteString("**Per-precision stability:**\n\n")
-			b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
+			var precRows [][]string
 			for _, p := range gpu.PrecisionSteady {
 				eccCorr := "—"
 				eccUncorr := "—"
@@ -210,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 				if strings.TrimSpace(status) == "" {
 					status = "OK"
 				}
-				fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
-					p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
-					eccCorr, eccUncorr)
+				precRows = append(precRows, []string{
+					p.Precision, status,
+					fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
+					fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
+					fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
+					eccCorr, eccUncorr,
+				})
 			}
+			b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
 			b.WriteString("\n")
 		} else {
 			// Legacy: show combined-window variance.
@@ -236,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		// Precision results
 		if len(gpu.PrecisionResults) > 0 {
 			b.WriteString("**Precision results:**\n\n")
-			b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
+			var presRows [][]string
 			for _, p := range gpu.PrecisionResults {
 				if p.Supported {
-					weightStr := fmt.Sprintf("×%.3g", p.Weight)
-					fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
-						p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
+					presRows = append(presRows, []string{
+						p.Name,
+						fmt.Sprintf("%.2f", p.TeraOpsPerSec),
+						fmt.Sprintf("×%.3g", p.Weight),
+						fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
+						fmt.Sprintf("%d", p.Lanes),
+						fmt.Sprintf("%d", p.Iterations),
+					})
 				} else {
-					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
+					presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
 				}
 			}
+			b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
 			b.WriteString("\n")
 		}

@@ -267,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("## Interconnect (NCCL)\n\n")
 		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
 		if result.Interconnect.Supported {
-			b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
-			fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
-			fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString(fmtMDTable(
+				[]string{"Metric", "Avg", "Max"},
+				[][]string{
+					{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
+					{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
+				},
+			))
 			b.WriteString("\n")
 		}
 		for _, note := range result.Interconnect.Notes {
@@ -280,20 +401,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

-	// ── Server Power (IPMI) ───────────────────────────────────────────────────
+	// ── Server Power ───────────────────────────────────────────────────────────
 	if sp := result.ServerPower; sp != nil {
-		b.WriteString("## Server Power (IPMI)\n\n")
+		title := "## Server Power\n\n"
+		if sp.Source != "" {
+			title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
+		}
+		b.WriteString(title)
 		if !sp.Available {
-			b.WriteString("IPMI power measurement unavailable.\n\n")
+			b.WriteString("Server power measurement unavailable.\n\n")
 		} else {
-			b.WriteString("| | Value |\n|---|---|\n")
-			fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
-			fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
-			fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
-			fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
-			if sp.ReportingRatio > 0 {
-				fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
+			spRows := [][]string{
+				{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
+				{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
+				{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
+				{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
 			}
+			if sp.ReportingRatio > 0 {
+				spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
+			}
+			b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
 			b.WriteString("\n")
 		}
 		for _, note := range sp.Notes {
@@ -304,19 +431,33 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

+	// ── PSU Issues ────────────────────────────────────────────────────────────
+	if len(result.PSUIssues) > 0 {
+		b.WriteString("## PSU Issues\n\n")
+		b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
+		for _, issue := range result.PSUIssues {
+			fmt.Fprintf(&b, "- ⛔ %s\n", issue)
+		}
+		b.WriteString("\n")
+	}
+
 	// ── Cooling ───────────────────────────────────────────────────────────────
 	if cooling := result.Cooling; cooling != nil {
 		b.WriteString("## Cooling\n\n")
 		if cooling.Available {
-			b.WriteString("| Metric | Value |\n|--------|-------|\n")
-			fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
+			dutyAvg, dutyP95 := "N/A", "N/A"
 			if cooling.FanDutyCycleAvailable {
-				fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
-				fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
-			} else {
-				b.WriteString("| Average fan duty cycle | N/A |\n")
-				b.WriteString("| P95 fan duty cycle | N/A |\n")
+				dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
+				dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
 			}
+			b.WriteString(fmtMDTable(
+				[]string{"Metric", "Value"},
+				[][]string{
+					{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
+					{"Average fan duty cycle", dutyAvg},
+					{"P95 fan duty cycle", dutyP95},
+				},
+			))
 			b.WriteString("\n")
 		} else {
 			b.WriteString("Cooling telemetry unavailable.\n\n")
@@ -329,6 +470,23 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

+	// ── Platform Scalability ──────────────────────────────────────────────────
+	if len(result.PerformanceRampSteps) > 0 {
+		b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n\n", result.PlatformPowerScore)
+		var scalRows [][]string
+		for _, step := range result.PerformanceRampSteps {
+			scalRows = append(scalRows, []string{
+				fmt.Sprintf("%d", step.StepIndex),
+				joinIndexList(step.GPUIndices),
+				fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
+				fmt.Sprintf("%.1f%%", step.ScalabilityPct),
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
+		b.WriteString("\n")
+	}
+
 	// ── Raw files ─────────────────────────────────────────────────────────────
 	b.WriteString("## Raw Files\n\n")
 	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
--- a/audit/internal/platform/benchmark_table.go
+++ b/audit/internal/platform/benchmark_table.go
@@ -0,0 +1,75 @@
+package platform
+
+import (
+	"strings"
+)
+
+// fmtMDTable renders a markdown table whose columns are padded to a common
+// width, so the table stays readable as plain text without a markdown
+// renderer.
+//
+// headers holds the column titles; an empty headers slice yields "".
+// rows holds the data rows. A row with fewer cells than headers is padded
+// with empty cells, and cells beyond len(headers) are ignored.
+//
+// Column widths are measured in runes rather than bytes, so multi-byte
+// characters such as "°", "—" or "✓" do not push the column borders out of
+// alignment. Glyphs that a terminal draws two columns wide (some emoji) may
+// still appear one column off.
+func fmtMDTable(headers []string, rows [][]string) string {
+	ncols := len(headers)
+	if ncols == 0 {
+		return ""
+	}
+
+	// runeLen counts characters instead of bytes.
+	runeLen := func(s string) int { return len([]rune(s)) }
+	// cellAt returns the i-th cell of row, or "" when the row is too short.
+	cellAt := func(row []string, i int) string {
+		if i < len(row) {
+			return row[i]
+		}
+		return ""
+	}
+
+	// Compute max width per column.
+	widths := make([]int, ncols)
+	for i, h := range headers {
+		widths[i] = runeLen(h)
+	}
+	for _, row := range rows {
+		for i := 0; i < ncols; i++ {
+			if w := runeLen(cellAt(row, i)); w > widths[i] {
+				widths[i] = w
+			}
+		}
+	}
+
+	var b strings.Builder
+
+	// writeRow emits one "| a | b |" line, padding every cell to its column
+	// width. The padding is never negative because widths holds the maximum
+	// rune length seen in each column.
+	writeRow := func(cell func(i int) string) {
+		b.WriteByte('|')
+		for i := 0; i < ncols; i++ {
+			c := cell(i)
+			b.WriteByte(' ')
+			b.WriteString(c)
+			b.WriteString(strings.Repeat(" ", widths[i]-runeLen(c)))
+			b.WriteString(" |")
+		}
+		b.WriteByte('\n')
+	}
+
+	// Header row.
+	writeRow(func(i int) string { return headers[i] })
+
+	// Separator row: dashes span the cell plus its two padding spaces.
+	b.WriteByte('|')
+	for _, w := range widths {
+		b.WriteString(strings.Repeat("-", w+2))
+		b.WriteByte('|')
+	}
+	b.WriteByte('\n')
+
+	// Data rows.
+	for _, row := range rows {
+		writeRow(func(i int) string { return cellAt(row, i) })
+	}
+
+	return b.String()
+}
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -1,8 +1,13 @@
 package platform

 import (
+	"context"
+	"fmt"
+	"os/exec"
+	"path/filepath"
 	"strings"
 	"testing"
+	"time"
 )

 func TestResolveBenchmarkProfile(t *testing.T) {
@@ -49,8 +54,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 		benchmarkPrecisionPhases,
 		func(label string) string { return label },
 	)
-	if len(labels) != 7 || len(phases) != 7 {
-		t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
+	if len(labels) != 5 || len(phases) != 5 {
+		t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
 	}
 	if basePhaseSec != 60 {
 		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
@@ -61,7 +66,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
 		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
 	}
-	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
+	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -80,7 +85,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
 	if mixedPhaseSec != 3600 {
 		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
+	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -99,7 +104,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
 	if mixedPhaseSec != 14400 {
 		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
+	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -133,10 +138,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
 func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
 	t.Parallel()

-	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
+	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
-	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
+	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
 }
@@ -164,6 +169,99 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
 	}
 }

+// TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons checks that
+// benchmarkCalibrationThrottleReason returns "" when only a power-related
+// counter (SW power cap, HW power brake) grew, and returns "hw_thermal" /
+// "sw_thermal" when the matching thermal counter grew.
+func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
+	t.Parallel()
+
+	var baseline BenchmarkThrottleCounters
+	checks := []struct {
+		after   BenchmarkThrottleCounters
+		want    string
+		failFmt string
+	}{
+		{BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}, "", "sw_power_cap should be ignored, got %q"},
+		{BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}, "", "hw_power_brake should be ignored, got %q"},
+		{BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}, "hw_thermal", "hw_thermal mismatch: got %q"},
+		{BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}, "sw_thermal", "sw_thermal mismatch: got %q"},
+	}
+	for _, c := range checks {
+		if got := benchmarkCalibrationThrottleReason(baseline, c.after); got != c.want {
+			t.Fatalf(c.failFmt, got)
+		}
+	}
+}
+
+// TestResetBenchmarkGPUsSkipsWithoutRoot stubs the effective UID to 1000 and
+// checks that resetBenchmarkGPUs never invokes the reset hook, logs the
+// "GPU reset skipped" message, and returns both requested indices as failed.
+func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
+	prevGeteuid, prevReset := benchmarkGeteuid, benchmarkResetNvidiaGPU
+	t.Cleanup(func() {
+		benchmarkGeteuid = prevGeteuid
+		benchmarkResetNvidiaGPU = prevReset
+	})
+	benchmarkGeteuid = func() int { return 1000 }
+	benchmarkResetNvidiaGPU = func(int) (string, error) {
+		t.Fatal("unexpected reset call")
+		return "", nil
+	}
+
+	var captured []string
+	record := func(line string) {
+		captured = append(captured, line)
+	}
+	logPath := filepath.Join(t.TempDir(), "verbose.log")
+	failed := resetBenchmarkGPUs(context.Background(), logPath, []int{0, 2}, record)
+
+	got := strings.Join(captured, "\n")
+	want := "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"
+	if !strings.Contains(got, want) {
+		t.Fatalf("logs=%q want substring %q", got, want)
+	}
+	if !(len(failed) == 2 && failed[0] == 0 && failed[1] == 2) {
+		t.Fatalf("failed=%v want [0 2]", failed)
+	}
+}
+
+// TestResetBenchmarkGPUsResetsEachGPU stubs the effective UID to 0, makes the
+// sleep hook a no-op and the reset hook always succeed, then checks that the
+// reset hook was called for indices 2 and 5 in that order and that no index
+// is reported as failed.
+func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
+	prevGeteuid, prevSleep, prevReset := benchmarkGeteuid, benchmarkSleep, benchmarkResetNvidiaGPU
+	t.Cleanup(func() {
+		benchmarkGeteuid = prevGeteuid
+		benchmarkSleep = prevSleep
+		benchmarkResetNvidiaGPU = prevReset
+	})
+
+	var resetOrder []int
+	benchmarkGeteuid = func() int { return 0 }
+	benchmarkSleep = func(time.Duration) {}
+	benchmarkResetNvidiaGPU = func(idx int) (string, error) {
+		resetOrder = append(resetOrder, idx)
+		return "ok\n", nil
+	}
+
+	logPath := filepath.Join(t.TempDir(), "verbose.log")
+	failed := resetBenchmarkGPUs(context.Background(), logPath, []int{2, 5}, nil)
+	if len(failed) != 0 {
+		t.Fatalf("failed=%v want no failures", failed)
+	}
+	if got, want := fmt.Sprint(resetOrder), "[2 5]"; got != want {
+		t.Fatalf("calls=%v want %s", resetOrder, want)
+	}
+}
+
+// TestResetBenchmarkGPUsTracksFailuresFromSharedReset stubs the effective UID
+// to 0 and makes the reset hook fail only for GPU index 5, then checks that
+// resetBenchmarkGPUs reports exactly [5] as failed.
+func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
+	prevGeteuid, prevSleep, prevReset := benchmarkGeteuid, benchmarkSleep, benchmarkResetNvidiaGPU
+	t.Cleanup(func() {
+		benchmarkGeteuid = prevGeteuid
+		benchmarkSleep = prevSleep
+		benchmarkResetNvidiaGPU = prevReset
+	})
+
+	benchmarkGeteuid = func() int { return 0 }
+	benchmarkSleep = func(time.Duration) {}
+	benchmarkResetNvidiaGPU = func(idx int) (string, error) {
+		if idx != 5 {
+			return "ok\n", nil
+		}
+		return "busy\n", exec.ErrNotFound
+	}
+
+	logPath := filepath.Join(t.TempDir(), "verbose.log")
+	failed := resetBenchmarkGPUs(context.Background(), logPath, []int{2, 5}, nil)
+	if got, want := fmt.Sprint(failed), "[5]"; got != want {
+		t.Fatalf("failed=%v want %s", failed, want)
+	}
+}
+
 func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	t.Parallel()

@@ -179,6 +277,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	}
 }

+// TestInitialBenchmarkCalibrationLimitW is a table-driven check of
+// initialBenchmarkCalibrationLimitW: the default limit is expected to win
+// over the current limit, to be capped by the reported maximum, and the
+// current and then maximum limits serve as fallbacks when the default is
+// absent.
+func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
+	t.Parallel()
+
+	type scenario struct {
+		name string
+		info benchmarkGPUInfo
+		want int
+	}
+	scenarios := []scenario{
+		{
+			name: "prefers default tdp over current derated limit",
+			info: benchmarkGPUInfo{PowerLimitW: 500, DefaultPowerLimitW: 600, MaxPowerLimitW: 600},
+			want: 600,
+		},
+		{
+			name: "caps default tdp to reported max limit",
+			info: benchmarkGPUInfo{PowerLimitW: 500, DefaultPowerLimitW: 700, MaxPowerLimitW: 650},
+			want: 650,
+		},
+		{
+			name: "falls back to current limit when default missing",
+			info: benchmarkGPUInfo{PowerLimitW: 525, MaxPowerLimitW: 600},
+			want: 525,
+		},
+		{
+			name: "falls back to max limit when only that is known",
+			info: benchmarkGPUInfo{MaxPowerLimitW: 575},
+			want: 575,
+		},
+	}
+
+	for _, sc := range scenarios {
+		sc := sc // per-iteration copy for the subtest closure
+		t.Run(sc.name, func(t *testing.T) {
+			got := initialBenchmarkCalibrationLimitW(sc.info)
+			if got != sc.want {
+				t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", sc.info, got, sc.want)
+			}
+		})
+	}
+}
+
 func TestParseBenchmarkBurnLog(t *testing.T) {
 	t.Parallel()

@@ -314,12 +465,40 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
 	}
 }

-func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
+// TestScoreBenchmarkGPUIgnoresDisabledPrecisions feeds scoreBenchmarkGPUResult
+// a result whose fp64 and fp4 entries carry a weighted throughput of 999 and
+// expects them not to count: SyntheticScore must equal the fp16 steady value
+// (100) and MixedScore the fp32_tf32 result value (50).
+func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
+	t.Parallel()
+
+	input := BenchmarkGPUResult{
+		PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
+			{Precision: "fp16", WeightedTeraOpsPerSec: 100},
+			{Precision: "fp64", WeightedTeraOpsPerSec: 999},
+			{Precision: "fp4", WeightedTeraOpsPerSec: 999},
+		},
+		PrecisionResults: []BenchmarkPrecisionResult{
+			{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
+			{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
+			{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
+		},
+	}
+
+	got := scoreBenchmarkGPUResult(input)
+	if got.SyntheticScore != 100 {
+		t.Fatalf("SyntheticScore=%f want 100", got.SyntheticScore)
+	}
+	if got.MixedScore != 50 {
+		t.Fatalf("MixedScore=%f want 50", got.MixedScore)
+	}
+}
+
+func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
+    Min Power Limit                       : 200.00 W
+    Max Power Limit                       : 600.00 W
+    Default Power Limit                   : 575.00 W
+    Current Power Limit                   : 560.00 W
    Clocks
        Graphics                          : 2422 MHz
        Memory                            : 12481 MHz
@@ -341,7 +520,7 @@ GPU 00000000:4F:00.0
 		1: {Index: 1, BusID: "00000000:4F:00.0"},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -355,25 +534,49 @@ GPU 00000000:4F:00.0
 	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
 		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
+	}
+	if infoByIndex[0].MaxPowerLimitW != 600 {
+		t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
+	}
+	if infoByIndex[0].DefaultPowerLimitW != 575 {
+		t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
+	}
+	if infoByIndex[0].PowerLimitW != 560 {
+		t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
+	}
 }

-func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
+    Min Power Limit                       : 100.00 W
+    Max Power Limit                       : 900.00 W
    Max Clocks
        Graphics                          : 9999 MHz
        Memory                            : 9999 MHz
 `)
 	// Already populated — must not be overwritten.
 	infoByIndex := map[int]benchmarkGPUInfo{
-		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
+		0: {
+			Index:               0,
+			BusID:               "00000000:4E:00.0",
+			MaxGraphicsClockMHz: 2430,
+			MaxMemoryClockMHz:   12481,
+			MinPowerLimitW:      200,
+			MaxPowerLimitW:      600,
+		},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
+	}
 }
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
 	Available             bool     `json:"available"`
 	AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
 	FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
 	AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
 	P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
 	Notes                 []string `json:"notes,omitempty"`
@@ -42,40 +43,151 @@ const (
 	NvidiaBenchmarkProfileOvernight = "overnight"
 )

+const (
+	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
+	BenchmarkPowerEngineTargetedPower  = "targeted_power"
+)
+
+// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
+// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
+// re-measure from actual task logs and update the constants here.
+//
+// Sources:
+//   - BenchmarkEstimatedPerfStandardSec:   MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
+//   - BenchmarkEstimatedPerfStabilitySec:  xFusion v8.22 ramp 1-8: 5532 s
+//   - BenchmarkEstimatedPerfOvernightSec:  derived from profile phases (SteadySec=27000)
+//   - BenchmarkEstimatedPowerStandardSec:  MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
+//   - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
+const (
+	// Performance Benchmark (bee-gpu-burn).
+	// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
+	// Sequential per-GPU mode scales approximately linearly.
+	BenchmarkEstimatedPerfStandardSec  = 960  // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
+	BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
+	BenchmarkEstimatedPerfOvernightSec = 8 * 3600
+
+	// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
+	// Duration is for the full ramp-up run; individual steps vary with convergence speed.
+	BenchmarkEstimatedPowerStandardSec  = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
+	BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
+	BenchmarkEstimatedPowerOvernightSec = 3 * 3600
+)
+
 type NvidiaBenchmarkOptions struct {
 	Profile           string
 	SizeMB            int
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	RunNCCL           bool
+	ServerPowerSource string
 	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
 	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
 	RampTotal         int    // total number of ramp-up steps in this run
 	RampRunID         string // shared identifier across all steps of the same ramp-up run
 }

+const (
+	BenchmarkPowerSourceDCMI        = "dcmi"
+	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
+)
+
+type BenchmarkPowerAutotuneConfig struct {
+	Version           int       `json:"version"`
+	UpdatedAt         time.Time `json:"updated_at"`
+	SelectedSource    string    `json:"selected_source"`
+	BenchmarkKind     string    `json:"benchmark_kind,omitempty"`
+	Profile           string    `json:"profile,omitempty"`
+	IdleDurationSec   int       `json:"idle_duration_sec,omitempty"`
+	LoadDurationSec   int       `json:"load_duration_sec,omitempty"`
+	SampleIntervalSec int       `json:"sample_interval_sec,omitempty"`
+	Confidence        float64   `json:"confidence,omitempty"`
+	Reason            string    `json:"reason,omitempty"`
+}
+
+type SystemPowerSourceDecision struct {
+	Configured      bool      `json:"configured"`
+	SelectedSource  string    `json:"selected_source,omitempty"`
+	EffectiveSource string    `json:"effective_source,omitempty"`
+	Mode            string    `json:"mode,omitempty"` // autotuned, fallback, degraded
+	Reason          string    `json:"reason,omitempty"`
+	ConfiguredAt    time.Time `json:"configured_at,omitempty"`
+}
+
+type BenchmarkPowerAutotuneResult struct {
+	GeneratedAt         time.Time                         `json:"generated_at"`
+	Hostname            string                            `json:"hostname,omitempty"`
+	ServerModel         string                            `json:"server_model,omitempty"`
+	BenchmarkKind       string                            `json:"benchmark_kind,omitempty"`
+	Profile             string                            `json:"profile,omitempty"`
+	Status              string                            `json:"status"`
+	IdleDurationSec     int                               `json:"idle_duration_sec"`
+	LoadDurationSec     int                               `json:"load_duration_sec"`
+	SampleIntervalSec   int                               `json:"sample_interval_sec"`
+	SelectedSource      string                            `json:"selected_source,omitempty"`
+	IdleValidationError string                            `json:"idle_validation_error,omitempty"`
+	IdleValidation      *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
+	GPUPowerIdleW       float64                           `json:"gpu_power_idle_w,omitempty"`
+	GPUPowerLoadW       float64                           `json:"gpu_power_load_w,omitempty"`
+	Candidates          []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
+	Notes               []string                          `json:"notes,omitempty"`
+	Config              *BenchmarkPowerAutotuneConfig     `json:"config,omitempty"`
+}
+
+type BenchmarkPowerAutotuneValidation struct {
+	Valid          bool    `json:"valid"`
+	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
+	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
+	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
+	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
+	GPUSamples     int     `json:"gpu_samples,omitempty"`
+	CPUSamples     int     `json:"cpu_samples,omitempty"`
+	Reason         string  `json:"reason,omitempty"`
+}
+
+type BenchmarkPowerAutotuneCandidate struct {
+	Source         string  `json:"source"`
+	IdleAvgW       float64 `json:"idle_avg_w,omitempty"`
+	LoadAvgW       float64 `json:"load_avg_w,omitempty"`
+	DeltaW         float64 `json:"delta_w,omitempty"`
+	Samples        int     `json:"samples,omitempty"`
+	RelativeError  float64 `json:"relative_error,omitempty"`
+	Confidence     float64 `json:"confidence,omitempty"`
+	Selected       bool    `json:"selected,omitempty"`
+	Available      bool    `json:"available"`
+	SelectionNotes string  `json:"selection_notes,omitempty"`
+}
+
 type NvidiaBenchmarkResult struct {
-	BenchmarkVersion   string                       `json:"benchmark_version"`
-	GeneratedAt        time.Time                    `json:"generated_at"`
-	Hostname           string                       `json:"hostname,omitempty"`
-	ServerModel        string                       `json:"server_model,omitempty"`
-	BenchmarkProfile   string                       `json:"benchmark_profile"`
-	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
-	RampStep           int                          `json:"ramp_step,omitempty"`
-	RampTotal          int                          `json:"ramp_total,omitempty"`
-	RampRunID          string                       `json:"ramp_run_id,omitempty"`
-	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
-	OverallStatus      string                       `json:"overall_status"`
-	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
-	Findings           []string                     `json:"findings,omitempty"`
-	Warnings           []string                     `json:"warnings,omitempty"`
-	Normalization      BenchmarkNormalization       `json:"normalization"`
-	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
-	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
-	Cooling            *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
-	GPUs               []BenchmarkGPUResult         `json:"gpus"`
-	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
-	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
+	BenchmarkVersion string    `json:"benchmark_version"`
+	GeneratedAt      time.Time `json:"generated_at"`
+	Hostname         string    `json:"hostname,omitempty"`
+	ServerModel      string    `json:"server_model,omitempty"`
+	BenchmarkProfile string    `json:"benchmark_profile"`
+	ParallelGPUs     bool      `json:"parallel_gpus,omitempty"`
+	RampStep         int       `json:"ramp_step,omitempty"`
+	RampTotal        int       `json:"ramp_total,omitempty"`
+	RampRunID        string    `json:"ramp_run_id,omitempty"`
+	ScalabilityScore float64   `json:"scalability_score,omitempty"`
+	// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
+	// 100% = each added GPU contributes exactly its single-card throughput.
+	// < 100% = throughput loss due to thermal throttle, power limits, or contention.
+	PlatformPowerScore   float64                      `json:"platform_power_score,omitempty"`
+	PerformanceRampSteps []NvidiaPerformanceRampStep  `json:"performance_ramp_steps,omitempty"`
+	OverallStatus        string                       `json:"overall_status"`
+	SelectedGPUIndices   []int                        `json:"selected_gpu_indices"`
+	Findings             []string                     `json:"findings,omitempty"`
+	Warnings             []string                     `json:"warnings,omitempty"`
+	Normalization        BenchmarkNormalization       `json:"normalization"`
+	HostConfig           *BenchmarkHostConfig         `json:"host_config,omitempty"`
+	CPULoad              *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
+	Cooling              *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
+	GPUs                 []BenchmarkGPUResult         `json:"gpus"`
+	Interconnect         *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	ServerPower          *BenchmarkServerPower        `json:"server_power,omitempty"`
+	// PSUIssues holds power supply fault events detected by comparing IPMI PSU
+	// sensor states before and after the benchmark run. Empty when IPMI is
+	// unavailable or no PSU faults occurred during the test.
+	PSUIssues []string `json:"psu_issues,omitempty"`
 }

 type BenchmarkNormalization struct {
@@ -107,6 +219,12 @@ type BenchmarkGPUResult struct {
 	PowerLimitDerated   bool    `json:"power_limit_derated,omitempty"`
 	MultiprocessorCount int     `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW  float64 `json:"default_power_limit_w,omitempty"`
+	// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
+	// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
+	ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
+	// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
+	// Fallback: 80°C.
+	SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
 	// CalibratedPeakPowerW is the p95 power measured during a short
 	// dcgmi targeted_power calibration run before the main benchmark.
 	// Used as the reference denominator for PowerSustainScore instead of
@@ -206,25 +324,87 @@ type BenchmarkScorecard struct {
 	MixedEfficiency     float64 `json:"mixed_efficiency,omitempty"`
 	PowerSustainScore   float64 `json:"power_sustain_score"`
 	ThermalSustainScore float64 `json:"thermal_sustain_score"`
-	StabilityScore      float64 `json:"stability_score"`
-	InterconnectScore   float64 `json:"interconnect_score"`
-	CompositeScore      float64 `json:"composite_score"`
+	// StabilityScore falls as the fraction of steady-state time the GPU spent
+	// throttling (thermal + power cap combined) rises. 0% throttle = 100; 100% throttle = 0.
+	StabilityScore float64 `json:"stability_score"`
+
+	// Throttle breakdown — percentage of steady-state time in each throttle type.
+	// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
+	ThermalThrottlePct   float64 `json:"thermal_throttle_pct"`   // HW+SW thermal slowdown
+	PowerCapThrottlePct  float64 `json:"power_cap_throttle_pct"` // SW power cap
+	SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
+
+	// Temperature headroom: distance to the 100°C destruction threshold.
+	// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
+	// Independent of throttle — a GPU at 86°C without throttle still has only 14°C headroom (warning).
+	TempHeadroomC float64 `json:"temp_headroom_c"`
+
+	InterconnectScore float64 `json:"interconnect_score"`
+	// ServerQualityScore (0–100) reflects server infrastructure quality independent
+	// of GPU model. Combines throttle time, power variance, and temp variance.
+	// Use this to compare servers with the same GPU, or to flag a bad server
+	// that throttles an otherwise fast GPU.
+	ServerQualityScore float64 `json:"server_quality_score"`
+	// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
+	// A throttling GPU will score lower here automatically — no quality multiplier.
+	CompositeScore float64 `json:"composite_score"`
 	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
 	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }

-// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
-// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
-// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
-// over-reporting its power consumption.
+// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
+// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
+// so benchmark and audit data can be correlated by slot.
+type BenchmarkPSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`  // AC wall input (PSUx_POWER_IN)
+	OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
+	Status  string   `json:"status,omitempty"`
+}
+
+// BenchmarkServerPower captures server-side power from multiple independent
+// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
+// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
+// covers only a subset of installed PSUs (partial coverage).
+//
+// Source legend:
+//   - DCMI      — `ipmitool dcmi power reading`; fast but may miss PSUs
+//   - SDR       — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
+//   - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
 type BenchmarkServerPower struct {
-	Available       bool     `json:"available"`
-	IdleW           float64  `json:"idle_w,omitempty"`
-	LoadedW         float64  `json:"loaded_w,omitempty"`
-	DeltaW          float64  `json:"delta_w,omitempty"`
-	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
-	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
-	Notes           []string `json:"notes,omitempty"`
+	Available         bool    `json:"available"`
+	Source            string  `json:"source,omitempty"`
+	Mode              string  `json:"mode,omitempty"`
+	Reason            string  `json:"reason,omitempty"`
+	SampleIntervalSec int     `json:"sample_interval_sec,omitempty"`
+	IdleW             float64 `json:"idle_w,omitempty"`   // DCMI at idle
+	LoadedW           float64 `json:"loaded_w,omitempty"` // DCMI at peak load
+	DeltaW            float64 `json:"delta_w,omitempty"`  // DCMI loaded − idle
+	GPUReportedSumW   float64 `json:"gpu_reported_sum_w,omitempty"`
+	ReportingRatio    float64 `json:"reporting_ratio,omitempty"`
+
+	// PSU AC input sum — sampled at idle and at peak load using collector's
+	// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
+	PSUInputIdleW   float64 `json:"psu_input_idle_w,omitempty"`
+	PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
+
+	// PSU DC output sum — power delivered to server internals after conversion.
+	PSUOutputIdleW   float64 `json:"psu_output_idle_w,omitempty"`
+	PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
+
+	// Per-slot PSU readings at idle and at peak load.
+	// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
+	PSUSlotReadingsIdle   map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
+	PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
+
+	// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
+	// PCIe slot delivery only (excludes 16-pin connector power).
+	GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
+
+	// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
+	// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
+	DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
+
+	Notes []string `json:"notes,omitempty"`
 }

 // BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
@@ -265,16 +445,35 @@ type NvidiaPowerBenchResult struct {
 	RecommendedSlotOrder []int                  `json:"recommended_slot_order,omitempty"`
 	RampSteps            []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
 	OverallStatus        string                 `json:"overall_status"`
-	Findings             []string               `json:"findings,omitempty"`
-	GPUs                 []NvidiaPowerBenchGPU  `json:"gpus"`
+	// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
+	// cumulative thermal ramp. Represents the actual sustained power budget of
+	// this server under full GPU load. Use for rack power planning.
+	PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
+	// ServerPower captures IPMI server power delta (idle→loaded) measured in
+	// parallel with the thermal ramp. Use to compare GPU-reported TDP against
+	// actual wall-power draw as seen by the server's power supply.
+	ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
+	Findings    []string              `json:"findings,omitempty"`
+	GPUs        []NvidiaPowerBenchGPU `json:"gpus"`
+	// PSUIssues holds power supply fault events detected by comparing IPMI PSU
+	// sensor states before and after the power benchmark run. Empty when IPMI is
+	// unavailable or no PSU faults occurred during the test.
+	PSUIssues []string `json:"psu_issues,omitempty"`
 }

 type NvidiaPowerBenchGPU struct {
-	Index               int      `json:"index"`
-	Name                string   `json:"name,omitempty"`
-	BusID               string   `json:"bus_id,omitempty"`
-	DefaultPowerLimitW  float64  `json:"default_power_limit_w,omitempty"`
-	AppliedPowerLimitW  float64  `json:"applied_power_limit_w,omitempty"`
+	Index              int     `json:"index"`
+	Name               string  `json:"name,omitempty"`
+	BusID              string  `json:"bus_id,omitempty"`
+	DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
+	// AppliedPowerLimitW is the stable limit found during single-card calibration.
+	AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
+	// StablePowerLimitW is the final fixed limit for this GPU after the
+	// cumulative thermal ramp. This is the limit at which the GPU operated
+	// stably with all other GPUs running simultaneously at their own limits.
+	// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
+	// additional derating.
+	StablePowerLimitW   float64  `json:"stable_power_limit_w,omitempty"`
 	MaxObservedPowerW   float64  `json:"max_observed_power_w,omitempty"`
 	MaxObservedTempC    float64  `json:"max_observed_temp_c,omitempty"`
 	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
@@ -283,16 +482,55 @@ type NvidiaPowerBenchGPU struct {
 	Notes               []string `json:"notes,omitempty"`
 	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
 	CoolingWarning string `json:"cooling_warning,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
+	// Telemetry holds the aggregated stats from the final converged calibration
+	// attempt for this GPU (temperature, power, fan, clock percentiles).
+	Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
+	// Fan state sampled at the end of single-card calibration.
+	AvgFanRPM          float64 `json:"avg_fan_rpm,omitempty"`
+	AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
 }

 type NvidiaPowerBenchStep struct {
-	StepIndex              int      `json:"step_index"`
-	GPUIndices             []int    `json:"gpu_indices"`
-	TotalObservedPowerW    float64  `json:"total_observed_power_w,omitempty"`
-	AvgObservedPowerW      float64  `json:"avg_observed_power_w,omitempty"`
-	MinPowerRealizationPct float64  `json:"min_power_realization_pct,omitempty"`
-	AvgPowerRealizationPct float64  `json:"avg_power_realization_pct,omitempty"`
-	DeratedGPUCount        int      `json:"derated_gpu_count,omitempty"`
-	Status                 string   `json:"status"`
-	Notes                  []string `json:"notes,omitempty"`
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// NewGPUIndex is the GPU whose stable limit was searched in this step.
+	NewGPUIndex int `json:"new_gpu_index"`
+	// NewGPUStableLimitW is the stable power limit found for the new GPU.
+	NewGPUStableLimitW  float64  `json:"new_gpu_stable_limit_w,omitempty"`
+	TotalObservedPowerW float64  `json:"total_observed_power_w,omitempty"`
+	AvgObservedPowerW   float64  `json:"avg_observed_power_w,omitempty"`
+	Derated             bool     `json:"derated,omitempty"`
+	Status              string   `json:"status"`
+	Notes               []string `json:"notes,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
+	// PSU slot readings sampled at end of this ramp step.
+	PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
+	// Fan state at end of this ramp step.
+	AvgFanRPM          float64 `json:"avg_fan_rpm,omitempty"`
+	AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
+	// Per-GPU telemetry from this step's calibration, keyed by GPU index.
+	PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
+}
+
+// NvidiaPerformanceRampStep holds per-step performance data for the
+// scalability ramp-up phase of the performance benchmark.
+type NvidiaPerformanceRampStep struct {
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
+	// TOPS from dedicated single-precision phases) across all GPUs in this step.
+	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
+	TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`
+	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
+	// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
+	ScalabilityPct float64  `json:"scalability_pct"`
+	Status         string   `json:"status"`
+	Notes          []string `json:"notes,omitempty"`
 }
--- a/audit/internal/platform/error_patterns.go
+++ b/audit/internal/platform/error_patterns.go
@@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{
 		Category: "gpu",
 		Severity: "warning",
 	},
+	// PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL.
+	// Warning, not critical: hardware-recovered. \b keeps "Uncorrectable" from matching.
+	{
+		Name:     "nvidia-aer-correctable",
+		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*\bcorrect`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
 	{
 		Name:     "nvidia-aer",
 		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
@@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{
 	},

 	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
+	// PCIe AER correctable from the root port — captures the reported device BDF (second
+	// BDF in "pcieport X: AER: Correctable error received: Y"); \b skips "Uncorrected".
+	{
+		Name:     "pcie-aer-correctable",
+		Re:       mustPat(`(?i)pcieport.*AER:.*\bcorrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
 	{
 		Name:     "pcie-aer",
 		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
 	FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
 	FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
 	FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool    `json:"fan_duty_cycle_estimated,omitempty"`
 }

 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
+	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
 	for _, r := range rows {
 		dutyAvail := 0
 		if r.FanDutyCycleAvailable {
 			dutyAvail = 1
 		}
-		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
-			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
+		dutyEstimated := 0
+		if r.FanDutyCycleEstimated {
+			dutyEstimated = 1
+		}
+		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
+			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
--- a/audit/internal/platform/install.go
+++ b/audit/internal/platform/install.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"path/filepath"
 	"strconv"
 	"strings"
 )
@@ -18,7 +19,7 @@ type InstallDisk struct {
 	MountedParts []string // partition mount points currently active
 }

-const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
+const squashfsGlob = "/run/live/medium/live/*.squashfs"

 // ListInstallDisks returns block devices suitable for installation.
 // Excludes the current live boot medium but includes USB drives.
@@ -176,11 +177,22 @@ func inferLiveBootKind(fsType, source, deviceType, transport string) string {
 // squashfs size × 1.5 to allow for extracted filesystem and bootloader.
 // Returns 0 if the squashfs is not available (non-live environment).
 func MinInstallBytes() int64 {
-	fi, err := os.Stat(squashfsPath)
-	if err != nil {
+	files, err := filepath.Glob(squashfsGlob)
+	if err != nil || len(files) == 0 {
 		return 0
 	}
-	return fi.Size() * 3 / 2
+	var total int64
+	for _, path := range files {
+		fi, statErr := os.Stat(path)
+		if statErr != nil {
+			continue
+		}
+		total += fi.Size()
+	}
+	if total == 0 {
+		return 0
+	}
+	return total * 3 / 2
 }

 // toramActive returns true when the live system was booted with toram.
@@ -222,12 +234,10 @@ func DiskWarnings(d InstallDisk) []string {
 			humanBytes(min), humanBytes(d.SizeBytes)))
 	}
 	if toramActive() {
-		sqFi, err := os.Stat(squashfsPath)
-		if err == nil {
-			free := freeMemBytes()
-			if free > 0 && free < sqFi.Size()*2 {
-				w = append(w, "toram mode — low RAM, extraction may be slow or fail")
-			}
+		free := freeMemBytes()
+		min := MinInstallBytes()
+		if free > 0 && min > 0 && free < (min*4/3) {
+			w = append(w, "toram mode — low RAM, extraction may be slow or fail")
 		}
 	}
 	return w
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -12,6 +12,23 @@ import (
 )

 const installToRAMDir = "/dev/shm/bee-live"
+const copyProgressLogStep int64 = 100 * 1024 * 1024
+
+var liveMediumSquashfsGlob = func() ([]string, error) {
+	return filepath.Glob("/run/live/medium/live/*.squashfs")
+}
+
+var runRemountMedium = func() ([]byte, error) {
+	return exec.Command("bee-remount-medium").CombinedOutput()
+}
+
+var umountLiveMedium = func() error {
+	return exec.Command("umount", "/run/live/medium").Run()
+}
+
+var ejectDevice = func(device string) error {
+	return exec.Command("eject", device).Run()
+}

 func (s *System) IsLiveMediaInRAM() bool {
 	return s.LiveMediaRAMState().InRAM
@@ -139,27 +156,56 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 		return nil
 	}

-	squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
-	if err != nil || len(squashfsFiles) == 0 {
-		return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
-	}
-
-	free := freeMemBytes()
-	var needed int64
-	for _, sf := range squashfsFiles {
-		fi, err2 := os.Stat(sf)
-		if err2 != nil {
-			return fmt.Errorf("stat %s: %v", sf, err2)
-		}
-		needed += fi.Size()
-	}
-	const headroom = 256 * 1024 * 1024
-	if free > 0 && needed+headroom > free {
-		return fmt.Errorf("insufficient RAM: need %s, available %s",
-			humanBytes(needed+headroom), humanBytes(free))
-	}
+	squashfsFiles, sourceAvailable := ensureLiveMediumAvailable(log)

 	dstDir := installToRAMDir
+
+	// If the source medium is unavailable, check whether a previous run already
+	// produced a complete copy in RAM. If so, skip the copy phase and proceed
+	// directly to the loop-rebind / bind-mount steps.
+	if !sourceAvailable {
+		copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
+		if len(copiedFiles) > 0 {
+			log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
+			// Proceed to rebind with the already-copied files.
+			for _, dst := range copiedFiles {
+				base := filepath.Base(dst)
+				// Re-associate the loop device that was originally backed by the
+				// source file (now gone); find it by the old source path pattern.
+				srcGuess := "/run/live/medium/live/" + base
+				loopDev, lerr := findLoopForFile(srcGuess)
+				if lerr != nil {
+					log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
+					continue
+				}
+				if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
+					log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
+				} else {
+					log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
+				}
+			}
+			goto bindMedium
+		}
+		return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry (or run bee-remount-medium as root)", dstDir)
+	}
+
+	{
+		free := freeMemBytes()
+		var needed int64
+		for _, sf := range squashfsFiles {
+			fi, err2 := os.Stat(sf)
+			if err2 != nil {
+				return fmt.Errorf("stat %s: %v", sf, err2)
+			}
+			needed += fi.Size()
+		}
+		const headroom = 256 * 1024 * 1024
+		if free > 0 && needed+headroom > free {
+			return fmt.Errorf("insufficient RAM: need %s, available %s",
+				humanBytes(needed+headroom), humanBytes(free))
+		}
+	}
+
 	if state.CopyPresent {
 		log("Removing stale partial RAM copy before retry...")
 	}
@@ -199,6 +245,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 		}
 	}

+bindMedium:
 	log("Copying remaining medium files...")
 	if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
 		log(fmt.Sprintf("Warning: partial copy: %v", err))
@@ -222,10 +269,83 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 	if status.InRAM {
 		log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
 	}
-	log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
+	detachInstallMedium(status, log)
+	log("Done. Squashfs files are in RAM. Installation media has been detached when possible.")
 	return nil
 }

+// tryRemountLiveMedium invokes the runRemountMedium hook and relays every
+// line of its trimmed combined output to log, prefixed with
+// "bee-remount-medium: ".
+//
+// The output is relayed on success and on failure alike, so whatever the
+// helper printed before failing still reaches the caller. The hook's error is
+// returned unchanged (nil on success). Nothing is relayed when log is nil or
+// the output is blank.
+func tryRemountLiveMedium(log func(string)) error {
+	output, err := runRemountMedium()
+	trimmed := strings.TrimSpace(string(output))
+	if trimmed != "" && log != nil {
+		for _, line := range strings.Split(trimmed, "\n") {
+			log("bee-remount-medium: " + line)
+		}
+	}
+	return err
+}
+
+// ensureLiveMediumAvailable returns the squashfs files globbed from the live
+// medium and whether any were found, retrying once after tryRemountLiveMedium.
+func ensureLiveMediumAvailable(log func(string)) ([]string, bool) {
+	say := func(msg string) {
+		if log != nil {
+			log(msg)
+		}
+	}
+	found, err := liveMediumSquashfsGlob()
+	if err == nil && len(found) > 0 {
+		return found, true
+	}
+	say("Live medium not mounted at /run/live/medium — attempting automatic remount scan...")
+	if remountErr := tryRemountLiveMedium(log); remountErr != nil {
+		say(fmt.Sprintf("Automatic remount did not restore the live medium: %v", remountErr))
+		return found, false
+	}
+	found, err = liveMediumSquashfsGlob()
+	restored := err == nil && len(found) > 0
+	if restored {
+		say("Live medium restored after remount scan.")
+	}
+	return found, restored
+}
+
+// detachInstallMedium unmounts /run/live/medium and then ejects the block
+// device named by status.Device (falling back to status.Source). Failures are
+// logged as warnings only; a nil log is replaced with a no-op.
+func detachInstallMedium(status LiveBootSource, log func(string)) {
+	if log == nil {
+		log = func(string) {}
+	}
+	log("Detaching original installation medium...")
+	if err := umountLiveMedium(); err != nil {
+		log(fmt.Sprintf("Warning: could not unmount /run/live/medium: %v", err))
+	} else {
+		log("Unmounted /run/live/medium.")
+	}
+	device := strings.TrimSpace(status.Device)
+	if device == "" {
+		device = strings.TrimSpace(status.Source)
+	}
+	if !strings.HasPrefix(device, "/dev/") {
+		log("No block device identified for eject; skipping media eject.")
+		return
+	}
+	if err := ejectDevice(device); err != nil {
+		log(fmt.Sprintf("Warning: could not eject %s: %v", device, err))
+		return
+	}
+	log(fmt.Sprintf("Ejected %s.", device))
+}
+
 func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
 	if status.InRAM {
 		return nil
@@ -288,6 +408,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
 	defer out.Close()
 	total := fi.Size()
 	var copied int64
+	var lastLogged int64
 	buf := make([]byte, 4*1024*1024)
 	for {
 		if err := ctx.Err(); err != nil {
@@ -299,7 +420,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
 				return werr
 			}
 			copied += int64(n)
-			if logFunc != nil && total > 0 {
+			if shouldLogCopyProgress(copied, total, lastLogged) {
+				lastLogged = copied
 				pct := int(float64(copied) / float64(total) * 100)
 				logFunc(fmt.Sprintf("  %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
 			}
@@ -314,6 +436,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
 	return out.Sync()
 }

+// shouldLogCopyProgress throttles copy progress logs to one per copyProgressLogStep bytes plus a final one.
+func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
+	switch {
+	case total <= 0 || copied <= 0:
+		return false
+	case copied >= total:
+		return copied > lastLogged
+	case copied < copyProgressLogStep:
+		return false
+	}
+	return copied-lastLogged >= copyProgressLogStep
+}
+
 func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
 	return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
 		if ctx.Err() != nil {
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -1,6 +1,9 @@
 package platform

-import "testing"
+import (
+	"fmt"
+	"testing"
+)

 func TestInferLiveBootKind(t *testing.T) {
 	t.Parallel()
@@ -101,3 +104,179 @@ func TestEvaluateLiveMediaRAMState(t *testing.T) {
 		}
 	})
 }
+
+func TestShouldLogCopyProgress(t *testing.T) {
+	t.Parallel()
+
+	total := int64(250 * 1024 * 1024)
+	step := int64(100 * 1024 * 1024)
+
+	if shouldLogCopyProgress(step-1, total, 0) {
+		t.Fatal("progress logged too early")
+	}
+	if !shouldLogCopyProgress(step, total, 0) {
+		t.Fatal("expected log at first 100MB boundary")
+	}
+	if shouldLogCopyProgress(step+16*1024*1024, total, step) {
+		t.Fatal("progress logged again before next 100MB")
+	}
+	if !shouldLogCopyProgress(2*step, total, step) {
+		t.Fatal("expected log at second 100MB boundary")
+	}
+	if !shouldLogCopyProgress(total, total, 2*step) {
+		t.Fatal("expected final completion log")
+	}
+}
+
+func TestTryRemountLiveMedium(t *testing.T) {
+	// Deliberately serial: swaps the package-level runRemountMedium hook.
+
+	orig := runRemountMedium
+	t.Cleanup(func() {
+		runRemountMedium = orig
+	})
+
+	t.Run("success", func(t *testing.T) {
+		runRemountMedium = func() ([]byte, error) {
+			return []byte("[10:57:31] Mounted /dev/sr1 on /run/live/medium\n"), nil
+		}
+		var logs []string
+		if err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) }); err != nil {
+			t.Fatalf("tryRemountLiveMedium() error = %v", err)
+		}
+		if len(logs) != 1 || logs[0] != "bee-remount-medium: [10:57:31] Mounted /dev/sr1 on /run/live/medium" {
+			t.Fatalf("logs=%v", logs)
+		}
+	})
+
+	t.Run("failure", func(t *testing.T) {
+		runRemountMedium = func() ([]byte, error) {
+			return []byte("must be run as root\n"), fmt.Errorf("exit status 1")
+		}
+		var logs []string
+		err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) })
+		if err == nil {
+			t.Fatal("expected error")
+		}
+		if len(logs) != 1 || logs[0] != "bee-remount-medium: must be run as root" {
+			t.Fatalf("logs=%v", logs)
+		}
+	})
+}
+
+func TestEnsureLiveMediumAvailableRemountsSource(t *testing.T) {
+	// Deliberately serial: swaps the package-level liveMediumSquashfsGlob and runRemountMedium hooks.
+
+	origGlob := liveMediumSquashfsGlob
+	origRemount := runRemountMedium
+	t.Cleanup(func() {
+		liveMediumSquashfsGlob = origGlob
+		runRemountMedium = origRemount
+	})
+
+	callCount := 0
+	liveMediumSquashfsGlob = func() ([]string, error) {
+		callCount++
+		if callCount == 1 {
+			return nil, nil
+		}
+		return []string{"/run/live/medium/live/filesystem.squashfs"}, nil
+	}
+	runRemountMedium = func() ([]byte, error) {
+		return []byte("Mounted /dev/sr1 on /run/live/medium\n"), nil
+	}
+
+	var logs []string
+	files, ok := ensureLiveMediumAvailable(func(msg string) { logs = append(logs, msg) })
+	if !ok {
+		t.Fatal("expected live medium to become available after remount")
+	}
+	if callCount < 2 {
+		t.Fatalf("liveMediumSquashfsGlob called %d times, want at least 2", callCount)
+	}
+	if len(files) != 1 || files[0] != "/run/live/medium/live/filesystem.squashfs" {
+		t.Fatalf("files=%v", files)
+	}
+	found := false
+	for _, msg := range logs {
+		if msg == "Live medium restored after remount scan." {
+			found = true
+			break
+		}
+	}
+	if !found {
+		t.Fatalf("expected remount success log, logs=%v", logs)
+	}
+}
+
+func TestDetachInstallMedium(t *testing.T) {
+	// Deliberately serial: swaps the package-level umountLiveMedium and ejectDevice hooks.
+
+	origUmount := umountLiveMedium
+	origEject := ejectDevice
+	t.Cleanup(func() {
+		umountLiveMedium = origUmount
+		ejectDevice = origEject
+	})
+
+	t.Run("success", func(t *testing.T) {
+		var umountCalled bool
+		var ejected string
+		umountLiveMedium = func() error {
+			umountCalled = true
+			return nil
+		}
+		ejectDevice = func(device string) error {
+			ejected = device
+			return nil
+		}
+		var logs []string
+		detachInstallMedium(LiveBootSource{Kind: "cdrom", Device: "/dev/sr1"}, func(msg string) { logs = append(logs, msg) })
+		if !umountCalled {
+			t.Fatal("expected umountLiveMedium to be called")
+		}
+		if ejected != "/dev/sr1" {
+			t.Fatalf("ejected=%q want /dev/sr1", ejected)
+		}
+		if len(logs) < 3 {
+			t.Fatalf("logs=%v", logs)
+		}
+	})
+
+	t.Run("no device", func(t *testing.T) {
+		umountLiveMedium = func() error { return nil }
+		ejectDevice = func(device string) error {
+			t.Fatalf("unexpected eject for %q", device)
+			return nil
+		}
+		var logs []string
+		detachInstallMedium(LiveBootSource{Kind: "ram", Source: "tmpfs"}, func(msg string) { logs = append(logs, msg) })
+		found := false
+		for _, msg := range logs {
+			if msg == "No block device identified for eject; skipping media eject." {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Fatalf("logs=%v", logs)
+		}
+	})
+
+	t.Run("eject failure is warning only", func(t *testing.T) {
+		umountLiveMedium = func() error { return nil }
+		ejectDevice = func(device string) error { return fmt.Errorf("exit status 1") }
+		var logs []string
+		detachInstallMedium(LiveBootSource{Kind: "usb", Device: "/dev/sdb1"}, func(msg string) { logs = append(logs, msg) })
+		found := false
+		for _, msg := range logs {
+			if msg == "Warning: could not eject /dev/sdb1: exit status 1" {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Fatalf("logs=%v", logs)
+		}
+	})
+}
--- a/audit/internal/platform/kill_workers.go
+++ b/audit/internal/platform/kill_workers.go
@@ -1,11 +1,14 @@
 package platform

 import (
+	"context"
 	"fmt"
+	"log/slog"
 	"os"
 	"strconv"
 	"strings"
 	"syscall"
+	"time"
 )

 // workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
@@ -15,6 +18,7 @@ var workerPatterns = []string{
 	"stress-ng",
 	"stressapptest",
 	"memtester",
+	"nvbandwidth",
 	// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
 	// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
 	"nvvs",
@@ -30,7 +34,12 @@ type KilledProcess struct {
 // KillTestWorkers scans /proc for running test worker processes and sends
 // SIGKILL to each one found. It returns a list of killed processes.
 // Errors for individual processes (e.g. already exited) are silently ignored.
+// The scan runs under a 5-second deadline to avoid blocking if the process
+// table is very large (e.g. after a stress test with thousands of children).
 func KillTestWorkers() []KilledProcess {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
 	entries, err := os.ReadDir("/proc")
 	if err != nil {
 		return nil
@@ -38,6 +47,13 @@ func KillTestWorkers() []KilledProcess {

 	var killed []KilledProcess
 	for _, e := range entries {
+		select {
+		case <-ctx.Done():
+			slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
+			return killed
+		default:
+		}
+
 		if !e.IsDir() {
 			continue
 		}
@@ -56,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
 		if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
 			base = exe[idx+1:]
 		}
-		for _, pat := range workerPatterns {
-			if strings.Contains(base, pat) || strings.Contains(exe, pat) {
-				_ = syscall.Kill(pid, syscall.SIGKILL)
-				killed = append(killed, KilledProcess{PID: pid, Name: base})
-				break
-			}
+		if shouldKillWorkerProcess(exe, base) {
+			_ = syscall.Kill(pid, syscall.SIGKILL)
+			killed = append(killed, KilledProcess{PID: pid, Name: base})
 		}
 	}
 	return killed
 }
+
+func shouldKillWorkerProcess(exe, base string) bool {
+	for _, pat := range workerPatterns {
+		if strings.Contains(base, pat) || strings.Contains(exe, pat) {
+			return true
+		}
+	}
+	return false
+}
--- a/audit/internal/platform/kill_workers_test.go
+++ b/audit/internal/platform/kill_workers_test.go
@@ -0,0 +1,39 @@
+package platform
+
+import "testing"
+
+func TestShouldKillWorkerProcess(t *testing.T) {
+	tests := []struct {
+		name string
+		exe  string
+		base string
+		want bool
+	}{
+		{
+			name: "nvbandwidth executable",
+			exe:  "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
+			base: "nvbandwidth",
+			want: true,
+		},
+		{
+			name: "dcgmi executable",
+			exe:  "/usr/bin/dcgmi",
+			base: "dcgmi",
+			want: true,
+		},
+		{
+			name: "unrelated process",
+			exe:  "/usr/bin/bash",
+			base: "bash",
+			want: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
+				t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
+			}
+		})
+	}
+}
--- a/audit/internal/platform/live_metrics.go
+++ b/audit/internal/platform/live_metrics.go
@@ -1,8 +1,10 @@
 package platform

 import (
+	"bee/audit/internal/collector"
 	"bufio"
 	"encoding/json"
+	"fmt"
 	"os"
 	"os/exec"
 	"sort"
@@ -14,13 +16,24 @@ import (
 // LiveMetricSample is a single point-in-time snapshot of server metrics
 // collected for the web UI metrics page.
 type LiveMetricSample struct {
-	Timestamp  time.Time      `json:"ts"`
-	Fans       []FanReading   `json:"fans"`
-	Temps      []TempReading  `json:"temps"`
-	PowerW     float64        `json:"power_w"`
-	CPULoadPct float64        `json:"cpu_load_pct"`
-	MemLoadPct float64        `json:"mem_load_pct"`
-	GPUs       []GPUMetricRow `json:"gpus"`
+	Timestamp   time.Time      `json:"ts"`
+	Fans        []FanReading   `json:"fans"`
+	Temps       []TempReading  `json:"temps"`
+	PowerW      float64        `json:"power_w"`
+	PowerSource string         `json:"power_source,omitempty"`
+	PowerMode   string         `json:"power_mode,omitempty"`
+	PowerReason string         `json:"power_reason,omitempty"`
+	PSUs        []PSUReading   `json:"psus,omitempty"`
+	CPULoadPct  float64        `json:"cpu_load_pct"`
+	MemLoadPct  float64        `json:"mem_load_pct"`
+	GPUs        []GPUMetricRow `json:"gpus"`
+}
+
+// PSUReading is a per-slot power supply input power reading.
+type PSUReading struct {
+	Slot   int     `json:"slot"`
+	Name   string  `json:"name"`
+	PowerW float64 `json:"power_w"`
 }

 // TempReading is a named temperature sensor value.
@@ -54,8 +67,17 @@ func SampleLiveMetrics() LiveMetricSample {
 		}
 	}

-	// System power — returns 0 if unavailable
-	s.PowerW = sampleSystemPower()
+	// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
+	s.PSUs = samplePSUPower()
+
+	// System power: use the global autotune-selected source when configured,
+	// otherwise fall back to the historical heuristic and mark the mode.
+	if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
+		s.PowerW = powerW
+		s.PowerSource = decision.EffectiveSource
+		s.PowerMode = decision.Mode
+		s.PowerReason = decision.Reason
+	}

 	// CPU load — from /proc/stat
 	s.CPULoadPct = sampleCPULoadPct()
@@ -326,3 +348,46 @@ func compactAmbientTempName(chip, name string) string {
 	}
 	return chip + " / " + name
 }
+
+// samplePSUPower reads per-PSU input power via IPMI SDR.
+// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
+// vendors where PSU sensors may not carry entity ID "10.N".
+// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
+func samplePSUPower() []PSUReading {
+	out, err := exec.Command("ipmitool", "sdr").Output()
+	if err != nil || len(out) == 0 {
+		return nil
+	}
+	slots := collector.PSUSlotsFromSDR(string(out))
+	if len(slots) == 0 {
+		return nil
+	}
+	// Collect slot keys and sort for stable output.
+	keys := make([]int, 0, len(slots))
+	for k := range slots {
+		n, err := strconv.Atoi(k)
+		if err == nil {
+			keys = append(keys, n)
+		}
+	}
+	sort.Ints(keys)
+	psus := make([]PSUReading, 0, len(keys))
+	for _, k := range keys {
+		entry := slots[strconv.Itoa(k)]
+		// Prefer AC input power; fall back to DC output power.
+		var w float64
+		if entry.InputW != nil && *entry.InputW > 0 {
+			w = *entry.InputW
+		} else if entry.OutputW != nil && *entry.OutputW > 0 {
+			w = *entry.OutputW
+		}
+		if w <= 0 {
+			continue
+		}
+		psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
+	}
+	if len(psus) == 0 {
+		return nil
+	}
+	return psus
+}
--- a/audit/internal/platform/network.go
+++ b/audit/internal/platform/network.go
@@ -258,7 +258,7 @@ func (s *System) GetInterfaceState(iface string) (bool, error) {
 func interfaceAdminState(iface string) (bool, error) {
 	raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
 	if err != nil {
-		return false, err
+		return false, fmt.Errorf("ip link show dev %s: %w", iface, err)
 	}
 	return parseInterfaceAdminState(string(raw))
 }
@@ -288,7 +288,7 @@ func interfaceIPv4Addrs(iface string) ([]string, error) {
 		if errors.As(err, &exitErr) {
 			return nil, nil
 		}
-		return nil, err
+		return nil, fmt.Errorf("ip addr show dev %s: %w", iface, err)
 	}
 	var ipv4 []string
 	for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
--- a/audit/internal/platform/nvidia_recover.go
+++ b/audit/internal/platform/nvidia_recover.go
@@ -0,0 +1,51 @@
+package platform
+
+import (
+	"fmt"
+	"os/exec"
+	"strconv"
+	"strings"
+	"time"
+)
+
+const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
+
+// runNvidiaRecover runs the bee-nvidia-recover helper via sudo with the given
+// arguments and returns the combined stdout/stderr along with the command
+// error.
+//
+// When systemd-run is found on PATH the helper is wrapped in
+// "systemd-run --wait --pipe --service-type=oneshot" under a unit name made
+// unique by the current UnixNano timestamp; otherwise it is run directly.
+func runNvidiaRecover(args ...string) (string, error) {
+	cmdArgs := append([]string{nvidiaRecoverHelper}, args...)
+	if _, err := exec.LookPath("systemd-run"); err == nil {
+		unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
+		wrapper := []string{
+			"systemd-run", "--quiet", "--pipe", "--wait", "--collect",
+			"--service-type=oneshot", "--unit", unit,
+		}
+		cmdArgs = append(wrapper, cmdArgs...)
+	}
+	raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
+	return string(raw), err
+}
+
+func resetNvidiaGPU(index int) (string, error) {
+	if index < 0 {
+		return "", fmt.Errorf("gpu index must be >= 0")
+	}
+	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
+	if strings.TrimSpace(out) == "" && err == nil {
+		out = "GPU reset completed.\n"
+	}
+	return out, err
+}
+
+func restartNvidiaDrivers() (string, error) {
+	out, err := runNvidiaRecover("restart-drivers")
+	if strings.TrimSpace(out) == "" && err == nil {
+		out = "NVIDIA drivers restarted.\n"
+	}
+	return out, err
+}
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
 	"bee-audit",
 	"bee-web",
 	"bee-sshsetup",
+	"nvidia-dcgm",
+	"nvidia-fabricmanager",
 }

 func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
@@ -53,7 +55,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
 	if err == nil {
 		health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
 		hasIPv4 := false
-		missingIPv4 := false
 		for _, iface := range interfaces {
 			outcome := "no_offer"
 			if len(iface.IPv4) > 0 {
@@ -61,8 +62,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
 				hasIPv4 = true
 			} else if strings.EqualFold(iface.State, "DOWN") {
 				outcome = "link_down"
-			} else {
-				missingIPv4 = true
 			}
 			health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
 				Name:    iface.Name,
@@ -71,17 +70,9 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
 				Outcome: outcome,
 			})
 		}
-		switch {
-		case hasIPv4 && !missingIPv4:
+		if hasIPv4 {
 			health.NetworkStatus = "OK"
-		case hasIPv4:
-			health.NetworkStatus = "PARTIAL"
-			health.Issues = append(health.Issues, schema.RuntimeIssue{
-				Code:        "dhcp_partial",
-				Severity:    "warning",
-				Description: "At least one interface did not obtain IPv4 connectivity.",
-			})
-		default:
+		} else {
 			health.NetworkStatus = "FAILED"
 			health.Issues = append(health.Issues, schema.RuntimeIssue{
 				Code:        "dhcp_failed",
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -20,6 +20,54 @@ import (
 	"time"
 )

+// Estimated wall-clock durations for each SAT/validate test, derived from real
+// production logs in _benchmark/_v8/.
+//
+// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
+// the corresponding Run*Pack function change, re-measure the wall-clock duration
+// from actual task logs and update the matching constant here.
+//
+// Sources:
+//   - SATEstimatedCPUValidateSec:                 xFusion v8.6 — 62 s
+//   - SATEstimatedMemoryValidateSec:               xFusion v8.6 — 68 s
+//   - SATEstimatedNvidiaGPUValidateSec:            xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaGPUStressSec:              xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaTargetedStressSec:         xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaTargetedPowerSec:          MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaPulseTestSec:              xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
+//   - SATEstimatedNvidiaInterconnectSec:           xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
+//   - SATEstimatedNvidiaBandwidthSec:              xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
+const (
+	// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
+	SATEstimatedCPUValidateSec = 65
+	// CPU stress: stress-ng 1800 s (stress mode default).
+	SATEstimatedCPUStressSec = 1800
+
+	// RAM: memtester 256 MB / 1 pass.
+	SATEstimatedMemoryValidateSec = 70
+	// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
+	SATEstimatedMemoryStressSec = 140
+
+	// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUValidateSec = 85
+	// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUStressSec = 450
+
+	// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedStressSec = 350
+	// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedPowerSec = 350
+
+	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
+	SATEstimatedNvidiaPulseTestSec = 5000
+
+	// NCCL all_reduce_perf, all GPUs simultaneously.
+	SATEstimatedNvidiaInterconnectSec = 300
+	// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
+	// without a user-configurable time limit; duration is determined by nvbandwidth itself.
+	SATEstimatedNvidiaBandwidthSec = 2700
+)
+
 var (
 	satExecCommand  = exec.Command
 	satLookPath     = exec.LookPath
@@ -356,22 +404,17 @@ func normalizeNvidiaBusID(v string) string {
 }

 func (s *System) ResetNvidiaGPU(index int) (string, error) {
-	if index < 0 {
-		return "", fmt.Errorf("gpu index must be >= 0")
-	}
-	raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
-	if len(raw) == 0 && err == nil {
-		raw = []byte("GPU reset completed.\n")
-	}
-	return string(raw), err
+	return resetNvidiaGPU(index)
 }

-// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
+// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
-func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-	// detect GPU count
-	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
-	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	gpuCount := len(selected)
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
@@ -380,7 +423,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
-		}},
+		}, env: nvidiaVisibleDevicesEnv(selected)},
 	), logFunc)
 }

@@ -393,11 +436,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
 		profCmd []string
 		profEnv []string
 	)
-	if staggerSec > 0 && len(selected) > 1 {
+	if len(selected) > 1 {
+		// For multiple GPUs, always spawn one dcgmproftester process per GPU via
+		// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
+		// A single dcgmproftester process without -i only loads GPU 0 regardless
+		// of CUDA_VISIBLE_DEVICES.
+		stagger := staggerSec
+		if stagger < 0 {
+			stagger = 0
+		}
 		profCmd = []string{
 			"bee-dcgmproftester-staggered",
 			"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
-			"--stagger-seconds", strconv.Itoa(staggerSec),
+			"--stagger-seconds", strconv.Itoa(stagger),
 			"--devices", joinIndexList(selected),
 		}
 	} else {
@@ -426,6 +477,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -443,6 +501,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -460,6 +525,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -552,10 +624,16 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
 	if passes <= 0 {
 		passes = 1
 	}
-	// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
-	// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
-	// controller can cause memtester to spin forever on a single subtest.
-	timeoutSec := sizeMB*passes*150/100 + 120
+	// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
+	// intentionally conservative enough for healthy systems while avoiding the
+	// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
+	timeoutSec := sizeMB*passes*20/100 + 60
+	if timeoutSec < 180 {
+		timeoutSec = 180
+	}
+	if timeoutSec > 900 {
+		timeoutSec = 900
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"math"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -42,27 +43,56 @@ type GPUStressMetric struct {

 // FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
 type FanStressRow struct {
-	TimestampUTC string
-	ElapsedSec   float64
-	Phase        string // "baseline", "load1", "pause", "load2", "cooldown"
-	GPUs         []GPUStressMetric
-	Fans         []FanReading
-	CPUMaxTempC  float64 // highest CPU temperature from ipmitool / sensors
-	SysPowerW    float64 // DCMI system power reading
+	TimestampUTC   string
+	ElapsedSec     float64
+	Phase          string // "baseline", "load1", "pause", "load2", "cooldown"
+	GPUs           []GPUStressMetric
+	Fans           []FanReading
+	CPUMaxTempC    float64 // highest CPU temperature from ipmitool / sensors
+	SysPowerW      float64
+	SysPowerSource string
+	SysPowerMode   string
 }

 type cachedPowerReading struct {
 	Value     float64
+	Source    string
+	Mode      string
+	Reason    string
 	UpdatedAt time.Time
 }

+// fanObservationState holds the highest RPM recorded for each fan, keyed by
+// fan name. It is the value serialized as JSON to fanObservationStatePath.
+type fanObservationState struct {
+	MaxRPM map[string]float64 `json:"max_rpm"`
+}
+
+// fanPeakCandidate is a pending above-maximum reading for one fan. FirstSeen
+// is the time the fan first exceeded its recorded maximum and RPM is the
+// highest value seen since; updateFanObservation promotes it only after it
+// has been held for fanObservationMinPeakHold.
+type fanPeakCandidate struct {
+	FirstSeen time.Time
+	RPM       float64
+}
+
 var (
 	systemPowerCacheMu sync.Mutex
 	systemPowerCache   cachedPowerReading
+	fanObservationMu   sync.Mutex
+	fanObservation     fanObservationState
+	fanObservationInit bool
+	fanPeakCandidates  = make(map[string]fanPeakCandidate)
 )

 const systemPowerHoldTTL = 15 * time.Second

+var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
+
+const fanObservationMinPeakHold = time.Second
+
+// normalizeObservedFanMaxRPM rounds a positive RPM value up to the next
+// multiple of 1000. Zero and negative inputs yield 0.
+func normalizeObservedFanMaxRPM(rpm float64) float64 {
+	const bucket = 1000.0
+	if rpm <= 0 {
+		return 0
+	}
+	buckets := math.Ceil(rpm / bucket)
+	return buckets * bucket
+}
+
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -253,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
 	row.GPUs = sampleGPUStressMetrics(gpuIndices)
 	row.Fans, _ = sampleFanSpeeds()
 	row.CPUMaxTempC = sampleCPUMaxTemp()
-	row.SysPowerW = sampleSystemPower()
+	row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
 	return row
 }

@@ -310,11 +340,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
 	if err == nil {
 		if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
+			updateFanObservation(fans, time.Now())
 			return fans, nil
 		}
 	}
 	fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
 	if len(fans) > 0 {
+		updateFanObservation(fans, time.Now())
 		return fans, nil
 	}
 	if err != nil {
@@ -323,6 +355,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	return nil, sensorsErr
 }

+// loadFanObservationLocked initialises fanObservation once, seeding it from
+// the JSON file at fanObservationStatePath when that file is readable and
+// valid. Entries with a blank name or non-positive RPM are dropped. The
+// "Locked" suffix reflects that callers in this file hold fanObservationMu.
+func loadFanObservationLocked() {
+	if fanObservationInit {
+		return
+	}
+	fanObservation.MaxRPM = map[string]float64{}
+	fanObservationInit = true
+
+	data, readErr := os.ReadFile(fanObservationStatePath)
+	if readErr != nil || len(data) == 0 {
+		return
+	}
+	var onDisk fanObservationState
+	if decodeErr := json.Unmarshal(data, &onDisk); decodeErr != nil {
+		return
+	}
+	for rawName, maxRPM := range onDisk.MaxRPM {
+		key := strings.TrimSpace(rawName)
+		if key == "" || maxRPM <= 0 {
+			continue
+		}
+		fanObservation.MaxRPM[key] = maxRPM
+	}
+}
+
+// saveFanObservationLocked writes the observed per-fan maxima to
+// fanObservationStatePath as indented JSON. It is best-effort: an empty state
+// is never written and every failure is silently dropped.
+func saveFanObservationLocked() {
+	if len(fanObservation.MaxRPM) == 0 {
+		return
+	}
+	stateDir := filepath.Dir(fanObservationStatePath)
+	switch stateDir {
+	case "", ".":
+		stateDir = "/var/log/bee-sat"
+	}
+	if mkErr := os.MkdirAll(stateDir, 0755); mkErr != nil {
+		return
+	}
+	encoded, encErr := json.MarshalIndent(fanObservation, "", "  ")
+	if encErr != nil {
+		return
+	}
+	// Write errors are ignored on purpose: losing the file only loses the
+	// remembered maxima.
+	_ = os.WriteFile(fanObservationStatePath, encoded, 0644)
+}
+
+// updateFanObservation folds a batch of fan readings into the per-fan observed
+// maximum RPM. A reading above the recorded maximum is not trusted right away:
+// it becomes a candidate and is promoted only when a later above-maximum
+// sample arrives at least fanObservationMinPeakHold after the first one, so a
+// single-sample spike never raises the maximum. Promoted values are rounded up
+// by normalizeObservedFanMaxRPM and the state is persisted when anything
+// changed. now is the sample time; readings with a blank name or non-positive
+// RPM are skipped.
+func updateFanObservation(fans []FanReading, now time.Time) {
+	if len(fans) == 0 {
+		return
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+	changed := false
+	for _, fan := range fans {
+		name := strings.TrimSpace(fan.Name)
+		if name == "" || fan.RPM <= 0 {
+			continue
+		}
+		currentMax := fanObservation.MaxRPM[name]
+		if fan.RPM <= currentMax {
+			// Back at or below the known maximum: any pending peak was transient.
+			delete(fanPeakCandidates, name)
+			continue
+		}
+		if cand, ok := fanPeakCandidates[name]; ok {
+			if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
+				// The fan stayed above the maximum long enough; promote the
+				// highest value seen during the hold window.
+				newMax := math.Max(cand.RPM, fan.RPM)
+				if newMax > currentMax {
+					fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
+					changed = true
+				}
+				delete(fanPeakCandidates, name)
+				continue
+			}
+			// Still inside the hold window: keep FirstSeen, track the peak.
+			if fan.RPM > cand.RPM {
+				fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
+			}
+			continue
+		}
+		// First above-maximum sample for this fan: start a new candidate.
+		fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
+	}
+	if changed {
+		saveFanObservationLocked()
+	}
+}
+
+// estimateFanDutyCyclePctFromObservation approximates the fan duty cycle as
+// the mean of each fan's current RPM relative to its observed maximum,
+// expressed as a percentage clamped to [0, 100]. Fans with a blank name, a
+// non-positive RPM, or no recorded maximum are ignored. The boolean is false
+// when no fan contributed a sample.
+func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
+	if len(fans) == 0 {
+		return 0, false
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+
+	var pcts []float64
+	for i := range fans {
+		key := strings.TrimSpace(fans[i].Name)
+		if key == "" || fans[i].RPM <= 0 {
+			continue
+		}
+		observedMax := fanObservation.MaxRPM[key]
+		if observedMax <= 0 {
+			continue
+		}
+		ratio := fans[i].RPM / observedMax * 100.0
+		switch {
+		case ratio > 100:
+			ratio = 100
+		case ratio < 0:
+			ratio = 0
+		}
+		pcts = append(pcts, ratio)
+	}
+	if len(pcts) == 0 {
+		return 0, false
+	}
+	return benchmarkMean(pcts), true
+}
+
 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
 // Handles two formats:
 //
@@ -428,12 +573,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {

 // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
 // Returns the average duty cycle across all exposed PWM controls.
-func sampleFanDutyCyclePct() (float64, bool) {
+func sampleFanDutyCyclePct() (float64, bool, bool) {
 	out, err := exec.Command("sensors", "-j").Output()
 	if err != nil || len(out) == 0 {
-		return 0, false
+		fans, fanErr := sampleFanSpeeds()
+		if fanErr != nil {
+			return 0, false, false
+		}
+		return sampleFanDutyCyclePctFromFans(fans)
 	}
-	return parseFanDutyCyclePctSensorsJSON(out)
+	pct, ok := parseFanDutyCyclePctSensorsJSON(out)
+	return pct, ok, false
+}
+
+// sampleFanDutyCyclePctFromFans derives a duty-cycle percentage from RPM
+// readings using the observed per-fan maxima. It returns (pct, true, true)
+// when an estimate is available and (0, false, false) otherwise.
+// NOTE(review): the third result presumably flags the value as estimated
+// rather than read from a PWM control; confirm against callers.
+func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
+	if len(fans) == 0 {
+		return 0, false, false
+	}
+	pct, ok := estimateFanDutyCyclePctFromObservation(fans)
+	if !ok {
+		return 0, false, false
+	}
+	return pct, true, true
+}

 func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
@@ -608,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
 	return max
 }

-// sampleSystemPower reads system power draw via DCMI.
-func sampleSystemPower() float64 {
+// sampleSystemPowerResolved reads system power via the global autotune source,
+// falling back to the historical heuristic before autotune or when degraded.
+func sampleSystemPowerResolved() (float64, string, string) {
 	now := time.Now()
-	current := 0.0
-	out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
-	if err == nil {
-		current = parseDCMIPowerReading(string(out))
-	}
+	current, decision, err := SampleSystemPowerResolved("")
 	systemPowerCacheMu.Lock()
 	defer systemPowerCacheMu.Unlock()
-	value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
+	if err != nil {
+		current = 0
+	}
+	value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
 	systemPowerCache = updated
-	return value
+	return value, updated.Source, updated.Mode
 }

 // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -643,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
 	return 0
 }

-func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
+func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
 	if current > 0 {
-		cache = cachedPowerReading{Value: current, UpdatedAt: now}
+		cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
 		return current, cache
 	}
 	if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -1,6 +1,7 @@
 package platform

 import (
+	"path/filepath"
 	"testing"
 	"time"
 )
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
 	}
 }

+// TestEstimateFanDutyCyclePctFromObservation checks that a single-sample spike
+// does not establish an observed maximum, that a held peak does (rounded up to
+// the next 1000 RPM), and that the maximum is reloaded from the state file
+// after the in-memory state is reset.
+func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
+	// Deliberately not parallel: the test swaps package-level fan observation
+	// state, which would race with any other test running concurrently.
+
+	oldPath := fanObservationStatePath
+	oldState := fanObservation
+	oldInit := fanObservationInit
+	oldCandidates := fanPeakCandidates
+	fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	t.Cleanup(func() {
+		fanObservationStatePath = oldPath
+		fanObservation = oldState
+		fanObservationInit = oldInit
+		fanPeakCandidates = oldCandidates
+	})
+
+	start := time.Unix(100, 0)
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
+	if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
+		t.Fatalf("single-sample spike should not establish observed max")
+	}
+
+	// The peak of 5200 is held past fanObservationMinPeakHold, so the recorded
+	// maximum becomes 6000 after rounding and 2600 RPM maps to ~43.3%.
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
+
+	got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected estimated duty cycle from persisted observed max")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("got=%v want ~43.3", got)
+	}
+
+	// Drop the in-memory state to force a reload from the state file.
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected persisted observed max to be reloaded from disk")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("reloaded got=%v want ~43.3", got)
+	}
+}
+
 func TestParseDCMIPowerReading(t *testing.T) {
 	raw := `
 Instantaneous power reading:                   512 Watts
@@ -64,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 	now := time.Now()
 	cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}

-	got, updated := effectiveSystemPowerReading(cache, 0, now)
+	got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
 	if got != 480 {
 		t.Fatalf("got=%v want cached 480", got)
 	}
@@ -72,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 		t.Fatalf("updated=%+v", updated)
 	}

-	got, updated = effectiveSystemPowerReading(cache, 530, now)
+	got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
 	if got != 530 {
 		t.Fatalf("got=%v want 530", got)
 	}
@@ -81,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 	}

 	expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
-	got, _ = effectiveSystemPowerReading(expired, 0, now)
+	got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
 	if got != 0 {
 		t.Fatalf("expired cache returned %v want 0", got)
 	}
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
 	}
 }

+// TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth pins the exact
+// argv built for the "nvbandwidth" diagnostic with a zero duration and GPUs
+// 2 and 0 selected.
+func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
+	want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
+	cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
+	if gotLen, wantLen := len(cmd), len(want); gotLen != wantLen {
+		t.Fatalf("cmd len=%d want %d (%v)", gotLen, wantLen, cmd)
+	}
+	for i, expected := range want {
+		if actual := cmd[i]; actual != expected {
+			t.Fatalf("cmd[%d]=%q want %q", i, actual, expected)
+		}
+	}
+}
+
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
 	if len(env) != 2 {
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
 }

 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
+	if name == "bee-nvidia" && action == ServiceRestart {
+		return restartNvidiaDrivers()
+	}
 	// bee-web runs as the bee user; sudo is required to control system services.
 	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
 	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
--- a/audit/internal/schema/hardware.go
+++ b/audit/internal/schema/hardware.go
@@ -2,6 +2,8 @@
 // core/internal/ingest/parser_hardware.go. No import dependency on core.
 package schema

+import "encoding/json"
+
 // HardwareIngestRequest is the top-level output document produced by `bee audit`.
 // It is accepted as-is by the core /api/ingest/hardware endpoint.
 type HardwareIngestRequest struct {
@@ -64,8 +66,10 @@ type HardwareSnapshot struct {
 	Storage       []HardwareStorage        `json:"storage,omitempty"`
 	PCIeDevices   []HardwarePCIeDevice     `json:"pcie_devices,omitempty"`
 	PowerSupplies []HardwarePowerSupply    `json:"power_supplies,omitempty"`
-	Sensors       *HardwareSensors         `json:"sensors,omitempty"`
-	EventLogs     []HardwareEventLog       `json:"event_logs,omitempty"`
+	Sensors        *HardwareSensors         `json:"sensors,omitempty"`
+	EventLogs      []HardwareEventLog       `json:"event_logs,omitempty"`
+	PlatformConfig *json.RawMessage         `json:"platform_config,omitempty"`
+	VROCLicense    *string                  `json:"vroc_license,omitempty"`
 }

 type HardwareHealthSummary struct {
@@ -122,7 +126,7 @@ type HardwareCPU struct {
 type HardwareMemory struct {
 	HardwareComponentStatus
 	Slot                       *string  `json:"slot,omitempty"`
-	Location                   *string  `json:"location,omitempty"`
+	Location                   *string  `json:"-"` // internal: used for DIMM telemetry matching only
 	Present                    *bool    `json:"present,omitempty"`
 	SizeMB                     *int     `json:"size_mb,omitempty"`
 	Type                       *string  `json:"type,omitempty"`
@@ -143,30 +147,33 @@ type HardwareMemory struct {

 type HardwareStorage struct {
 	HardwareComponentStatus
-	Slot                  *string        `json:"slot,omitempty"`
-	Type                  *string        `json:"type,omitempty"`
-	Model                 *string        `json:"model,omitempty"`
-	SizeGB                *int           `json:"size_gb,omitempty"`
-	SerialNumber          *string        `json:"serial_number,omitempty"`
-	Manufacturer          *string        `json:"manufacturer,omitempty"`
-	Firmware              *string        `json:"firmware,omitempty"`
-	Interface             *string        `json:"interface,omitempty"`
-	Present               *bool          `json:"present,omitempty"`
-	TemperatureC          *float64       `json:"temperature_c,omitempty"`
-	PowerOnHours          *int64         `json:"power_on_hours,omitempty"`
-	PowerCycles           *int64         `json:"power_cycles,omitempty"`
-	UnsafeShutdowns       *int64         `json:"unsafe_shutdowns,omitempty"`
-	MediaErrors           *int64         `json:"media_errors,omitempty"`
-	ErrorLogEntries       *int64         `json:"error_log_entries,omitempty"`
-	WrittenBytes          *int64         `json:"written_bytes,omitempty"`
-	ReadBytes             *int64         `json:"read_bytes,omitempty"`
-	LifeUsedPct           *float64       `json:"life_used_pct,omitempty"`
-	LifeRemainingPct      *float64       `json:"life_remaining_pct,omitempty"`
-	AvailableSparePct     *float64       `json:"available_spare_pct,omitempty"`
-	ReallocatedSectors    *int64         `json:"reallocated_sectors,omitempty"`
-	CurrentPendingSectors *int64         `json:"current_pending_sectors,omitempty"`
-	OfflineUncorrectable  *int64         `json:"offline_uncorrectable,omitempty"`
-	Telemetry             map[string]any `json:"-"`
+	Slot                   *string        `json:"slot,omitempty"`
+	Type                   *string        `json:"type,omitempty"`
+	Model                  *string        `json:"model,omitempty"`
+	SizeGB                 *int           `json:"size_gb,omitempty"`
+	LogicalBlockSizeBytes  *int64         `json:"logical_block_size_bytes,omitempty"`
+	PhysicalBlockSizeBytes *int64         `json:"physical_block_size_bytes,omitempty"`
+	MetadataBytesPerBlock  *int64         `json:"metadata_bytes_per_block,omitempty"`
+	SerialNumber           *string        `json:"serial_number,omitempty"`
+	Manufacturer           *string        `json:"manufacturer,omitempty"`
+	Firmware               *string        `json:"firmware,omitempty"`
+	Interface              *string        `json:"interface,omitempty"`
+	Present                *bool          `json:"present,omitempty"`
+	TemperatureC           *float64       `json:"temperature_c,omitempty"`
+	PowerOnHours           *int64         `json:"power_on_hours,omitempty"`
+	PowerCycles            *int64         `json:"power_cycles,omitempty"`
+	UnsafeShutdowns        *int64         `json:"unsafe_shutdowns,omitempty"`
+	MediaErrors            *int64         `json:"media_errors,omitempty"`
+	ErrorLogEntries        *int64         `json:"error_log_entries,omitempty"`
+	WrittenBytes           *int64         `json:"written_bytes,omitempty"`
+	ReadBytes              *int64         `json:"read_bytes,omitempty"`
+	LifeUsedPct            *float64       `json:"life_used_pct,omitempty"`
+	LifeRemainingPct       *float64       `json:"life_remaining_pct,omitempty"`
+	AvailableSparePct      *float64       `json:"available_spare_pct,omitempty"`
+	ReallocatedSectors     *int64         `json:"reallocated_sectors,omitempty"`
+	CurrentPendingSectors  *int64         `json:"current_pending_sectors,omitempty"`
+	OfflineUncorrectable   *int64         `json:"offline_uncorrectable,omitempty"`
+	Telemetry              map[string]any `json:"-"`
 }

 type HardwarePCIeDevice struct {
@@ -211,6 +218,7 @@ type HardwarePCIeDevice struct {
 	Firmware               *string        `json:"firmware,omitempty"`
 	MacAddresses           []string       `json:"mac_addresses,omitempty"`
 	Present                *bool          `json:"present,omitempty"`
+	IOMMUGroup             *int           `json:"iommu_group,omitempty"`
 	Telemetry              map[string]any `json:"-"`
 }

@@ -256,15 +264,13 @@ type HardwareSensors struct {
 }

 type HardwareFanSensor struct {
-	Name     string  `json:"name"`
-	Location *string `json:"location,omitempty"`
-	RPM      *int    `json:"rpm,omitempty"`
-	Status   *string `json:"status,omitempty"`
+	Name   string  `json:"name"`
+	RPM    *int    `json:"rpm,omitempty"`
+	Status *string `json:"status,omitempty"`
 }

 type HardwarePowerSensor struct {
 	Name     string   `json:"name"`
-	Location *string  `json:"location,omitempty"`
 	VoltageV *float64 `json:"voltage_v,omitempty"`
 	CurrentA *float64 `json:"current_a,omitempty"`
 	PowerW   *float64 `json:"power_w,omitempty"`
@@ -273,7 +279,6 @@ type HardwarePowerSensor struct {

 type HardwareTemperatureSensor struct {
 	Name                     string   `json:"name"`
-	Location                 *string  `json:"location,omitempty"`
 	Celsius                  *float64 `json:"celsius,omitempty"`
 	ThresholdWarningCelsius  *float64 `json:"threshold_warning_celsius,omitempty"`
 	ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
@@ -281,11 +286,10 @@ type HardwareTemperatureSensor struct {
 }

 type HardwareOtherSensor struct {
-	Name     string   `json:"name"`
-	Location *string  `json:"location,omitempty"`
-	Value    *float64 `json:"value,omitempty"`
-	Unit     *string  `json:"unit,omitempty"`
-	Status   *string  `json:"status,omitempty"`
+	Name   string   `json:"name"`
+	Value  *float64 `json:"value,omitempty"`
+	Unit   *string  `json:"unit,omitempty"`
+	Status *string  `json:"status,omitempty"`
 }

 type HardwareEventLog struct {
--- a/audit/internal/schema/hardware_test.go
+++ b/audit/internal/schema/hardware_test.go
@@ -44,3 +44,57 @@ func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
 		t.Fatalf("missing event_logs payload: %s", text)
 	}
 }
+
+// TestHardwareSnapshotMarshalsStorageTelemetryFields verifies that the block
+// size, metadata, and wear/usage counters of a storage device appear in the
+// marshalled ingest payload under their expected JSON keys.
+func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
+	int64Ptr := func(v int64) *int64 { return &v }
+	lifeRemaining := 91.0
+
+	disk := HardwareStorage{
+		SerialNumber:           stringPtr("DISK-001"),
+		Model:                  stringPtr("TestDisk"),
+		LogicalBlockSizeBytes:  int64Ptr(512),
+		PhysicalBlockSizeBytes: int64Ptr(4096),
+		MetadataBytesPerBlock:  int64Ptr(8),
+		PowerOnHours:           int64Ptr(12450),
+		WrittenBytes:           int64Ptr(9876543210),
+		ReadBytes:              int64Ptr(1234567890),
+		LifeRemainingPct:       &lifeRemaining,
+	}
+	request := HardwareIngestRequest{
+		CollectedAt: "2026-03-15T15:00:00Z",
+		Hardware: HardwareSnapshot{
+			Board:   HardwareBoard{SerialNumber: "SRV-001"},
+			Storage: []HardwareStorage{disk},
+		},
+	}
+
+	encoded, err := json.Marshal(request)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	body := string(encoded)
+	fragments := []string{
+		`"storage":[{`,
+		`"logical_block_size_bytes":512`,
+		`"physical_block_size_bytes":4096`,
+		`"metadata_bytes_per_block":8`,
+		`"power_on_hours":12450`,
+		`"written_bytes":9876543210`,
+		`"read_bytes":1234567890`,
+		`"life_remaining_pct":91`,
+	}
+	for i := range fragments {
+		if strings.Contains(body, fragments[i]) {
+			continue
+		}
+		t.Fatalf("missing %q in payload: %s", fragments[i], body)
+	}
+}
+
+// stringPtr returns a pointer to a fresh copy of v.
+func stringPtr(v string) *string {
+	p := new(string)
+	*p = v
+	return p
+}
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -125,9 +125,11 @@ func defaultTaskPriority(target string, params taskParams) int {
 		return taskPriorityInstall
 	case "install-to-ram":
 		return taskPriorityInstallToRAM
+	case "nvme-format":
+		return taskPriorityInstall
 	case "audit":
 		return taskPriorityAudit
-	case "nvidia-bench-perf", "nvidia-bench-power":
+	case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
 		return taskPriorityBenchmark
 	case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
 		return taskPriorityBurn
@@ -628,8 +630,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
 		}

 		if rampUp && len(body.GPUIndices) > 1 {
-			// Ramp-up mode: resolve GPU list, then create one task per prefix
-			// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
+			// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
+			// in Phase 2 (one additional GPU per step). A single task with all
+			// selected GPUs is sufficient — spawning N tasks with growing subsets
+			// would repeat all earlier steps redundantly.
 			gpus, err := apiListNvidiaGPUs(h.opts.App)
 			if err != nil {
 				writeError(w, http.StatusBadRequest, err.Error())
@@ -646,35 +650,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
 			} else {
 				now := time.Now()
 				rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
-				var allTasks []*Task
-				for step := 1; step <= len(resolved); step++ {
-					subset := resolved[:step]
-					stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
-					t := &Task{
-						ID:        newJobID("bee-bench-nvidia"),
-						Name:      stepName,
-						Target:    target,
-						Priority:  defaultTaskPriority(target, taskParams{}),
-						Status:    TaskPending,
-						CreatedAt: now,
-						params: taskParams{
-							GPUIndices:       append([]int(nil), subset...),
-							SizeMB:           body.SizeMB,
-							BenchmarkProfile: body.Profile,
-							RunNCCL:          runNCCL && step == len(resolved),
-							ParallelGPUs:     true,
-							RampStep:         step,
-							RampTotal:        len(resolved),
-							RampRunID:        rampRunID,
-							DisplayName:      stepName,
-						},
-					}
-					allTasks = append(allTasks, t)
+				taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
+				t := &Task{
+					ID:        newJobID("bee-bench-nvidia"),
+					Name:      taskName,
+					Target:    target,
+					Priority:  defaultTaskPriority(target, taskParams{}),
+					Status:    TaskPending,
+					CreatedAt: now,
+					params: taskParams{
+						GPUIndices:       append([]int(nil), resolved...),
+						SizeMB:           body.SizeMB,
+						BenchmarkProfile: body.Profile,
+						RunNCCL:          runNCCL,
+						ParallelGPUs:     true,
+						RampTotal:        len(resolved),
+						RampRunID:        rampRunID,
+						DisplayName:      taskName,
+					},
 				}
-				for _, t := range allTasks {
-					globalQueue.enqueue(t)
-				}
-				writeTaskRunResponse(w, allTasks)
+				globalQueue.enqueue(t)
+				writeTaskRunResponse(w, []*Task{t})
 				return
 			}
 		}
@@ -707,6 +703,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
 	}
 }

+// handleAPIBenchmarkAutotuneRun returns a handler that enqueues one
+// "nvidia-bench-autotune" task. The optional JSON body may carry profile,
+// benchmark_kind and size_mb; a blank profile defaults to "standard" and a
+// blank benchmark kind to "power-fit". An empty body is accepted, any other
+// decode failure yields 400, and a missing App yields 503.
+func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if h.opts.App == nil {
+			writeError(w, http.StatusServiceUnavailable, "app not configured")
+			return
+		}
+		var req struct {
+			Profile       string `json:"profile"`
+			BenchmarkKind string `json:"benchmark_kind"`
+			SizeMB        int    `json:"size_mb"`
+		}
+		if r.Body != nil {
+			decodeErr := json.NewDecoder(r.Body).Decode(&req)
+			// io.EOF just means the body was empty; keep the defaults.
+			if decodeErr != nil && !errors.Is(decodeErr, io.EOF) {
+				writeError(w, http.StatusBadRequest, "invalid request body")
+				return
+			}
+		}
+
+		profile := "standard"
+		if trimmed := strings.TrimSpace(req.Profile); trimmed != "" {
+			profile = trimmed
+		}
+		kind := "power-fit"
+		if trimmed := strings.TrimSpace(req.BenchmarkKind); trimmed != "" {
+			kind = trimmed
+		}
+
+		const target = "nvidia-bench-autotune"
+		createdAt := time.Now()
+		displayName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, kind)
+		task := &Task{
+			ID:        newJobID("bee-bench-autotune"),
+			Name:      displayName,
+			Target:    target,
+			Priority:  defaultTaskPriority(target, taskParams{}),
+			Status:    TaskPending,
+			CreatedAt: createdAt,
+			params: taskParams{
+				BenchmarkProfile: profile,
+				BenchmarkKind:    kind,
+				SizeMB:           req.SizeMB,
+				DisplayName:      displayName,
+			},
+		}
+		globalQueue.enqueue(task)
+		writeTaskRunResponse(w, []*Task{task})
+	}
+}
+
+// handleAPIBenchmarkAutotuneStatus reports whether a benchmark power autotune
+// configuration has been saved, together with the currently resolved system
+// power decision. The JSON response always carries "configured" and
+// "decision"; "config" is included only when a configuration was loaded.
+// A missing App yields 503 and any load error other than "not exist" yields 500.
+func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
+	// errors.Is also matches wrapped not-exist errors, which os.IsNotExist
+	// does not; an absent config simply means "not configured yet".
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	configured := err == nil
+	resp := map[string]any{
+		"configured": configured,
+		"decision":   platform.ResolveSystemPowerDecision(h.opts.ExportDir),
+	}
+	if configured {
+		resp["config"] = cfg
+	}
+	// No explicit WriteHeader(200): calling it before writeJSON would freeze
+	// the header map, so headers set afterwards would be dropped. The first
+	// body write sends 200 implicitly, matching the other handlers here.
+	writeJSON(w, resp)
+}
+
 func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
 	h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
 }
@@ -740,12 +808,14 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
 			now := time.Now()
 			t.DoneAt = &now
 		case TaskRunning:
-			if t.job != nil {
-				t.job.abort()
+			if t.job == nil || !t.job.abort() {
+				globalQueue.mu.Unlock()
+				writeJSON(w, map[string]string{"status": "not_running"})
+				return
 			}
-			t.Status = TaskCancelled
-			now := time.Now()
-			t.DoneAt = &now
+			globalQueue.mu.Unlock()
+			writeJSON(w, map[string]string{"status": "aborting"})
+			return
 		}
 		globalQueue.mu.Unlock()
 		writeJSON(w, map[string]string{"status": "aborted"})
@@ -970,6 +1040,81 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
 	writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
 }

+// handleAPIBlackboxStatus returns the black-box state read from
+// blackbox-state.json in the export directory. A missing state file is
+// reported as status "disabled"; Targets is always encoded as a non-nil slice.
+func (h *handler) handleAPIBlackboxStatus(w http.ResponseWriter, _ *http.Request) {
+	statePath := filepath.Join(h.opts.ExportDir, "blackbox-state.json")
+	state, err := app.ReadBlackboxState(statePath)
+	switch {
+	case err == nil:
+		if state.Targets == nil {
+			state.Targets = []app.BlackboxTargetStatus{}
+		}
+		writeJSON(w, state)
+	case errors.Is(err, os.ErrNotExist):
+		writeJSON(w, app.BlackboxState{Status: "disabled", Targets: []app.BlackboxTargetStatus{}})
+	default:
+		writeError(w, http.StatusInternalServerError, err.Error())
+	}
+}
+
+// handleAPIBlackboxEnable writes a black-box marker to the requested device.
+// The device must appear in the App's removable target list; the entry from
+// that list, not the client-supplied struct, is what gets enabled. Responds
+// 503 without an App, 400 for a bad body or unknown device, 500 on failures.
+func (h *handler) handleAPIBlackboxEnable(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	var requested platform.RemovableTarget
+	decodeErr := json.NewDecoder(r.Body).Decode(&requested)
+	if decodeErr != nil || strings.TrimSpace(requested.Device) == "" {
+		writeError(w, http.StatusBadRequest, "device is required")
+		return
+	}
+	candidates, err := h.opts.App.ListRemovableTargets()
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	matched := -1
+	for i := range candidates {
+		if candidates[i].Device == requested.Device {
+			matched = i
+			break
+		}
+	}
+	if matched < 0 {
+		writeError(w, http.StatusBadRequest, "device not in removable target list")
+		return
+	}
+	marker, err := app.EnableBlackboxTarget(candidates[matched])
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	writeJSON(w, map[string]any{
+		"status":        "ok",
+		"message":       "Black-box marker written.",
+		"enrollment_id": marker.EnrollmentID,
+	})
+}
+
+// handleAPIBlackboxDisable removes the black-box marker identified by the
+// device and enrollment_id in the JSON body. Responds 400 for an undecodable
+// body, 404 when the target does not exist, and 500 for any other failure.
+func (h *handler) handleAPIBlackboxDisable(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		Device       string `json:"device"`
+		EnrollmentID string `json:"enrollment_id"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
+		writeError(w, http.StatusBadRequest, "invalid request body")
+		return
+	}
+	err := app.DisableBlackboxTarget(body.Device, body.EnrollmentID)
+	switch {
+	case err == nil:
+		writeJSON(w, map[string]string{"status": "ok", "message": "Black-box marker removed."})
+	case errors.Is(err, os.ErrNotExist):
+		writeError(w, http.StatusNotFound, "black-box target not found")
+	default:
+		writeError(w, http.StatusInternalServerError, err.Error())
+	}
+}
+
 // ── GPU presence ──────────────────────────────────────────────────────────────

 func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
@@ -1152,7 +1297,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
 var standardTools = []string{
 	"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
 	"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
-	"mstflint", "qrencode",
+	"mstflint", "saa",
 }

 func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
@@ -1534,6 +1679,56 @@ func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Reque
 	fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
 }

+// ── Hardware summary / component detail ──────────────────────────────────────
+
+// handleAPIHardwareSummary writes the hardware summary card as a no-store HTML
+// fragment for htmx polling (hx-get="/api/hardware-summary" hx-swap="outerHTML").
+func (h *handler) handleAPIHardwareSummary(w http.ResponseWriter, _ *http.Request) {
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	w.Header().Set("Cache-Control", "no-store")
+	fmt.Fprint(w, renderHardwareSummaryCard(h.opts))
+}
+
+// handleAPIComponentDetail renders an HTML fragment with the status records of
+// one component type, selected by the "type" path value (cpu, memory, storage,
+// gpu or psu). Any other value is answered with 404.
+func (h *handler) handleAPIComponentDetail(w http.ResponseWriter, r *http.Request) {
+	// A selector holds the card title plus the exact keys and key prefixes that
+	// are handed to matchedRecords.
+	type selector struct {
+		title           string
+		exact, prefixes []string
+	}
+
+	var sel selector
+	switch r.PathValue("type") {
+	case "cpu":
+		sel = selector{title: "CPU", exact: []string{"cpu:all"}}
+	case "memory":
+		sel = selector{title: "Memory", exact: []string{"memory:all"}, prefixes: []string{"memory:"}}
+	case "storage":
+		sel = selector{title: "Storage", exact: []string{"storage:all"}, prefixes: []string{"storage:"}}
+	case "gpu":
+		sel = selector{title: "GPU", prefixes: []string{"pcie:gpu:"}}
+	case "psu":
+		sel = selector{title: "PSU", prefixes: []string{"psu:"}}
+	default:
+		http.NotFound(w, r)
+		return
+	}
+
+	// Without an App or a status DB the fragment is rendered with no records.
+	var records []app.ComponentStatusRecord
+	if a := h.opts.App; a != nil && a.StatusDB != nil {
+		records = matchedRecords(a.StatusDB.All(), sel.exact, sel.prefixes)
+	}
+
+	hdr := w.Header()
+	hdr.Set("Content-Type", "text/html; charset=utf-8")
+	hdr.Set("Cache-Control", "no-store")
+	fmt.Fprint(w, renderComponentDetail(sel.title, records))
+}
+
 func (h *handler) rollbackPendingNetworkChange() error {
 	h.pendingNetMu.Lock()
 	pnc := h.pendingNet
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -3,6 +3,8 @@ package webui
 import (
 	"encoding/json"
 	"net/http/httptest"
+	"os"
+	"path/filepath"
 	"strings"
 	"testing"

@@ -44,6 +46,66 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	}
 }

+// With no state file in the export dir the endpoint must report "disabled".
+func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
+	recorder := httptest.NewRecorder()
+	request := httptest.NewRequest("GET", "/api/blackbox/status", nil)
+	(&handler{opts: HandlerOptions{ExportDir: t.TempDir()}}).handleAPIBlackboxStatus(recorder, request)
+
+	if got := recorder.Code; got != 200 {
+		t.Fatalf("status=%d body=%s", got, recorder.Body.String())
+	}
+	var decoded app.BlackboxState
+	err := json.Unmarshal(recorder.Body.Bytes(), &decoded)
+	if err != nil {
+		t.Fatalf("decode state: %v", err)
+	}
+	if decoded.Status != "disabled" {
+		t.Fatalf("status=%q want disabled", decoded.Status)
+	}
+}
+
+func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
+	exportDir := t.TempDir()
+	statePath := filepath.Join(exportDir, "blackbox-state.json")
+	if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
+		t.Fatalf("write state: %v", err)
+	}
+	h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
+
+	h.handleAPIBlackboxStatus(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
+		t.Fatalf("body=%s", rec.Body.String())
+	}
+}
+
+func TestParseNVMeFormatModes(t *testing.T) {
+	raw := `
+lbaf  0 : ms:0   lbads:9  rp:0x2 (in use)
+lbaf  1 : ms:8   lbads:9  rp:0x1
+lbaf  2 : ms:0   lbads:12 rp:0
+`
+	modes := parseNVMeFormatModes(raw)
+	if len(modes) != 3 {
+		t.Fatalf("modes=%#v want 3 modes", modes)
+	}
+	if modes[0].Mode != 0 || modes[0].DataBytes != 512 || modes[0].MetadataBytes != 0 || !modes[0].InUse {
+		t.Fatalf("mode 0=%#v", modes[0])
+	}
+	if modes[1].Label != "MODE 1 (512+8)" {
+		t.Fatalf("mode 1 label=%q", modes[1].Label)
+	}
+	if modes[2].DataBytes != 4096 || modes[2].MetadataBytes != 0 {
+		t.Fatalf("mode 2=%#v", modes[2])
+	}
+}
+
 func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
@@ -178,16 +240,54 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
 	}
 	globalQueue.mu.Lock()
 	defer globalQueue.mu.Unlock()
-	if len(globalQueue.tasks) != 3 {
-		t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
+	// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
+	// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
+	if len(globalQueue.tasks) != 1 {
+		t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
 	}
-	for i, task := range globalQueue.tasks {
-		if task.Target != "nvidia-bench-power" {
-			t.Fatalf("task[%d] target=%q", i, task.Target)
-		}
-		if task.Priority != taskPriorityBenchmark {
-			t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
-		}
+	task := globalQueue.tasks[0]
+	if task.Target != "nvidia-bench-power" {
+		t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
+	}
+	if task.Priority != taskPriorityBenchmark {
+		t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
+	}
+	if task.params.RampTotal != 3 {
+		t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
+	}
+}
+
+func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 1 {
+		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
+	}
+	task := globalQueue.tasks[0]
+	if task.Target != "nvidia-bench-autotune" {
+		t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
+	}
+	if task.params.BenchmarkKind != "power-fit" {
+		t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
 	}
 }

--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
 	return out
 }

+// renderStackedMetricChartSVG renders a stacked area chart: each dataset is
+// drawn on top of the running sum of the datasets before it, so a band shows
+// one series' share and the top edge the total (intended for multi-PSU power
+// charts). names label datasets by index; the returned error is always nil.
+func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	if pointCount == 0 {
+		pointCount = 1
+		labels = []string{""}
+		times = []time.Time{{}}
+	}
+	if len(labels) < pointCount {
+		padded := make([]string, pointCount)
+		copy(padded, labels)
+		labels = padded
+	}
+	if len(times) < pointCount {
+		times = synthesizeChartTimes(times, pointCount)
+	}
+	for i := range datasets {
+		if len(datasets[i]) == 0 {
+			datasets[i] = make([]float64, pointCount)
+		}
+	}
+
+	times, datasets = downsampleTimeSeries(times, datasets, 1400)
+	pointCount = len(times)
+
+	// Build cumulative sums per time point (row 0 is the zero baseline). A point
+	// a dataset lacks counts as 0; points beyond pointCount are ignored.
+	cumulative := make([][]float64, len(datasets)+1)
+	for i := range cumulative {
+		cumulative[i] = make([]float64, pointCount)
+	}
+	for i, ds := range datasets {
+		copy(cumulative[i+1], cumulative[i])
+		for j := 0; j < len(ds) && j < pointCount; j++ {
+			cumulative[i+1][j] += ds[j]
+		}
+	}
+
+	// Scale is based on the total (top cumulative row).
+	total := cumulative[len(cumulative)-1]
+	yMin := floatPtr(0)
+	if yMax == nil {
+		yMax = autoMax120(total)
+	}
+	scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
+
+	// One legend entry per dataset: the draw loops index legendItems by dataset.
+	legendItems := make([]metricChartSeries, len(datasets))
+	for i := range datasets {
+		legendItems[i] = metricChartSeries{Color: metricChartPalette[i%len(metricChartPalette)], Values: datasets[i]}
+		if i < len(names) {
+			legendItems[i].Name = names[i]
+		}
+	}
+	statsLabel := chartStatsLabel([][]float64{total}) // stats describe the total
+	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
+	start, end := chartTimeBounds(times)
+
+	var b strings.Builder
+	writeSVGOpen(&b, layout.Width, layout.Height)
+	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
+	writeTimelineIdleSpans(&b, layout, start, end, timeline)
+	writeVerticalGrid(&b, layout, times, pointCount, 8)
+	writeHorizontalGrid(&b, layout, scale)
+	writeTimelineBoundaries(&b, layout, start, end, timeline)
+	writePlotBorder(&b, layout)
+	writeSingleAxisY(&b, layout, scale)
+	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
+	// Draw stacked areas from top to bottom so lower layers are visible.
+	for i := len(datasets) - 1; i >= 0; i-- {
+		writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
+	}
+	// Draw border polylines on top.
+	for i := len(datasets) - 1; i >= 0; i-- {
+		writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
+	}
+	writeLegend(&b, layout, legendItems)
+	writeSVGClose(&b)
+	return []byte(b.String()), nil
+}
+
+// writeStackedArea emits one <polygon> covering the band between two cumulative
+// series (baseline below, top above), filled with color at 55% opacity.
+func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
+	count := len(top)
+	if count == 0 {
+		return
+	}
+	if len(baseline) < count {
+		baseline = make([]float64, count)
+	}
+
+	var pts strings.Builder
+	// addPoint appends the vertex "x,y" for values[idx], space-separated.
+	addPoint := func(values []float64, idx int) {
+		px := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
+		py := chartYForValue(valueClamp(values[idx], scale), scale, layout.PlotTop, layout.PlotBottom)
+		if pts.Len() > 0 {
+			pts.WriteByte(' ')
+		}
+		pts.WriteString(strconv.FormatFloat(px, 'f', 1, 64))
+		pts.WriteByte(',')
+		pts.WriteString(strconv.FormatFloat(py, 'f', 1, 64))
+	}
+	// Forward along the top edge, then backward along the baseline, so the
+	// vertices trace the outline of the band.
+	for idx := 0; idx < count; idx++ {
+		addPoint(top, idx)
+	}
+	for idx := count - 1; idx >= 0; idx-- {
+		addPoint(baseline, idx)
+	}
+	fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", pts.String(), color)
+}
+
 func writeSVGOpen(b *strings.Builder, width, height int) {
 	fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
 }
--- a/audit/internal/webui/health_poller.go
+++ b/audit/internal/webui/health_poller.go
@@ -0,0 +1,76 @@
+package webui
+
+import (
+	"bytes"
+	"context"
+	"log/slog"
+	"os/exec"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/collector"
+)
+
+const healthPollInterval = 60 * time.Second
+const psuIPMITimeout = 15 * time.Second
+
+// healthPoller runs periodic health checks for hardware components that do not
+// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
+type healthPoller struct {
+	statusDB *app.ComponentStatusDB
+}
+
+func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
+	return &healthPoller{statusDB: statusDB}
+}
+
+func (p *healthPoller) start() {
+	goRecoverLoop("health poller", 5*time.Second, p.run)
+}
+
+func (p *healthPoller) run() {
+	ticker := time.NewTicker(healthPollInterval)
+	defer ticker.Stop()
+	for range ticker.C {
+		p.pollPSU()
+	}
+}
+
+// pollPSU records the status of every PSU slot parsed from "ipmitool sdr"
+// output in the status DB. It does nothing without a status DB, and gives up
+// quietly (debug log only) when the command fails.
+func (p *healthPoller) pollPSU() {
+	if p.statusDB == nil {
+		return
+	}
+
+	// The context kills ipmitool once psuIPMITimeout has elapsed.
+	ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
+	defer cancel()
+	var stdout bytes.Buffer
+	sdr := exec.CommandContext(ctx, "ipmitool", "sdr")
+	sdr.Stdout = &stdout
+	if err := sdr.Run(); err != nil {
+		// IPMI not available or not a server — skip silently.
+		slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
+		return
+	}
+
+	const source = "watchdog:psu"
+	for slot, psu := range collector.PSUSlotsFromSDR(stdout.String()) {
+		// An empty status is stored as "Unknown".
+		status := psu.Status
+		if status == "" {
+			status = "Unknown"
+		}
+		// Only the two non-OK states below carry an explanatory detail.
+		var detail string
+		switch status {
+		case "Critical":
+			detail = "PSU sensor reported non-OK state"
+		case "Warning":
+			detail = "PSU sensor in warning state"
+		}
+		p.statusDB.Record("psu:"+slot, source, status, detail)
+	}
+}
--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -1,6 +1,9 @@
 package webui

 import (
+	"bufio"
+	"fmt"
+	"io"
 	"os"
 	"strings"
 	"sync"
@@ -17,6 +20,25 @@ type jobState struct {
 	cancel       func() // optional cancel function; nil if job is not cancellable
 	logPath      string
 	serialPrefix string
+	logFile      *os.File // kept open for the task lifetime to avoid per-line open/close
+	logBuf       *bufio.Writer
+}
+
+// readTaskLogFile loads a whole task log and rejects logs larger than 50 MB.
+func readTaskLogFile(path string) ([]byte, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+	switch data, err := io.ReadAll(io.LimitReader(file, 50<<20+1)); {
+	case err != nil:
+		return nil, err
+	case len(data) > 50<<20:
+		return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
+	default:
+		return data, nil
+	}
 }

 // abort cancels the job if it has a cancel function and is not yet done.
@@ -31,13 +53,21 @@ func (j *jobState) abort() bool {
 }

 func (j *jobState) append(line string) {
+	j.appendWithOptions(line, true, true)
+}
+
+func (j *jobState) appendFromLog(line string) {
+	j.appendWithOptions(line, false, false)
+}
+
+func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
 	j.mu.Lock()
 	defer j.mu.Unlock()
 	j.lines = append(j.lines, line)
-	if j.logPath != "" {
-		appendJobLog(j.logPath, line)
+	if persistLog && j.logPath != "" {
+		j.writeLogLineLocked(line)
 	}
-	if j.serialPrefix != "" {
+	if serialMirror && j.serialPrefix != "" {
 		taskSerialWriteLine(j.serialPrefix + line)
 	}
 	for _, ch := range j.subs {
@@ -48,6 +78,36 @@ func (j *jobState) append(line string) {
 	}
 }

+// writeLogLineLocked appends line to the persistent log, opening the file on
+// first use and leaving it open for later lines. Must be called with j.mu held.
+// Open and write errors are silently ignored.
+func (j *jobState) writeLogLineLocked(line string) {
+	if j.logFile == nil {
+		file, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
+		if err != nil {
+			return
+		}
+		j.logFile, j.logBuf = file, bufio.NewWriterSize(file, 64*1024)
+	}
+	// Flushing after each line pushes every entry to the file right away.
+	_, _ = j.logBuf.WriteString(line + "\n")
+	_ = j.logBuf.Flush()
+}
+
+// closeLog flushes and closes the task log; call it once all output is written.
+func (j *jobState) closeLog() {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+	if buf := j.logBuf; buf != nil {
+		_ = buf.Flush()
+	}
+	if j.logFile == nil {
+		return
+	}
+	_ = j.logFile.Close()
+	j.logFile, j.logBuf = nil, nil
+}
+
 func (j *jobState) finish(errMsg string) {
 	j.mu.Lock()
 	defer j.mu.Unlock()
@@ -119,7 +179,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
 	if logPath == "" {
 		return j
 	}
-	data, err := os.ReadFile(logPath)
+	data, err := readTaskLogFile(logPath)
 	if err != nil || len(data) == 0 {
 		return j
 	}
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() {
 			w.mu.Lock()
 			if w.window != nil {
 				w.recordEvent(evt)
+			} else {
+				evtCopy := evt
+				goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) })
 			}
 			w.mu.Unlock()
 		}
@@ -162,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
 		for _, id := range evt.ids {
 			var key string
 			switch evt.category {
-			case "gpu", "pcie":
+			case "gpu":
+				key = "pcie:gpu:" + normalizeBDF(id)
+			case "pcie":
 				key = "pcie:" + normalizeBDF(id)
 			case "storage":
 				key = "storage:" + id
@@ -180,6 +185,54 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
 	}
 }

+// flushImmediate writes a single kmsg event directly to the status DB without a SAT window.
+// Called when an error is detected outside of any SAT task (always-on watching).
+func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
+	if w.statusDB == nil {
+		return
+	}
+	const source = "watchdog:kmsg"
+	detail := "kernel: " + truncate(evt.raw, 120)
+
+	// Only the first matching pattern decides; everything but "critical",
+	// including no match at all, is recorded as a warning.
+	severity := "Warning"
+	for _, pattern := range platform.HardwareErrorPatterns {
+		if !pattern.Re.MatchString(evt.raw) {
+			continue
+		}
+		if pattern.Severity == "critical" {
+			severity = "Critical"
+		}
+		break
+	}
+
+	// Events without device IDs are attributed to memory or, by default, CPU.
+	if len(evt.ids) == 0 {
+		if evt.category == "memory" {
+			w.statusDB.Record("memory:all", source, severity, detail)
+		} else {
+			w.statusDB.Record("cpu:all", source, severity, detail)
+		}
+		return
+	}
+	// GPU and storage IDs get dedicated key prefixes; any other category is
+	// keyed as a generic PCIe device. Storage IDs are used as-is.
+	prefix, normalize := "pcie:", true
+	switch evt.category {
+	case "gpu":
+		prefix = "pcie:gpu:"
+	case "storage":
+		prefix, normalize = "storage:", false
+	}
+	for _, id := range evt.ids {
+		if normalize {
+			id = normalizeBDF(id)
+		}
+		w.statusDB.Record(prefix+id, source, severity, detail)
+	}
+}
+
 // parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
 // any pattern in platform.HardwareErrorPatterns.
 // kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
--- a/audit/internal/webui/layout.go
+++ b/audit/internal/webui/layout.go
@@ -0,0 +1,145 @@
+package webui
+
+import (
+	"fmt"
+	"html"
+	"os"
+	"strings"
+)
+
+func layoutHead(title string) string {
+	return `<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width,initial-scale=1">
+<title>` + html.EscapeString(title) + `</title>
+<style>
+:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
+*{box-sizing:border-box;margin:0;padding:0}
+dialog{margin:auto}
+body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
+a{color:var(--accent);text-decoration:none}
+/* Sidebar */
+.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
+.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
+.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
+.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
+.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
+.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
+.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
+.nav{flex:1}
+.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
+.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
+.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
+/* Content */
+.main{flex:1;display:flex;flex-direction:column;overflow:auto}
+.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
+.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
+.content{padding:24px;flex:1}
+/* Cards */
+.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
+.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
+.card-head-actions{justify-content:space-between}
+.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
+.card-body{padding:16px}
+/* Buttons */
+.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
+.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
+.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
+.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
+.btn-sm{padding:5px 10px;font-size:12px}
+/* Tables */
+table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
+th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
+td{padding:9px 14px;border-top:1px solid var(--border-lite)}
+tr:first-child td{border-top:0}
+tbody tr:hover td{background:rgba(0,0,0,.03)}
+/* Status badges */
+.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
+.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
+.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
+.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
+.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
+/* Component chips — one small square per device */
+.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
+.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
+.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
+.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
+.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
+.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
+/* Tasks nav badge */
+.tasks-nav-btn{display:flex;justify-content:space-between;align-items:center;padding:10px 16px;color:rgba(255,255,255,.55);font-size:12px;text-decoration:none;border-top:1px solid rgba(255,255,255,.12);margin-top:auto;transition:color .15s}
+.tasks-nav-btn:hover{color:#fff}
+.tasks-nav-count{background:var(--accent);color:#fff;border-radius:10px;padding:1px 7px;font-size:11px;font-weight:700;display:none}
+.tasks-nav-count.active{display:inline}
+/* Output terminal */
+.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
+.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
+/* Forms */
+.form-row{margin-bottom:14px}
+.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
+.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
+.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
+/* Grid */
+.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
+.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
+@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
+/* iframe viewer */
+.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
+/* Alerts */
+.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
+.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
+.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
+</style>
+</head>
+<body>
+`
+}
+
+// layoutNav renders the sidebar HTML; active is the id of the link to highlight.
+func layoutNav(active string, buildLabel string) string {
+	links := []struct{ id, label, href string }{
+		{"dashboard", "Dashboard", "/"},
+		{"audit", "1. Audit", "/audit"},
+		{"check", "2. Check", "/check"},
+		{"load", "3. Load", "/load"},
+		{"speed", "4. Speed", "/speed"},
+		{"endurance", "5. Endurance", "/endurance"},
+		{"tools", "6. Tools", "/tools"},
+		{"settings", "7. Settings", "/settings"},
+	}
+	if strings.TrimSpace(buildLabel) == "" {
+		buildLabel = "dev"
+	}
+	var b strings.Builder
+	b.WriteString(`<aside class="sidebar">`)
+	b.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
+	b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
+	if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
+		switch strings.TrimSpace(string(raw)) {
+		case "gsp-off":
+			b.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
+		case "gsp-stuck":
+			b.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
+		}
+	}
+	b.WriteString(`<nav class="nav">`)
+	for _, link := range links {
+		class := "nav-item"
+		if link.id == active {
+			class += " active"
+		}
+		fmt.Fprintf(&b, `<a class="%s" href="%s">%s</a>`, class, link.href, link.label)
+	}
+	b.WriteString(`</nav>`)
+	b.WriteString(`<a href="/tasks" class="tasks-nav-btn" id="tasks-nav-btn">`)
+	b.WriteString(`<span>Tasks</span>`)
+	b.WriteString(`<span class="tasks-nav-count" id="tasks-nav-count"></span>`)
+	b.WriteString(`</a>`)
+	b.WriteString(`<script>`)
+	b.WriteString(`(function(){function u(){fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(d){var n=Array.isArray(d)?d.filter(function(t){return t.status==='pending'||t.status==='running';}).length:0;var c=document.getElementById('tasks-nav-count');var b=document.getElementById('tasks-nav-btn');if(c){c.textContent=n>0?String(n):'';c.className='tasks-nav-count'+(n>0?' active':'');}if(b){b.style.color=n>0?'#f6c90e':'';}}).catch(function(){});}u();setInterval(u,5000);})();`)
+	b.WriteString(`</script>`)
+	b.WriteString(`</aside>`)
+	return b.String()
+}
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
  cpu_load_pct REAL,
  mem_load_pct REAL,
  power_w      REAL,
+  power_source TEXT,
+  power_mode   TEXT,
+  power_reason TEXT,
  PRIMARY KEY (ts)
 );
 CREATE TABLE IF NOT EXISTS gpu_metrics (
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
 	if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
 		return err
 	}
-	return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
+	if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
+		return err
+	}
+	return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
 }

 func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	defer func() { _ = tx.Rollback() }()

 	_, err = tx.Exec(
-		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
-		ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
+		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
+		ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
 	)
 	if err != nil {
 		return err
@@ -161,14 +173,64 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	return tx.Commit()
 }

+// Downsample reduces density of old metrics rows to 1 sample per minute.
+// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
+// touched; newer rows keep their original resolution. Within each 60-second
+// bucket (ts / 60) the rows carrying the bucket's smallest ts are kept and
+// all others are deleted, per table. An empty or inverted window and a nil
+// receiver or DB are no-ops. The first failing DELETE aborts and is returned.
+//
+// Called hourly by the metrics collector background goroutine.
+func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	fromTS := deleteOlderThan.Unix()
+	untilTS := downsampleBefore.Unix()
+	if untilTS <= fromTS {
+		return nil
+	}
+	// Tables are processed one after another, each with its own statement, so
+	// an error leaves the earlier tables already thinned.
+	tables := []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"}
+	for _, table := range tables {
+		if _, err := m.db.Exec(`
+DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
+  AND ts NOT IN (
+    SELECT MIN(ts) FROM `+table+`
+    WHERE ts >= ? AND ts < ?
+    GROUP BY ts / 60
+  )`, fromTS, untilTS, fromTS, untilTS); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Prune deletes rows older than before from every metrics table (bounds DB size).
+func (m *MetricsDB) Prune(before time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	cutoff := before.Unix()
+	for _, name := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
+		if _, err := m.db.Exec("DELETE FROM "+name+" WHERE ts < ?", cutoff); err != nil {
+			return err
+		}
+	}
+	// Best effort: the rows are already deleted, so a checkpoint error is ignored.
+	_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
+	return nil
+}
+
 // LoadRecent returns up to n samples in chronological order (oldest first).
 func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
-	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
 }

 // LoadAll returns all persisted samples in chronological order (oldest first).
 func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
-	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
 }

 // LoadBetween returns samples in chronological order within the given time window.
@@ -183,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
 		start, end = end, start
 	}
 	return m.loadSamples(
-		`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
+		`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
 		start.Unix(), end.Unix(),
 	)
 }
@@ -199,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	type sysRow struct {
 		ts            int64
 		cpu, mem, pwr float64
+		powerSource   string
+		powerMode     string
+		powerReason   string
 	}
 	var sysRows []sysRow
 	for rows.Next() {
 		var r sysRow
-		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
+		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
 			continue
 		}
 		sysRows = append(sysRows, r)
@@ -313,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	samples := make([]platform.LiveMetricSample, len(sysRows))
 	for i, r := range sysRows {
 		s := platform.LiveMetricSample{
-			Timestamp:  time.Unix(r.ts, 0).UTC(),
-			CPULoadPct: r.cpu,
-			MemLoadPct: r.mem,
-			PowerW:     r.pwr,
+			Timestamp:   time.Unix(r.ts, 0).UTC(),
+			CPULoadPct:  r.cpu,
+			MemLoadPct:  r.mem,
+			PowerW:      r.pwr,
+			PowerSource: r.powerSource,
+			PowerMode:   r.powerMode,
+			PowerReason: r.powerReason,
 		}
 		for _, idx := range gpuIndices {
 			if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
--- a/audit/internal/webui/nvme_format.go
+++ b/audit/internal/webui/nvme_format.go
@@ -0,0 +1,368 @@
+package webui
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os/exec"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+)
+
+type nvmeFormatMode struct { // one LBA format (block layout) reported for a namespace
+	Mode          int    `json:"mode"`           // LBA format index parsed from the id-ns line
+	DataBytes     int64  `json:"data_bytes"`     // data bytes per block
+	MetadataBytes int64  `json:"metadata_bytes"` // metadata bytes per block
+	InUse         bool   `json:"in_use"`         // the id-ns line contains "in use"
+	Label         string `json:"label"`          // "MODE <index> (<data>+<metadata>)"
+}
+
+type nvmeFormatDisk struct { // one NVMe namespace together with the LBA formats it supports
+	Device        string           `json:"device"`           // namespace device path, e.g. /dev/nvme0n1
+	Model         string           `json:"model,omitempty"`  // trimmed ModelNumber from nvme list
+	Serial        string           `json:"serial,omitempty"` // trimmed SerialNumber from nvme list
+	Size          string           `json:"size,omitempty"`   // PhysicalSize rendered by formatNVMeBytes
+	CurrentMode   int              `json:"current_mode"`     // index of the in-use format, -1 when none was found
+	CurrentFormat string           `json:"current_format"`   // "<data>+<metadata>" of the in-use format, empty when unknown
+	Modes         []nvmeFormatMode `json:"modes"`            // formats returned by readNVMeFormatModes
+	Error         string           `json:"error,omitempty"`  // message of the error returned while reading the formats
+}
+
+type nvmeListJSON struct { // the part of the `nvme list -o json` document that is decoded
+	Devices []struct {
+		DevicePath   string `json:"DevicePath"`   // trimmed and checked against nvmeFormatDeviceRE
+		ModelNumber  string `json:"ModelNumber"`  // copied to nvmeFormatDisk.Model
+		SerialNumber string `json:"SerialNumber"` // copied to nvmeFormatDisk.Serial
+		PhysicalSize int64  `json:"PhysicalSize"` // rendered by formatNVMeBytes into nvmeFormatDisk.Size
+	} `json:"Devices"`
+}
+
+var ( // patterns, command hook and timeout shared by the NVMe format helpers
+	nvmeFormatDeviceRE     = regexp.MustCompile(`^/dev/nvme[0-9]+n[0-9]+$`) // accepts only namespace nodes such as /dev/nvme0n1
+	nvmeLBAFCompactLineRE  = regexp.MustCompile(`(?im)^\s*lbaf\s+(\d+)\s*:\s*ms:(\d+)\s+lbads:(\d+).*$`)
+	nvmeLBAFVerboseLineRE  = regexp.MustCompile(`(?im)^\s*LBA Format\s+(\d+)\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*$`)
+	nvmeCommandContext     = exec.CommandContext // NOTE(review): presumably a variable so tests can stub command execution — confirm
+	nvmeListFormatsTimeout = 20 * time.Second
+)
+
+// listNVMeFormatDisks enumerates NVMe namespaces with `nvme list -o json` and
+// attaches the LBA formats each one supports, sorted by device path. A namespace
+// whose formats cannot be read is still returned with Error set and CurrentMode -1.
+func listNVMeFormatDisks(ctx context.Context) ([]nvmeFormatDisk, error) {
+	ctx, cancel := context.WithTimeout(ctx, nvmeListFormatsTimeout)
+	defer cancel()
+	out, err := nvmeCommandContext(ctx, "nvme", "list", "-o", "json").Output()
+	if err != nil {
+		return nil, fmt.Errorf("nvme list: %w", err)
+	}
+	var root nvmeListJSON
+	if err := json.Unmarshal(out, &root); err != nil {
+		return nil, fmt.Errorf("parse nvme list output: %w", err)
+	}
+	disks := make([]nvmeFormatDisk, 0, len(root.Devices))
+	seen := map[string]struct{}{}
+	for _, dev := range root.Devices {
+		path := strings.TrimSpace(dev.DevicePath)
+		if _, dup := seen[path]; dup || !nvmeFormatDeviceRE.MatchString(path) {
+			continue // repeated entry, or not a /dev/nvmeXnY namespace node
+		}
+		seen[path] = struct{}{}
+		disk := nvmeFormatDisk{
+			Device:      path,
+			Model:       strings.TrimSpace(dev.ModelNumber),
+			Serial:      strings.TrimSpace(dev.SerialNumber),
+			Size:        formatNVMeBytes(dev.PhysicalSize),
+			CurrentMode: -1,
+		}
+		modes, parseErr := readNVMeFormatModes(ctx, path)
+		if parseErr != nil {
+			disk.Error = parseErr.Error()
+		}
+		disk.Modes = modes
+		for _, mode := range modes {
+			if mode.InUse {
+				disk.CurrentMode = mode.Mode
+				disk.CurrentFormat = formatNVMeBlock(mode.DataBytes, mode.MetadataBytes)
+				break
+			}
+		}
+		disks = append(disks, disk)
+	}
+	sort.Slice(disks, func(i, j int) bool { return disks[i].Device < disks[j].Device })
+	return disks, nil
+}
+
+// readNVMeFormatModes runs `nvme id-ns <device> -H` and parses its combined output. A failed
+// command is reported with its trimmed output, or with the exec error text when there is none.
+func readNVMeFormatModes(ctx context.Context, device string) ([]nvmeFormatMode, error) {
+	if !nvmeFormatDeviceRE.MatchString(device) {
+		return nil, fmt.Errorf("invalid NVMe device")
+	}
+	raw, runErr := nvmeCommandContext(ctx, "nvme", "id-ns", device, "-H").CombinedOutput()
+	switch text := strings.TrimSpace(string(raw)); {
+	case runErr != nil && text != "":
+		return nil, fmt.Errorf("%s", text)
+	case runErr != nil:
+		return nil, fmt.Errorf("%s", runErr.Error())
+	}
+	if modes := parseNVMeFormatModes(string(raw)); len(modes) > 0 {
+		return modes, nil
+	}
+	return nil, fmt.Errorf("no LBA format modes found")
+}
+
+// parseNVMeFormatModes extracts LBA formats from `nvme id-ns` text and returns
+// them sorted by mode index. Both the compact "lbaf N : ms:X lbads:Y" lines and
+// the verbose "LBA Format N : Metadata Size ... Data Size ..." lines are read;
+// when both describe the same index, the verbose line replaces the compact one.
+func parseNVMeFormatModes(raw string) []nvmeFormatMode {
+	found := map[int]nvmeFormatMode{}
+	record := func(line string, index int, dataBytes, metaBytes int64) {
+		found[index] = nvmeFormatMode{
+			Mode:          index,
+			DataBytes:     dataBytes,
+			MetadataBytes: metaBytes,
+			InUse:         strings.Contains(strings.ToLower(line), "in use"),
+			Label:         fmt.Sprintf("MODE %d (%s)", index, formatNVMeBlock(dataBytes, metaBytes)),
+		}
+	}
+	for _, sub := range nvmeLBAFCompactLineRE.FindAllStringSubmatch(raw, -1) {
+		index, errIndex := strconv.Atoi(sub[1])
+		metaBytes, errMeta := strconv.ParseInt(sub[2], 10, 64)
+		shift, errShift := strconv.Atoi(sub[3])
+		if errIndex != nil || errMeta != nil || errShift != nil || shift < 0 || shift >= 63 {
+			continue
+		}
+		// lbads is used as a power-of-two exponent for the data size in bytes.
+		dataBytes := int64(1) << shift
+		record(sub[0], index, dataBytes, metaBytes)
+	}
+	for _, sub := range nvmeLBAFVerboseLineRE.FindAllStringSubmatch(raw, -1) {
+		index, errIndex := strconv.Atoi(sub[1])
+		metaBytes, errMeta := strconv.ParseInt(sub[2], 10, 64)
+		dataBytes, errData := strconv.ParseInt(sub[3], 10, 64)
+		if errIndex != nil || errMeta != nil || errData != nil || dataBytes <= 0 {
+			continue
+		}
+		record(sub[0], index, dataBytes, metaBytes)
+	}
+	modes := make([]nvmeFormatMode, 0, len(found))
+	for _, mode := range found {
+		modes = append(modes, mode)
+	}
+	sort.Slice(modes, func(a, b int) bool { return modes[a].Mode < modes[b].Mode })
+	return modes
+}
+
+// runNVMeFormatTask re-reads the formats of device, checks that lbaf is one of them,
+// logs the chosen layout to j and hands an `nvme format ... --force` command to streamCmdJob.
+func runNVMeFormatTask(ctx context.Context, j *jobState, device string, lbaf int) error {
+	if !nvmeFormatDeviceRE.MatchString(device) {
+		return fmt.Errorf("invalid NVMe device")
+	}
+	modes, err := readNVMeFormatModes(ctx, device)
+	if err != nil {
+		return err
+	}
+	var target *nvmeFormatMode
+	for i := range modes {
+		if modes[i].Mode == lbaf {
+			target = &modes[i]
+			break
+		}
+	}
+	if target == nil {
+		return fmt.Errorf("MODE %d is not available on %s", lbaf, device)
+	}
+	ms := 0 // --ms=1 is passed only when the chosen format carries metadata bytes
+	if target.MetadataBytes > 0 {
+		ms = 1
+	}
+	layout := formatNVMeBlock(target.DataBytes, target.MetadataBytes)
+	j.append(fmt.Sprintf("Formatting %s to %s with --lbaf=%d --ms=%d --force", device, layout, target.Mode, ms))
+	return streamCmdJob(j, nvmeCommandContext(ctx, "nvme", "format", device, fmt.Sprintf("--lbaf=%d", target.Mode), fmt.Sprintf("--ms=%d", ms), "--force"))
+}
+
+// handleAPINVMeFormats responds with the NVMe namespaces and their LBA formats as JSON.
+func (h *handler) handleAPINVMeFormats(w http.ResponseWriter, r *http.Request) {
+	if disks, err := listNVMeFormatDisks(r.Context()); err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+	} else {
+		writeJSON(w, disks)
+	}
+}
+
+// handleAPINVMeFormatRun validates a {"device","lbaf"} JSON request against the
+// currently detected namespaces and formats, then enqueues an "nvme-format" task.
+// It answers 400 for a malformed body, a bad device path or an unavailable mode,
+// 500 when the disks cannot be listed, and otherwise returns the queued task ID.
+func (h *handler) handleAPINVMeFormatRun(w http.ResponseWriter, r *http.Request) {
+	var req struct {
+		Device string `json:"device"`
+		LBAF   int    `json:"lbaf"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		writeError(w, http.StatusBadRequest, "invalid request body")
+		return
+	}
+	if !nvmeFormatDeviceRE.MatchString(req.Device) {
+		writeError(w, http.StatusBadRequest, "invalid NVMe device")
+		return
+	}
+	disks, err := listNVMeFormatDisks(r.Context())
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	// The requested mode must be one that the requested device reports right now.
+	label, allowed := "", false
+	for _, disk := range disks {
+		for _, mode := range disk.Modes {
+			if disk.Device == req.Device && mode.Mode == req.LBAF {
+				label, allowed = mode.Label, true
+				break
+			}
+		}
+	}
+	if !allowed {
+		writeError(w, http.StatusBadRequest, "LBA format mode is not available for this device")
+		return
+	}
+	// Nothing is formatted here; the request is only placed on the task queue.
+	task := &Task{
+		ID:        newJobID("nvme-format"),
+		Name:      fmt.Sprintf("NVMe Format %s to %s", filepath.Base(req.Device), label),
+		Target:    "nvme-format",
+		Priority:  defaultTaskPriority("nvme-format", taskParams{}),
+		Status:    TaskPending,
+		CreatedAt: time.Now(),
+		params: taskParams{
+			Device: req.Device,
+			LBAF:   req.LBAF,
+		},
+	}
+	globalQueue.enqueue(task)
+	writeJSON(w, map[string]string{"task_id": task.ID, "job_id": task.ID})
+}
+
+func formatNVMeBlock(dataBytes, metadataBytes int64) string { // renders "<data>+<metadata>" in decimal, e.g. "4096+0"
+	return strconv.FormatInt(dataBytes, 10) + "+" + strconv.FormatInt(metadataBytes, 10)
+}
+
+// formatNVMeBytes renders n bytes using decimal (1000-based) units; "" when n <= 0.
+func formatNVMeBytes(n int64) string {
+	if n <= 0 {
+		return ""
+	}
+	units := []string{"B", "KB", "MB", "GB", "TB", "PB"}
+	v, unit := float64(n), 0
+	for v >= 999.95 && unit < len(units)-1 { // 999.95 and up would print as "1000.0", so promote
+		v /= 1000
+		unit++
+	}
+	if unit == 0 {
+		return fmt.Sprintf("%d B", n)
+	}
+	return fmt.Sprintf("%.1f %s", v, units[unit])
+}
+
+func renderNVMeFormatInline() string { // returns the status line, table placeholder and client script of the NVMe format widget
+	return `<div id="nvme-format-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVMe disks...</div>
+<div id="nvme-format-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<script>
+function nvmeFormatEsc(s) {
+  return String(s == null ? '' : s).replace(/[&<>"']/g, function(c) {
+    return {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];
+  });
+}
+function loadNVMeFormats() {
+  var status = document.getElementById('nvme-format-status');
+  var table = document.getElementById('nvme-format-table');
+  status.textContent = 'Loading NVMe disks...';
+  status.style.color = 'var(--muted)';
+  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
+  fetch('/api/tools/nvme-formats').then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(disks) {
+    window._nvmeFormatDisks = Array.isArray(disks) ? disks : [];
+    if (!window._nvmeFormatDisks.length) {
+      status.textContent = 'No NVMe disks found.';
+      table.innerHTML = '';
+      return;
+    }
+    status.textContent = window._nvmeFormatDisks.length + ' NVMe disk(s) found.';
+    var rows = window._nvmeFormatDisks.map(function(d, idx) {
+      var current = d.current_format ? (d.current_format + ' / MODE ' + d.current_mode) : 'unknown';
+      var detail = [d.model || '', d.serial || '', d.size || ''].filter(Boolean).join(' | ');
+      var options = (d.modes || []).map(function(m) {
+        return '<option value="' + m.mode + '"' + (m.in_use ? ' selected' : '') + '>' + nvmeFormatEsc(m.label) + '</option>';
+      }).join('');
+      var disabled = options ? '' : ' disabled';
+      var err = d.error ? '<div style="font-size:12px;color:var(--crit-fg,#9f3a38);margin-top:4px">' + nvmeFormatEsc(d.error) + '</div>' : '';
+      return '<tr>'
+        + '<td style="font-family:monospace;white-space:nowrap">' + nvmeFormatEsc(d.device) + (detail ? '<div style="font-family:inherit;font-size:12px;color:var(--muted)">' + nvmeFormatEsc(detail) + '</div>' : '') + '</td>'
+        + '<td style="white-space:nowrap">' + nvmeFormatEsc(current) + err + '</td>'
+        + '<td style="white-space:nowrap"><select id="nvme-format-select-' + idx + '"' + disabled + '>' + options + '</select></td>'
+        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-primary" onclick="nvmeFormatRun(' + idx + ', this)"' + disabled + '>Apply</button><div class="nvme-format-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div></td>'
+        + '</tr>';
+    }).join('');
+    table.innerHTML = '<table><tr><th>Disk</th><th>Current block / mode</th><th>New mode</th><th>Action</th></tr>' + rows + '</table>';
+  }).catch(function(e) {
+    status.textContent = 'Error loading NVMe disks: ' + e.message;
+    status.style.color = 'var(--crit-fg,#9f3a38)';
+    table.innerHTML = '';
+  });
+}
+function nvmeWaitTaskDone(taskID, rowMsg) {
+  var timer = setInterval(function() {
+    fetch('/api/tasks').then(function(r) { return r.json(); }).then(function(tasks) {
+      var task = (tasks || []).find(function(t) { return t.id === taskID; });
+      if (!task) return;
+      if (task.status === 'done' || task.status === 'failed' || task.status === 'cancelled') {
+        clearInterval(timer);
+        rowMsg.textContent = 'Task ' + taskID + ': ' + task.status + (task.error ? ' - ' + task.error : '');
+        rowMsg.style.color = task.status === 'done' ? 'var(--ok,green)' : 'var(--crit-fg,#9f3a38)';
+        loadNVMeFormats();
+      }
+    }).catch(function(){});
+  }, 1500);
+}
+function nvmeFormatRun(idx, btn) {
+  var disk = (window._nvmeFormatDisks || [])[idx];
+  var select = document.getElementById('nvme-format-select-' + idx);
+  var row = btn.closest('td');
+  var rowMsg = row.querySelector('.nvme-format-row-msg');
+  if (!disk || !select) return;
+  var lbaf = parseInt(select.value, 10);
+  var mode = (disk.modes || []).find(function(m) { return m.mode === lbaf; });
+  if (!mode) return;
+  if (!window.confirm('Format ' + disk.device + ' to ' + mode.label + '? This erases data on the namespace.')) return;
+  btn.disabled = true;
+  rowMsg.style.color = 'var(--muted)';
+  rowMsg.textContent = 'Queued...';
+  fetch('/api/tools/nvme-format/run', {
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body:JSON.stringify({device: disk.device, lbaf: lbaf})
+  }).then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(d) {
+    rowMsg.textContent = 'Task ' + d.task_id + ' queued.';
+    nvmeWaitTaskDone(d.task_id, rowMsg);
+  }).catch(function(e) {
+    rowMsg.style.color = 'var(--crit-fg,#9f3a38)';
+    rowMsg.textContent = 'Error: ' + e.message;
+  }).finally(function() {
+    btn.disabled = false;
+  });
+}
+loadNVMeFormats();
+</script>`
+}
+
+// renderNVMeFormatCard wraps the inline NVMe format widget in a card with a title, a Refresh button and a short description.
+func renderNVMeFormatCard() string {
+	const opening = `<div class="card"><div class="card-head">NVMe Block Format <button class="btn btn-sm btn-secondary" onclick="loadNVMeFormats()" style="margin-left:auto">&#8635; Refresh</button></div><div class="card-body"><p style="font-size:13px;color:var(--muted);margin-bottom:12px">Lists NVMe namespaces and changes their LBA format through a queued task.</p>`
+	return opening + renderNVMeFormatInline() + `</div></div>`
+}
--- a/audit/internal/webui/page_benchmark.go
+++ b/audit/internal/webui/page_benchmark.go
@@ -0,0 +1,630 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+)
+
+type benchmarkHistoryRun struct { // NOTE(review): presumably one saved run returned by loadBenchmarkHistory — confirm
+	generatedAt   time.Time       // presumably when the run's result was generated — confirm
+	displayTime   string          // presumably the run time pre-formatted for display — confirm
+	gpuScores     map[int]float64 // presumably score keyed by GPU index — confirm
+	gpuStatuses   map[int]string  // presumably status keyed by GPU index — confirm
+	overallStatus string          // presumably the status of the whole run — confirm
+}
+
+func renderBenchmark(opts HandlerOptions) string {
+	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+<div class="grid2">
+  <div class="card">
+    <div class="card-head">Benchmark Setup</div>
+    <div class="card-body">
+      <div class="form-row">
+        <label>Profile</label>
+        <select id="benchmark-profile">
+          <option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
+          <option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
+          <option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
+        </select>
+      </div>
+      <div class="form-row">
+        <label>GPU Selection</label>
+        <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+          <button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
+          <button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
+        </div>
+        <div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+          <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+        </div>
+      </div>
+      <label class="benchmark-cb-row">
+        <input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
+        <span>Sequential — one GPU at a time</span>
+      </label>
+      <label class="benchmark-cb-row" id="benchmark-parallel-label">
+        <input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
+        <span>Parallel — all selected GPUs simultaneously</span>
+      </label>
+      <label class="benchmark-cb-row" id="benchmark-ramp-label">
+        <input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
+        <span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
+      </label>
+      <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
+      <div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
+        <button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>&#9654; Run Performance Benchmark</button>
+        <button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>&#9654; Run Power / Thermal Fit</button>
+        <button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
+      </div>
+      <span id="benchmark-run-nccl" hidden>nccl-auto</span>
+      <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
+      <div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
+      <div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
+    </div>
+  </div>
+
+  <div class="card">
+    <div class="card-head">Method Split</div>
+    <div class="card-body">
+      <p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
+      <table>
+        <tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
+        <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
+        <tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
+      </table>
+      <p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
+    </div>
+  </div>
+</div>
+
+` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
+
+<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
+  <div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
+</div>
+
+<style>
+.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
+.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+</style>
+
+<script>
+let benchmarkES = null;
+function benchmarkTaskIDs(payload) {
+  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
+  if (payload && payload.task_id) return [payload.task_id];
+  return [];
+}
+function benchmarkSelectedGPUIndices() {
+  return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
+    .filter(function(el) { return el.checked && !el.disabled; })
+    .map(function(el) { return parseInt(el.value, 10); })
+    .filter(function(v) { return !Number.isNaN(v); })
+    .sort(function(a, b) { return a - b; });
+}
+function benchmarkMode() {
+  const el = document.querySelector('input[name="benchmark-mode"]:checked');
+  return el ? el.value : 'sequential';
+}
+function benchmarkUpdateSelectionNote() {
+  const selected = benchmarkSelectedGPUIndices();
+  const perfBtn = document.getElementById('benchmark-run-performance-btn');
+  const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
+  const note = document.getElementById('benchmark-selection-note');
+  if (!selected.length) {
+    perfBtn.disabled = true;
+    fitBtn.disabled = true;
+    note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
+    return;
+  }
+  perfBtn.disabled = false;
+  fitBtn.disabled = false;
+  const mode = benchmarkMode();
+  if (mode === 'ramp-up') {
+    note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
+  } else if (mode === 'parallel') {
+    note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
+  } else {
+    note.textContent = 'Sequential: each selected GPU benchmarked separately.';
+  }
+}
+function benchmarkRenderGPUList(gpus) {
+  const root = document.getElementById('benchmark-gpu-list');
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    benchmarkUpdateSelectionNote();
+    return;
+  }
+  root.innerHTML = gpus.map(function(gpu) {
+    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
+    return '<label class="benchmark-gpu-row">'
+      + '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
+      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+      + '</label>';
+  }).join('');
+  benchmarkApplyMultiGPUState(gpus.length);
+  benchmarkUpdateSelectionNote();
+}
+function benchmarkApplyMultiGPUState(gpuCount) {
+  var multiValues = ['parallel', 'ramp-up'];
+  var radios = document.querySelectorAll('input[name="benchmark-mode"]');
+  radios.forEach(function(el) {
+    var isMulti = multiValues.indexOf(el.value) >= 0;
+    if (gpuCount < 2 && isMulti) {
+      el.disabled = true;
+      if (el.checked) {
+        var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
+        if (seq) seq.checked = true;
+      }
+      var label = el.closest('label');
+      if (label) label.style.opacity = '0.4';
+    } else {
+      el.disabled = false;
+      if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
+      var label = el.closest('label');
+      if (label) label.style.opacity = '';
+    }
+  });
+  benchmarkUpdateSelectionNote();
+}
+function benchmarkLoadGPUs() {
+  const status = document.getElementById('benchmark-run-status');
+  status.textContent = '';
+  fetch('/api/gpu/nvidia').then(function(r) {
+    return r.json().then(function(body) {
+      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+      return body;
+    });
+  }).then(function(gpus) {
+    benchmarkRenderGPUList(gpus);
+  }).catch(function(err) {
+    document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
+    benchmarkUpdateSelectionNote();
+  });
+}
+function benchmarkSelectAll() {
+  document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
+  benchmarkUpdateSelectionNote();
+}
+function benchmarkSelectNone() {
+  document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
+  benchmarkUpdateSelectionNote();
+}
+function runNvidiaBenchmark(kind) {
+  const selected = benchmarkSelectedGPUIndices();
+  const status = document.getElementById('benchmark-run-status');
+  if (!selected.length) {
+    status.textContent = 'Select at least one GPU.';
+    return;
+  }
+  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
+  const mode = benchmarkMode();
+  const rampUp = mode === 'ramp-up' && selected.length > 1;
+  const parallelGPUs = mode === 'parallel' && kind === 'performance';
+  if (kind === 'power-fit' && mode === 'parallel') {
+    status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
+    return;
+  }
+  const body = {
+    profile: document.getElementById('benchmark-profile').value || 'standard',
+    gpu_indices: selected,
+    run_nccl: kind === 'performance' && selected.length > 1,
+    parallel_gpus: parallelGPUs,
+    ramp_up: rampUp,
+    display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
+  };
+  document.getElementById('benchmark-output').style.display = 'block';
+  document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
+  const term = document.getElementById('benchmark-terminal');
+  term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
+  status.textContent = 'Queueing...';
+  const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
+  fetch(endpoint, {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(body)
+  }).then(function(r) {
+    return r.json().then(function(payload) {
+      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
+      return payload;
+    });
+  }).then(function(d) {
+    const taskIds = benchmarkTaskIDs(d);
+    if (!taskIds.length) throw new Error('No benchmark task was queued.');
+    status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
+    const streamNext = function(idx, failures) {
+      if (idx >= taskIds.length) {
+        status.textContent = failures ? 'Completed with failures.' : 'Completed.';
+        return;
+      }
+      const taskId = taskIds[idx];
+      term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
+      benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
+      benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+      benchmarkES.addEventListener('done', function(e) {
+        benchmarkES.close();
+        benchmarkES = null;
+        if (e.data) failures += 1;
+        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+        term.scrollTop = term.scrollHeight;
+        const isLast = (idx + 1 >= taskIds.length);
+        streamNext(idx + 1, failures);
+        if (isLast) { benchmarkRefreshResults(); }
+      });
+      benchmarkES.onerror = function() {
+        if (benchmarkES) {
+          benchmarkES.close();
+          benchmarkES = null;
+        }
+        term.textContent += '\nERROR: stream disconnected.\n';
+        term.scrollTop = term.scrollHeight;
+        streamNext(idx + 1, failures + 1);
+      };
+    };
+    streamNext(0, 0);
+  }).catch(function(err) {
+    status.textContent = 'Error.';
+    term.textContent += 'ERROR: ' + err.message + '\n';
+  });
+}
+function benchmarkRenderAutotuneStatus(payload) {
+  const el = document.getElementById('benchmark-autotune-status');
+  if (!el) return;
+  if (!payload || !payload.configured || !payload.config) {
+    el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
+    return;
+  }
+  const cfg = payload.config || {};
+  const decision = payload.decision || {};
+  const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
+  const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
+  const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
+  const mode = decision.mode ? (' · mode ' + decision.mode) : '';
+  el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
+}
+function loadBenchmarkAutotuneStatus() {
+  fetch('/api/bee-bench/nvidia/autotune/status')
+    .then(function(r) {
+      return r.json().then(function(body) {
+        if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+        return body;
+      });
+    })
+    .then(function(body) { benchmarkRenderAutotuneStatus(body); })
+    .catch(function(err) {
+      const el = document.getElementById('benchmark-autotune-status');
+      if (el) el.textContent = 'Autotune status error: ' + err.message;
+    });
+}
+// Queues an NVIDIA benchmark autotune task for the selected GPUs and streams
+// the task log into the benchmark terminal.
+// Any stream already held in benchmarkES is closed first. The first task id
+// returned by the enqueue call is streamed; on the done event the autotune
+// status line is reloaded.
+function runBenchmarkAutotune() {
+  const selected = benchmarkSelectedGPUIndices();
+  const status = document.getElementById('benchmark-run-status');
+  const term = document.getElementById('benchmark-terminal');
+  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
+  document.getElementById('benchmark-output').style.display = 'block';
+  document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
+  term.textContent = 'Enqueuing benchmark autotune...\n';
+  status.textContent = 'Queueing autotune...';
+  fetch('/api/bee-bench/nvidia/autotune/run', {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify({
+      profile: document.getElementById('benchmark-profile').value || 'standard',
+      benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
+      gpu_indices: selected
+    })
+  }).then(function(r) {
+    return r.json().then(function(payload) {
+      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
+      return payload;
+    });
+  }).then(function(d) {
+    const taskIds = benchmarkTaskIDs(d);
+    if (!taskIds.length) throw new Error('No autotune task was queued.');
+    const taskId = taskIds[0];
+    status.textContent = 'Autotune queued: ' + taskId;
+    const es = new EventSource('/api/tasks/' + taskId + '/stream');
+    benchmarkES = es;
+    // Close this stream only; a newer run may already own benchmarkES.
+    const release = function() {
+      es.close();
+      if (benchmarkES === es) benchmarkES = null;
+    };
+    es.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+    es.addEventListener('done', function(e) {
+      release();
+      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+      status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
+      loadBenchmarkAutotuneStatus();
+    });
+    // Without an error handler a failed stream would leave the status stuck on
+    // "queued" while the browser keeps retrying. Same policy as the burn page:
+    // close the stream and report the disconnect.
+    es.onerror = function() {
+      release();
+      term.textContent += '\nERROR: stream disconnected.\n';
+      term.scrollTop = term.scrollHeight;
+      status.textContent = 'Autotune stream disconnected.';
+    };
+  }).catch(function(err) {
+    status.textContent = 'Autotune error.';
+    term.textContent += 'ERROR: ' + err.message + '\n';
+  });
+}
+// Runs when the script is evaluated: load the GPU list (benchmarkLoadGPUs is
+// defined outside this view; presumably it fills the GPU selector) and fetch
+// the saved autotune status line.
+benchmarkLoadGPUs();
+loadBenchmarkAutotuneStatus();
+// Reloads the server-rendered results HTML into #benchmark-results-section.
+// Failures are ignored so the previously rendered results stay in place.
+function benchmarkRefreshResults() {
+  fetch('/api/benchmark/results')
+    .then(function(r) {
+      // Do not inject an error page or error body into the results section.
+      if (!r.ok) throw new Error('HTTP ' + r.status);
+      return r.text();
+    })
+    .then(function(html) {
+      const el = document.getElementById('benchmark-results-section');
+      if (el) el.innerHTML = html;
+    })
+    .catch(function() {});
+}
+</script>`
+}
+
+// renderBenchmarkResultsCard returns the saved-results HTML: the performance
+// history card, a newline, then the power/thermal card.
+func renderBenchmarkResultsCard(exportDir string) string {
+	maxGPUIndex, history := loadBenchmarkHistory(exportDir)
+
+	var out strings.Builder
+	out.WriteString(renderBenchmarkResultsCardFromRuns(
+		"Perf Results",
+		"Composite score by saved benchmark run and GPU.",
+		"No saved performance benchmark runs yet.",
+		maxGPUIndex,
+		history,
+	))
+	out.WriteString("\n")
+	out.WriteString(renderPowerBenchmarkResultsCard(exportDir))
+	return out.String()
+}
+
+// renderBenchmarkResultsCardFromRuns renders a card with one table row per run
+// and one score column per GPU index in [0, maxGPUIndex].
+//
+// With no runs, the card contains only emptyMessage. A blank description is
+// omitted. An empty overall status is shown as "OK"; "FAILED" uses the critical
+// colour and any other non-OK value the warning colour. GPU cells without a
+// score render a muted dash.
+func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
+	const (
+		critCell = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
+		warnCell = ` style="color:var(--warn);font-weight:600"`
+	)
+	cardOpen := `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`
+	if len(runs) == 0 {
+		return cardOpen + `<p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
+	}
+
+	var sb strings.Builder
+	sb.WriteString(cardOpen)
+	if strings.TrimSpace(description) != "" {
+		sb.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
+	}
+	sb.WriteString(`<div style="overflow-x:auto"><table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
+	for gpu := 0; gpu <= maxGPUIndex; gpu++ {
+		sb.WriteString(`<th>GPU `)
+		sb.WriteString(strconv.Itoa(gpu))
+		sb.WriteString(`</th>`)
+	}
+	sb.WriteString(`</tr></thead><tbody>`)
+
+	for n, run := range runs {
+		label := run.overallStatus
+		if label == "" {
+			label = "OK"
+		}
+		var labelColor string
+		switch label {
+		case "OK":
+			labelColor = "var(--ok)"
+		case "FAILED":
+			labelColor = "var(--crit-fg,#9f3a38)"
+		default:
+			labelColor = "var(--warn)"
+		}
+
+		sb.WriteString(`<tr><td>#` + strconv.Itoa(n+1) + `</td>`)
+		sb.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
+		sb.WriteString(`<td style="color:` + labelColor + `;font-weight:600">` + html.EscapeString(label) + `</td>`)
+
+		for gpu := 0; gpu <= maxGPUIndex; gpu++ {
+			score, found := run.gpuScores[gpu]
+			if !found {
+				sb.WriteString(`<td style="color:var(--muted)">-</td>`)
+				continue
+			}
+			// Empty and "OK" statuses stay unstyled; everything except
+			// "FAILED" (including unknown values) is treated as a warning.
+			cellStyle := ""
+			if st := run.gpuStatuses[gpu]; st == "FAILED" {
+				cellStyle = critCell
+			} else if st != "" && st != "OK" {
+				cellStyle = warnCell
+			}
+			sb.WriteString(`<td` + cellStyle + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
+		}
+		sb.WriteString(`</tr>`)
+	}
+	sb.WriteString(`</tbody></table></div></div></div>`)
+	return sb.String()
+}
+
+// loadBenchmarkHistory locates perf-*/result.json files and loads them.
+//
+// The search root is <exportDir>/bee-bench/perf when exportDir is non-blank,
+// otherwise app.DefaultBeeBenchPerfDir. It returns (-1, nil) when the glob
+// fails or matches nothing.
+func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
+	perfDir := app.DefaultBeeBenchPerfDir
+	if strings.TrimSpace(exportDir) != "" {
+		perfDir = filepath.Join(exportDir, "bee-bench", "perf")
+	}
+
+	pattern := filepath.Join(perfDir, "perf-*", "result.json")
+	matches, globErr := filepath.Glob(pattern)
+	if globErr != nil || len(matches) == 0 {
+		return -1, nil
+	}
+
+	sort.Strings(matches)
+	return loadBenchmarkHistoryFromPaths(matches)
+}
+
+// loadBenchmarkHistoryFromPaths decodes each result file into a history entry.
+//
+// Unreadable or undecodable files are skipped. It returns the highest GPU
+// index seen across all loaded runs (-1 if none) and the runs sorted by
+// generation time, newest first.
+func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
+	highest := -1
+	history := make([]benchmarkHistoryRun, 0, len(paths))
+
+	for _, p := range paths {
+		data, readErr := os.ReadFile(p)
+		if readErr != nil {
+			continue
+		}
+		var parsed platform.NvidiaBenchmarkResult
+		if json.Unmarshal(data, &parsed) != nil {
+			continue
+		}
+
+		scores := make(map[int]float64)
+		statuses := make(map[int]string)
+		for _, g := range parsed.GPUs {
+			scores[g.Index] = g.Scores.CompositeScore
+			statuses[g.Index] = g.Status
+			if g.Index > highest {
+				highest = g.Index
+			}
+		}
+
+		history = append(history, benchmarkHistoryRun{
+			generatedAt:   parsed.GeneratedAt,
+			displayTime:   parsed.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			gpuScores:     scores,
+			gpuStatuses:   statuses,
+			overallStatus: parsed.OverallStatus,
+		})
+	}
+
+	sort.Slice(history, func(a, b int) bool {
+		return history[a].generatedAt.After(history[b].generatedAt)
+	})
+	return highest, history
+}
+
+// renderPowerBenchmarkResultsCard renders the "Power / Thermal Fit Results"
+// card from saved power-*/result.json files.
+//
+// Results are read from <exportDir>/bee-bench/power when exportDir is
+// non-blank, otherwise from app.DefaultBeeBenchPowerDir. Files that cannot be
+// read or decoded are skipped. The newest run (by GeneratedAt) gets a per-GPU
+// table; when more than one run was loaded, a collapsible list of all runs
+// follows. If no run could be loaded, a placeholder card is returned.
+func renderPowerBenchmarkResultsCard(exportDir string) string {
+	const emptyCard = `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
+
+	baseDir := app.DefaultBeeBenchPowerDir
+	if strings.TrimSpace(exportDir) != "" {
+		baseDir = filepath.Join(exportDir, "bee-bench", "power")
+	}
+	paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
+	if err != nil || len(paths) == 0 {
+		return emptyCard
+	}
+	sort.Strings(paths)
+
+	type powerRun struct {
+		generatedAt time.Time
+		displayTime string
+		result      platform.NvidiaPowerBenchResult
+	}
+	runs := make([]powerRun, 0, len(paths))
+	for _, path := range paths {
+		raw, err := os.ReadFile(path)
+		if err != nil {
+			continue
+		}
+		var r platform.NvidiaPowerBenchResult
+		if err := json.Unmarshal(raw, &r); err != nil {
+			continue
+		}
+		runs = append(runs, powerRun{
+			generatedAt: r.GeneratedAt,
+			displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			result:      r,
+		})
+	}
+	// Every matched file may have been unreadable or malformed; without this
+	// guard runs[0] below would panic with an index-out-of-range error.
+	if len(runs) == 0 {
+		return emptyCard
+	}
+	sort.Slice(runs, func(i, j int) bool {
+		return runs[i].generatedAt.After(runs[j].generatedAt)
+	})
+
+	var b strings.Builder
+	b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
+
+	latest := runs[0].result
+	b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
+	if latest.Hostname != "" {
+		b.WriteString(` — ` + html.EscapeString(latest.Hostname))
+	}
+	if latest.OverallStatus != "" {
+		statusColor := "var(--ok)"
+		if latest.OverallStatus != "OK" {
+			statusColor = "var(--warn)"
+		}
+		b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
+	}
+	b.WriteString(`</p>`)
+
+	if len(latest.GPUs) > 0 {
+		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
+		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
+		b.WriteString(`</tr></thead><tbody>`)
+		for _, gpu := range latest.GPUs {
+			// Prefer the stable limit; fall back to the applied limit when
+			// no stable value was recorded.
+			finalLimitW := gpu.StablePowerLimitW
+			if finalLimitW <= 0 {
+				finalLimitW = gpu.AppliedPowerLimitW
+			}
+			// A GPU is highlighted when flagged as derated or when its final
+			// limit is more than 1 W below the default limit.
+			derated := gpu.Derated ||
+				(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
+			rowStyle := ""
+			finalStyle := ""
+			if derated {
+				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
+				finalStyle = ` style="color:#e6a000;font-weight:600"`
+			}
+			statusLabel := gpu.Status
+			if statusLabel == "" {
+				statusLabel = "OK"
+			}
+			statusColor := "var(--ok)"
+			if statusLabel == "FAILED" {
+				statusColor = "var(--crit-fg,#9f3a38)"
+			} else if statusLabel != "OK" {
+				statusColor = "var(--warn)"
+			}
+			// Non-positive wattage values are rendered as "-".
+			nominalStr := "-"
+			if gpu.DefaultPowerLimitW > 0 {
+				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
+			}
+			singleStr := "-"
+			if gpu.AppliedPowerLimitW > 0 {
+				singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+			}
+			multiStr := "-"
+			if gpu.StablePowerLimitW > 0 {
+				multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
+			}
+			p95Str := "-"
+			if gpu.MaxObservedPowerW > 0 {
+				p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
+			}
+			b.WriteString(`<tr` + rowStyle + `>`)
+			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
+			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
+			b.WriteString(`<td>` + nominalStr + `</td>`)
+			b.WriteString(`<td>` + singleStr + `</td>`)
+			b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
+			b.WriteString(`<td>` + p95Str + `</td>`)
+			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
+			b.WriteString(`</tr>`)
+		}
+		b.WriteString(`</tbody></table></div>`)
+	}
+
+	if len(runs) > 1 {
+		b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
+		b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
+		for i, run := range runs {
+			statusColor := "var(--ok)"
+			if run.result.OverallStatus != "OK" {
+				statusColor = "var(--warn)"
+			}
+			b.WriteString(`<tr>`)
+			b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
+			b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
+			b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
+			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
+			b.WriteString(`</tr>`)
+		}
+		b.WriteString(`</tbody></table></div></details>`)
+	}
+
+	b.WriteString(`</div></div>`)
+	return b.String()
+}
+
+// renderSpeed builds the Speed page (step 4, performance benchmarks): an info
+// banner followed by the shared benchmark page from renderBenchmark.
+// Long-duration stability and overnight runs live on Endurance (step 5).
+func renderSpeed(opts HandlerOptions) string {
+	const banner = `<div class="alert alert-info" style="margin-bottom:16px"><strong>Speed:</strong> Measures GPU compute throughput and memory bandwidth. For overnight stability testing, go to <a href="/endurance">5. Endurance</a>.</div>`
+	return banner + renderBenchmark(opts)
+}
+
+// renderEndurance builds the Endurance page (step 5, long-duration reliability
+// tests): two banners pointing at the Stability and Overnight profiles,
+// followed by the shared benchmark page from renderBenchmark.
+// Short load tests live on Load (step 3); throughput measurement on Speed (step 4).
+func renderEndurance(opts HandlerOptions) string {
+	const banners = `<div class="alert alert-warn" style="margin-bottom:16px"><strong>Endurance:</strong> Long-duration reliability tests — Stability (several hours) and Overnight (8+ h) profiles. These profiles run hardware at sustained load; results show whether the server holds its performance envelope over time.</div>
+<div class="alert alert-info" style="margin-bottom:16px">Use the <strong>Stability</strong> or <strong>Overnight</strong> profile in the setup card below. The Standard profile is available too but is better suited for the <a href="/speed">4. Speed</a> page.</div>`
+	return banners + renderBenchmark(opts)
+}
--- a/audit/internal/webui/page_burn.go
+++ b/audit/internal/webui/page_burn.go
@@ -0,0 +1,388 @@
+package webui
+
+// renderLoad builds the Load page (step 3, sustained stress tests) by
+// delegating to renderBurn.
+// Non-destructive status checks live on Check (step 2); DCGM targeted
+// diagnostics (targeted_stress, targeted_power, pulse) are under
+// Check → Validate mode.
+func renderLoad() string {
+	return renderBurn()
+}
+
+func renderBurn() string {
+	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Load runs sustained GPU compute and CPU/memory stress recipes. DCGM diagnostics (<code>targeted_stress</code>, <code>targeted_power</code>, <code>pulse_test</code>) and NCCL/NVBandwidth are on the <a href="/check">2. Check</a> page. For overnight endurance runs, see <a href="/endurance">5. Endurance</a>.</div>
+<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Burn Profile</div>
+  <div class="card-body burn-profile-body">
+    <div class="burn-profile-col">
+      <div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
+    </div>
+    <div class="burn-profile-col burn-profile-action">
+      <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
+      <p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
+    </div>
+    <div class="burn-profile-col burn-profile-action">
+      <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
+      <p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
+    </div>
+  </div>
+  <div class="card-body" style="padding-top:0;display:flex;justify-content:center">
+    <span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
+  </div>
+</div>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">NVIDIA GPU Selection</div>
+  <div class="card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
+    <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
+      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
+    </div>
+	    <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+	      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+	    </div>
+	    <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
+	    <div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
+	      <label class="cb-row">
+	        <input type="radio" name="burn-nvidia-mode" value="sequential" checked>
+	        <span>Sequential — selected GPUs one at a time</span>
+	      </label>
+	      <label class="cb-row" id="burn-parallel-label">
+	        <input type="radio" name="burn-nvidia-mode" value="parallel">
+	        <span>Parallel — all selected GPUs simultaneously</span>
+	      </label>
+	      <label class="cb-row" id="burn-ramp-label">
+	        <input type="radio" name="burn-nvidia-mode" value="ramp-up">
+	        <span>Ramp-up — add one GPU at a time</span>
+	      </label>
+	    </div>
+	  </div>
+	</div>
+
+<div class="burn-section">Core Burn Paths</div>
+<div class="grid2 burn-grid" style="margin-bottom:16px">
+<div class="card burn-card">
+  <div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
+  <div class="card-body burn-card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
+    <label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
+  </div>
+</div>
+
+<div class="card burn-card">
+  <div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
+  <div class="card-body burn-card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
+    <label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
+  </div>
+</div>
+</div>
+
+<div id="bi-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Output <span id="bi-title"></span></div>
+  <div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
+</div>
+
+<style>
+.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
+.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
+.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
+.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
+.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
+.burn-profile-col { min-width:0; }
+.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
+.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
+.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
+.burn-grid { align-items:stretch; }
+.burn-card { height:100%; display:flex; flex-direction:column; }
+.burn-card-body { flex:1; display:flex; flex-direction:column; }
+.card-head-actions { justify-content:space-between; }
+.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
+@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
+</style>
+
+<script>
+let biES = null;
+// Normalizes an enqueue response to an array of task ids: a non-empty
+// task_ids array wins, then a single task_id, otherwise an empty array.
+function burnTaskIDs(payload) {
+  if (!payload) return [];
+  const many = payload.task_ids;
+  if (Array.isArray(many) && many.length) return many;
+  return payload.task_id ? [payload.task_id] : [];
+}
+// Returns the value of the checked burn-profile radio, or 'smoke' if none is checked.
+function burnProfile() {
+  const checked = document.querySelector('input[name="burn-profile"]:checked');
+  if (!checked) return 'smoke';
+  return checked.value;
+}
+// Collects the numeric values of all checked, enabled GPU checkboxes in ascending order.
+function burnSelectedGPUIndices() {
+  const indices = [];
+  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(box) {
+    if (!box.checked || box.disabled) return;
+    const idx = parseInt(box.value, 10);
+    if (!Number.isNaN(idx)) indices.push(idx);
+  });
+  indices.sort(function(x, y) { return x - y; });
+  return indices;
+}
+// Returns the value of the checked burn-nvidia-mode radio, or 'sequential' if none is checked.
+function burnNvidiaMode() {
+  const checked = document.querySelector('input[name="burn-nvidia-mode"]:checked');
+  if (!checked) return 'sequential';
+  return checked.value;
+}
+// Enables or disables the multi-GPU mode radios ('parallel', 'ramp-up').
+// With fewer than two GPUs they are disabled and dimmed, and a checked one
+// falls back to 'sequential'; otherwise every radio is enabled and undimmed.
+function burnApplyMultiGPUState(gpuCount) {
+  const multiModes = ['parallel', 'ramp-up'];
+  document.querySelectorAll('input[name="burn-nvidia-mode"]').forEach(function(radio) {
+    const lock = gpuCount < 2 && multiModes.indexOf(radio.value) >= 0;
+    radio.disabled = lock;
+    if (lock && radio.checked) {
+      const fallback = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
+      if (fallback) fallback.checked = true;
+    }
+    const wrapper = radio.closest('label');
+    if (wrapper) wrapper.style.opacity = lock ? '0.4' : '';
+  });
+}
+// Refreshes the hint below the GPU list to reflect the current selection.
+function burnUpdateSelectionNote() {
+  const hint = document.getElementById('burn-selection-note');
+  const indices = burnSelectedGPUIndices();
+  if (indices.length === 0) {
+    hint.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
+  } else {
+    hint.textContent = 'Selected NVIDIA GPUs: ' + indices.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
+  }
+}
+// Renders one pre-checked checkbox row per GPU into #burn-gpu-list, then
+// updates the multi-GPU mode radios and the selection hint.
+// Rows are built from DOM nodes and text nodes so the GPU name and index
+// reported by the server are never parsed as HTML.
+function burnRenderGPUList(gpus) {
+  const root = document.getElementById('burn-gpu-list');
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    burnUpdateSelectionNote();
+    return;
+  }
+  root.textContent = '';
+  gpus.forEach(function(gpu) {
+    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
+    const row = document.createElement('label');
+    row.className = 'burn-gpu-row';
+    const box = document.createElement('input');
+    box.className = 'burn-gpu-checkbox';
+    box.type = 'checkbox';
+    box.value = String(gpu.index);
+    box.checked = true;
+    box.addEventListener('change', function() { burnUpdateSelectionNote(); });
+    const text = document.createElement('span');
+    const title = document.createElement('strong');
+    title.textContent = 'GPU ' + gpu.index;
+    text.appendChild(title);
+    text.appendChild(document.createTextNode(' — ' + gpu.name + mem));
+    row.appendChild(box);
+    row.appendChild(text);
+    root.appendChild(row);
+  });
+  burnApplyMultiGPUState(gpus.length);
+  burnUpdateSelectionNote();
+}
+// Checks every GPU checkbox and refreshes the selection hint.
+function burnSelectAll() {
+  const boxes = document.querySelectorAll('.burn-gpu-checkbox');
+  boxes.forEach(function(box) {
+    box.checked = true;
+  });
+  burnUpdateSelectionNote();
+}
+// Unchecks every GPU checkbox and refreshes the selection hint.
+function burnSelectNone() {
+  const boxes = document.querySelectorAll('.burn-gpu-checkbox');
+  boxes.forEach(function(box) {
+    box.checked = false;
+  });
+  burnUpdateSelectionNote();
+}
+// Loads the NVIDIA GPU list from /api/gpu/nvidia and renders it.
+// On failure the error is shown inside #burn-gpu-list. The message may come
+// from the server response body, so it is inserted as text, not as HTML.
+function burnLoadGPUs() {
+  fetch('/api/gpu/nvidia').then(function(r) {
+    return r.json().then(function(body) {
+      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+      return body;
+    });
+  }).then(function(gpus) {
+    burnRenderGPUList(gpus);
+  }).catch(function(err) {
+    const root = document.getElementById('burn-gpu-list');
+    const msg = document.createElement('p');
+    msg.setAttribute('style', 'color:var(--crit-fg);font-size:13px');
+    msg.textContent = 'Error: ' + err.message;
+    root.textContent = '';
+    root.appendChild(msg);
+    burnUpdateSelectionNote();
+  });
+}
+// POSTs a burn task to /api/sat/<target>/run and resolves with the parsed
+// JSON response; rejects with the server error text or the HTTP status.
+// When useSelectedNvidia is set, the selected GPU indices are attached (the
+// call rejects if none are selected), and with more than one GPU the chosen
+// mode adds stagger_gpu_start (ramp-up) or parallel_gpus (parallel).
+function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
+  const request = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
+  if (useSelectedNvidia) {
+    const gpuIndices = burnSelectedGPUIndices();
+    if (gpuIndices.length === 0) {
+      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
+    }
+    request.gpu_indices = gpuIndices;
+    if (gpuIndices.length > 1) {
+      switch (burnNvidiaMode()) {
+        case 'ramp-up':
+          request.stagger_gpu_start = true;
+          break;
+        case 'parallel':
+          request.parallel_gpus = true;
+          break;
+      }
+    }
+  }
+  const parseOrThrow = function(resp) {
+    return resp.json().then(function(data) {
+      if (resp.ok) return data;
+      throw new Error(data.error || ('HTTP ' + resp.status));
+    });
+  };
+  return fetch('/api/sat/' + target + '/run', {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(request)
+  }).then(parseOrThrow);
+}
+// Streams a single task log into the burn terminal, replacing any stream
+// currently held in biES and resetting the terminal text.
+function streamTask(taskId, label) {
+  if (biES) { biES.close(); biES = null; }
+  document.getElementById('bi-output').style.display = 'block';
+  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
+  const term = document.getElementById('bi-terminal');
+  term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
+  const es = new EventSource('/api/tasks/' + taskId + '/stream');
+  biES = es;
+  // Close this stream only; another caller may have replaced biES meanwhile.
+  const release = function() {
+    es.close();
+    if (biES === es) biES = null;
+  };
+  es.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+  es.addEventListener('done', function(e) {
+    release();
+    term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+    term.scrollTop = term.scrollHeight;
+  });
+  // Same policy as streamBurnTaskSet: report a broken stream instead of
+  // leaving the terminal silent while the browser keeps retrying.
+  es.onerror = function() {
+    release();
+    term.textContent += '\nERROR: stream disconnected.\n';
+    term.scrollTop = term.scrollHeight;
+  };
+}
+// Single-task convenience wrapper around streamBurnTaskSet.
+function streamBurnTask(taskId, label, resetTerminal) {
+  const single = [taskId];
+  return streamBurnTaskSet(single, label, resetTerminal);
+}
+// Streams the given tasks one after another into the burn terminal.
+// Resolves with {ok, error}: ok is true only when no task reported an error
+// on its done event and no stream disconnected. An empty or non-array
+// taskIds resolves immediately with ok:false.
+function streamBurnTaskSet(taskIds, label, resetTerminal) {
+  if (biES) { biES.close(); biES = null; }
+  document.getElementById('bi-output').style.display = 'block';
+  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
+  const term = document.getElementById('bi-terminal');
+  if (resetTerminal) term.textContent = '';
+  if (!Array.isArray(taskIds) || !taskIds.length) {
+    term.textContent += 'ERROR: no tasks queued.\n';
+    return Promise.resolve({ok:false, error:'no tasks queued'});
+  }
+  // Appends text and keeps the terminal scrolled to the bottom.
+  const emit = function(text) {
+    term.textContent += text;
+    term.scrollTop = term.scrollHeight;
+  };
+  // Streams taskIds[pos] and recurses; failedSoFar carries the failure count.
+  const step = function(pos, failedSoFar) {
+    if (pos >= taskIds.length) {
+      const summary = failedSoFar ? (failedSoFar + ' task(s) failed') : '';
+      return Promise.resolve({ok: failedSoFar === 0, error: summary});
+    }
+    const currentId = taskIds[pos];
+    term.textContent += '[' + (pos + 1) + '/' + taskIds.length + '] Task ' + currentId + ' queued. Streaming...\n';
+    const finished = new Promise(function(resolve) {
+      biES = new EventSource('/api/tasks/' + currentId + '/stream');
+      biES.onmessage = function(e) { emit(e.data + '\n'); };
+      biES.addEventListener('done', function(e) {
+        biES.close();
+        biES = null;
+        emit((e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n');
+        resolve(failedSoFar + (e.data ? 1 : 0));
+      });
+      biES.onerror = function() {
+        if (biES) {
+          biES.close();
+          biES = null;
+        }
+        emit('\nERROR: stream disconnected.\n');
+        resolve(failedSoFar + 1);
+      };
+    });
+    return finished.then(function(failedNow) {
+      return step(pos + 1, failedNow);
+    });
+  };
+  return step(0, 0);
+}
+// Runs the checked, enabled tasks from the given list one after another:
+// each is enqueued, streamed to completion, then the next one starts.
+// tasks entries are {id, target, label, nvidia?, extra?}; id names the
+// checkbox that enables the task. statusElId optionally names an element
+// that receives progress text.
+// Returns undefined when nothing is selected, otherwise a promise that
+// rejects with the first enqueue error (later tasks are not started).
+function runBurnTaskSet(tasks, statusElId) {
+  const enabled = tasks.filter(function(t) {
+    const el = document.getElementById(t.id);
+    return el && el.checked && !el.disabled;
+  });
+  const status = statusElId ? document.getElementById(statusElId) : null;
+  if (status) status.textContent = '';
+  if (!enabled.length) {
+    if (status) status.textContent = 'No tasks selected.';
+    return;
+  }
+  const term = document.getElementById('bi-terminal');
+  document.getElementById('bi-output').style.display = 'block';
+  document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
+  term.textContent = '';
+  // Number of steps whose streamed tasks reported an error or disconnect.
+  let failedSteps = 0;
+  const runNext = function(idx) {
+    if (idx >= enabled.length) {
+      if (status) {
+        status.textContent = 'Completed ' + enabled.length + ' task(s).'
+          + (failedSteps ? ' ' + failedSteps + ' reported errors.' : '');
+      }
+      return Promise.resolve();
+    }
+    const t = enabled[idx];
+    term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
+    if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
+    return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
+      .then(function(d) {
+        return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
+      })
+      .then(function(result) {
+        if (!result || !result.ok) failedSteps++;
+      })
+      // The handler sits before the recursive step so an error is reported
+      // once, not once per enclosing step.
+      .catch(function(err) {
+        if (status) status.textContent = 'Error: ' + err.message;
+        document.getElementById('bi-output').style.display = 'block';
+        term.textContent += 'ERROR: ' + err.message + '\n';
+        return Promise.reject(err);
+      })
+      .then(function() {
+        return runNext(idx + 1);
+      });
+  };
+  return runNext(0);
+}
+// Queues the platform thermal-cycling task and streams its log.
+// 'cpu' is requested when any Compute Stress checkbox is active and 'gpu'
+// when any GPU Max Load checkbox is active; with neither, a hint is shown in
+// #burn-all-status. Enqueue errors are reported in the same status element.
+function runPlatformStress() {
+  const status = document.getElementById('burn-all-status');
+  const comps = [];
+  const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
+  const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
+  const hasChecked = function(ids) {
+    return ids.some(function(id) {
+      const el = document.getElementById(id);
+      return el && el.checked && !el.disabled;
+    });
+  };
+  if (hasChecked(computeIDs)) comps.push('cpu');
+  if (hasChecked(gpuIDs)) comps.push('gpu');
+  if (!comps.length) {
+    if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
+    return;
+  }
+  // Drop any stale hint or error from a previous attempt.
+  if (status) status.textContent = '';
+  enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', {platform_components: comps}, false)
+    .then(function(d) {
+      // Accept both response shapes (task_ids array or single task_id).
+      const taskIds = burnTaskIDs(d);
+      if (!taskIds.length) throw new Error('No platform stress task was queued.');
+      streamTask(taskIds[0], 'Platform Thermal Cycling');
+    })
+    .catch(function(err) {
+      if (status) status.textContent = 'Error: ' + err.message;
+    });
+}
+// Handler for "Burn one by one": offers every known burn task to
+// runBurnTaskSet, which runs only the checked and enabled ones and reports
+// progress in #burn-all-status.
+function runAllBurnTasks() {
+  const statusEl = document.getElementById('burn-all-status');
+  const gpuTasks = [
+    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
+    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
+    {id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
+    {id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}
+  ];
+  const computeTasks = [
+    {id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
+    {id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
+    {id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}
+  ];
+  statusEl.textContent = 'Enqueuing...';
+  runBurnTaskSet(gpuTasks.concat(computeTasks), 'burn-all-status');
+}
+// Page init: enable the checkbox of every GPU tool reported as available and
+// show a reason next to the unavailable ones. Fetch or parse failures are
+// ignored, leaving the checkboxes in their initial disabled state. Then load
+// the NVIDIA GPU list.
+fetch('/api/gpu/tools')
+  .then(function(resp) { return resp.json(); })
+  .then(function(tools) {
+    const specs = {
+      'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
+      'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
+      'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
+      'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
+    };
+    tools.forEach(function(tool) {
+      const spec = specs[tool.id];
+      if (!spec) return;
+      const box = document.getElementById(spec.cb);
+      if (!box) return;
+      if (tool.available) {
+        box.disabled = false;
+        return;
+      }
+      const hint = document.getElementById(spec.note);
+      if (hint) hint.textContent = '— ' + spec.reason;
+    });
+  })
+  .catch(function() {});
+burnLoadGPUs();
+</script>`
+}
--- a/audit/internal/webui/page_export_tools.go
+++ b/audit/internal/webui/page_export_tools.go
@@ -0,0 +1,513 @@
+package webui
+
+import (
+	"fmt"
+	"html"
+	"net/url"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// renderExport builds the Export page body: support bundle card, export file table, and USB black-box card.
+func renderExport(exportDir string) string {
+	entries, _ := listExportFiles(exportDir) // best-effort: a listing error renders as an empty table
+	var rows strings.Builder
+	for _, e := range entries {
+		fmt.Fprintf(&rows, `<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`, url.QueryEscape(e), html.EscapeString(e))
+	}
+	if len(entries) == 0 {
+		rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
+	}
+	return `<div class="grid2">
+<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
+<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
+` + renderSupportBundleInline() + `
+</div></div>
+<div class="card"><div class="card-head">Export Files</div><div class="card-body">
+<table><tr><th>File</th></tr>` + rows.String() + `</table>
+</div></div>
+</div>
+
+` + renderUSBExportCard()
+}
+
+// listExportFiles returns the sorted paths, relative to exportDir, of every non-directory entry below it.
+// Surrounding whitespace in exportDir is ignored; a missing directory yields an empty list and a nil error.
+func listExportFiles(exportDir string) ([]string, error) {
+	root := strings.TrimSpace(exportDir) // trim once so Walk and Rel agree on the base path
+	var entries []string
+	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
+		if err != nil || info.IsDir() {
+			return err // propagates a walk error; nil for directories, which are only descended into
+		}
+		rel, err := filepath.Rel(root, path)
+		if err != nil {
+			return err
+		}
+		entries = append(entries, rel)
+		return nil
+	})
+	if err != nil && !os.IsNotExist(err) {
+		return nil, err
+	}
+	sort.Strings(entries)
+	return entries, nil
+}
+
+// renderSupportBundleInline returns the support bundle download button, its status line, and the download script.
+func renderSupportBundleInline() string {
+	return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">&#8595; Download Support Bundle</button>
+<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
+<script>
+window.supportBundleDownload = function() {
+  var btn = document.getElementById('support-bundle-btn');
+  var status = document.getElementById('support-bundle-status');
+  btn.disabled = true;
+  btn.textContent = 'Building...';
+  status.textContent = 'Collecting logs and export data\u2026';
+  status.style.color = 'var(--muted)';
+  var filename = 'bee-support.tar.gz';
+  fetch('/export/support.tar.gz')
+    .then(function(r) {
+      if (!r.ok) throw new Error('HTTP ' + r.status);
+      var m = (r.headers.get('Content-Disposition') || '').match(/filename="?([^";]+)"?/);
+      if (m) filename = m[1];
+      return r.blob();
+    })
+    .then(function(blob) {
+      var url = URL.createObjectURL(blob);
+      var a = document.createElement('a');
+      a.href = url;
+      a.download = filename;
+      document.body.appendChild(a);
+      a.click();
+      document.body.removeChild(a);
+      URL.revokeObjectURL(url);
+      status.textContent = 'Download started.';
+      status.style.color = 'var(--ok-fg)';
+    })
+    .catch(function(e) {
+      status.textContent = 'Error: ' + e.message;
+      status.style.color = 'var(--crit-fg)';
+    })
+    .finally(function() {
+      btn.disabled = false;
+      btn.textContent = '\u2193 Download Support Bundle';
+    });
+};
+</script>`
+}
+
+// renderUSBExportCard wraps the USB black-box controls in a standalone card with a refresh button in its header.
+func renderUSBExportCard() string {
+	return `<div class="card" style="margin-top:16px">
+  <div class="card-head">USB Black-Box
+    <button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">&#8635; Refresh</button>
+  </div>
+  <div class="card-body">` + renderUSBExportInline() + "</div>\n</div>"
+}
+
+// renderUSBExportInline returns the USB black-box status lines, device table, and the script that drives them.
+func renderUSBExportInline() string {
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
+<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
+<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
+<div id="usb-targets" style="margin-top:12px"></div>
+<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
+<script>
+(function(){
+function esc(v) { return String(v == null ? '' : v).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); } // device-supplied text must not reach innerHTML raw
+function blackboxRefresh() {
+  document.getElementById('usb-status').textContent = 'Scanning...';
+  document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
+  document.getElementById('usb-targets').innerHTML = '';
+  document.getElementById('usb-msg').textContent = '';
+  Promise.all([
+    fetch('/api/export/usb').then(r=>r.json()),
+    fetch('/api/blackbox/status').then(r=>r.json())
+  ]).then(function(values) {
+    const targets = Array.isArray(values[0]) ? values[0] : [];
+    const state = values[1] || {};
+    const active = Array.isArray(state.targets) ? state.targets : [];
+    window._usbTargets = targets;
+    window._blackboxTargets = active;
+    const st = document.getElementById('usb-status');
+    const ct = document.getElementById('usb-targets');
+    const summary = document.getElementById('blackbox-summary');
+    if (state.boot_folder) {
+      summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
+    } else {
+      summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
+    }
+    if (!targets || targets.length === 0) {
+      st.textContent = 'No removable USB devices found.';
+    } else {
+      st.textContent = targets.length + ' device(s) found:';
+    }
+    const byDevice = {};
+    active.forEach(function(item) { byDevice[item.device] = item; });
+    ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
+      targets.map((t, idx) => {
+        const dev = t.device || '';
+        const label = t.label || '';
+        const model = t.model || '';
+        const state = byDevice[dev];
+        const status = state ? (state.status + (state.flush_period ? ', flush ' + state.flush_period : '')) : 'not enrolled';
+        const detail = state && state.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+esc(state.last_error)+'</div>') : '';
+        return '<tr>' +
+          '<td style="font-family:monospace">'+esc(dev)+'</td>' +
+          '<td>'+esc(t.fs_type)+'</td>' +
+          '<td>'+esc(t.size)+'</td>' +
+          '<td>'+esc(label)+'</td>' +
+          '<td style="font-size:12px;color:var(--muted)">'+esc(model)+'</td>' +
+          '<td style="font-size:12px">'+esc(status)+detail+'</td>' +
+          '<td style="white-space:nowrap">' +
+            (state
+              ? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
+              : '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
+            '<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
+          '</td></tr>';
+      }).join('') + '</table>';
+  }).catch(e => {
+    document.getElementById('usb-status').textContent = 'Error: ' + e;
+  });
+}
+window.blackboxEnable = function(targetIndex, btn) {
+  const target = (window._usbTargets || [])[targetIndex];
+  const msg = document.getElementById('usb-msg');
+  if (!target) {
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: USB target not found. Refresh and try again.';
+    return;
+  }
+  const row = btn ? btn.closest('td') : null;
+  const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
+  const originalText = btn ? btn.textContent : '';
+  if (btn) {
+    btn.disabled = true;
+    btn.textContent = 'Enabling...';
+  }
+  if (rowMsg) {
+    rowMsg.style.color = 'var(--muted)';
+    rowMsg.textContent = 'Working...';
+  }
+  msg.style.color = 'var(--muted)';
+  msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
+  fetch('/api/blackbox/enable', {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(target)
+  }).then(async r => {
+    const d = await r.json();
+    if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
+    return d;
+  }).then(d => {
+    msg.style.color = 'var(--ok,green)';
+    msg.textContent = d.message || 'Done.';
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--ok,green)';
+      rowMsg.textContent = d.message || 'Done.';
+    }
+  }).catch(e => {
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: '+e;
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--err,red)';
+      rowMsg.textContent = 'Error: ' + e;
+    }
+  }).finally(() => {
+    if (btn) {
+      btn.disabled = false;
+      btn.textContent = originalText;
+    }
+    setTimeout(blackboxRefresh, 300);
+  });
+};
+window.blackboxDisable = function(targetIndex, btn) {
+  const target = (window._usbTargets || [])[targetIndex];
+  const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
+  const msg = document.getElementById('usb-msg');
+  if (!target || !active) {
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: black-box target not found. Refresh and try again.';
+    return;
+  }
+  const row = btn ? btn.closest('td') : null;
+  const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
+  const originalText = btn ? btn.textContent : '';
+  if (btn) {
+    btn.disabled = true;
+    btn.textContent = 'Disabling...';
+  }
+  if (rowMsg) {
+    rowMsg.style.color = 'var(--muted)';
+    rowMsg.textContent = 'Working...';
+  }
+  msg.style.color = 'var(--muted)';
+  msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
+  fetch('/api/blackbox/disable', {
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
+  }).then(async r => {
+    const d = await r.json();
+    if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
+    return d;
+  }).then(d => {
+    msg.style.color = 'var(--ok,green)';
+    msg.textContent = d.message || 'Done.';
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--ok,green)';
+      rowMsg.textContent = d.message || 'Done.';
+    }
+  }).catch(e => {
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: '+e;
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--err,red)';
+      rowMsg.textContent = 'Error: ' + e;
+    }
+  }).finally(() => {
+    if (btn) {
+      btn.disabled = false;
+      btn.textContent = originalText;
+    }
+    setTimeout(blackboxRefresh, 300);
+  });
+};
+window.blackboxRefresh = blackboxRefresh;
+blackboxRefresh();
+})();
+</script>`
+}
+
+// renderNvidiaSelfHealInline returns the NVIDIA self-heal panel: a driver restart button, a per-GPU status
+// table with reset actions, and an output pane showing the result of the most recent action.
+func renderNvidiaSelfHealInline() string {
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
+<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
+  <button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
+  <button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">&#8635; Refresh</button>
+</div>
+<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
+<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
+  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
+    <span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
+    <span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
+  </div>
+  <div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
+</div>
+<script>
+function nvidiaEsc(v) { return String(v == null ? '' : v).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); } // escape API text before it reaches innerHTML
+function nvidiaSelfHealShowResult(label, status, output) {
+  var out = document.getElementById('nvidia-self-heal-out');
+  var term = document.getElementById('nvidia-self-heal-terminal');
+  var statusEl = document.getElementById('nvidia-self-heal-out-status');
+  var labelEl = document.getElementById('nvidia-self-heal-out-label');
+  out.style.display = 'block';
+  labelEl.textContent = label;
+  term.textContent = output || '(no output)';
+  term.scrollTop = term.scrollHeight;
+  var running = status === 'running';
+  var ok = status === 'ok';
+  statusEl.textContent = running ? '… running' : (ok ? '✓ done' : '✗ failed');
+  statusEl.style.color = running ? 'var(--muted)' : (ok ? 'var(--ok-fg, #2c662d)' : 'var(--crit-fg, #9f3a38)');
+}
+function nvidiaRestartDrivers() {
+  var btn = document.getElementById('nvidia-restart-btn');
+  var original = btn.textContent;
+  btn.disabled = true;
+  btn.textContent = 'Restarting...';
+  nvidiaSelfHealShowResult('restart bee-nvidia', 'running', 'Running...');
+  fetch('/api/services/action', {
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body:JSON.stringify({name:'bee-nvidia', action:'restart'})
+  }).then(r=>r.json()).then(d => {
+    nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
+    setTimeout(function() {
+      if (typeof loadServices === 'function') loadServices();
+      loadNvidiaSelfHeal();
+    }, 800);
+  }).catch(e => {
+    nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
+  }).finally(() => {
+    btn.disabled = false;
+    btn.textContent = original;
+  });
+}
+function nvidiaResetGPU(index, btn) {
+  var original = btn.textContent;
+  btn.disabled = true;
+  btn.textContent = 'Resetting...';
+  nvidiaSelfHealShowResult('reset gpu ' + index, 'running', 'Running...');
+  fetch('/api/gpu/nvidia-reset', {
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body:JSON.stringify({index:index})
+  }).then(r=>r.json()).then(d => {
+    nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || d.error || '(no output)');
+    setTimeout(loadNvidiaSelfHeal, 1000);
+  }).catch(e => {
+    nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
+  }).finally(() => {
+    btn.disabled = false;
+    btn.textContent = original;
+  });
+}
+function loadNvidiaSelfHeal() {
+  var status = document.getElementById('nvidia-self-heal-status');
+  var table = document.getElementById('nvidia-self-heal-table');
+  status.textContent = 'Loading NVIDIA GPU status...';
+  status.style.color = 'var(--muted)';
+  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
+  fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
+    if (!Array.isArray(gpus) || gpus.length === 0) {
+      status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
+      table.innerHTML = '';
+      return;
+    }
+    status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
+    const rows = gpus.map(g => {
+      const serial = g.serial || '';
+      const bdf = g.bdf || '';
+      const id = serial || bdf || ('gpu-' + g.index);
+      const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
+      const details = [];
+      if (serial) details.push('serial ' + serial);
+      if (bdf) details.push('bdf ' + bdf);
+      if (g.parse_failure && g.raw_line) details.push(g.raw_line);
+      return '<tr>'
+        + '<td style="white-space:nowrap">' + nvidiaEsc(g.index) + '</td>'
+        + '<td>' + nvidiaEsc(g.name || 'unknown') + '</td>'
+        + '<td style="font-family:monospace">' + nvidiaEsc(id) + '</td>'
+        + '<td><span class="badge ' + badge + '">' + nvidiaEsc(g.status || 'UNKNOWN') + '</span>'
+        + (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + nvidiaEsc(details.join(' | ')) + '</div>' : '')
+        + '</td>'
+        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + Number(g.index) + ', this)">Reset GPU</button></td>'
+        + '</tr>';
+    }).join('');
+    table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
+  }).catch(e => {
+    status.textContent = 'Error loading NVIDIA GPU status: ' + e;
+    status.style.color = 'var(--crit-fg, #9f3a38)';
+    table.innerHTML = '';
+  });
+}
+loadNvidiaSelfHeal();
+</script>`
+}
+
+// renderTools builds the Tools page: system install, support bundle, tool check, NVIDIA, network, services, NVMe and SAA DMI cards.
+func renderTools() string {
+	return `<div class="card" style="margin-bottom:16px">
+  <div class="card-head">System Install</div>
+  <div class="card-body">
+    <div style="margin-bottom:20px">
+    <div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
+    <p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
+    <p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
+    <button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">&#9654; Copy to RAM</button>
+    </div>
+    <div style="border-top:1px solid var(--line);padding-top:20px">
+    <div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
+		renderInstallInline() + `
+    </div>
+  </div>
+</div>
+<script>
+fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
+  const boot = document.getElementById('boot-source-text');
+  const txt = document.getElementById('ram-status-text');
+  const btn = document.getElementById('ram-install-btn');
+  const source = d.device || d.source || 'unknown source';
+  let label = source;
+  if (d.kind === 'ram') label = 'RAM';
+  else if (d.kind === 'usb') label = 'USB (' + source + ')';
+  else if (d.kind === 'cdrom') label = 'CD-ROM (' + source + ')';
+  else if (d.kind === 'disk') label = 'disk (' + source + ')';
+  boot.textContent = 'Current boot source: ' + label + '.';
+  txt.textContent = d.blocked_reason || d.message || 'Checking...';
+  if (d.status === 'ok' || d.in_ram) {
+    txt.style.color = 'var(--ok, green)';
+  } else if (d.status === 'failed') {
+    txt.style.color = 'var(--err, #b91c1c)';
+  } else {
+    txt.style.color = 'var(--muted)';
+  }
+  btn.style.display = d.can_start_task ? '' : 'none';
+  if (d.can_start_task) btn.disabled = false;
+}).catch(e=>{ document.getElementById('ram-status-text').textContent = 'Failed to load RAM status: ' + e; });
+function installToRAM() {
+  const btn = document.getElementById('ram-install-btn');
+  btn.disabled = true;
+  fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
+    if (!d.task_id) throw new Error(d.error || 'missing task id');
+    window.location.href = '/tasks#' + d.task_id;
+  }).catch(e=>{
+    btn.disabled = false; // allow a retry instead of leaving the button dead
+    document.getElementById('ram-status-text').textContent = 'Install to RAM failed: ' + e.message;
+  });
+}
+</script>
+
+<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
+<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
+` + renderSupportBundleInline() + `
+<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
+  <div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
+  ` + renderUSBExportInline() + `
+</div>
+</div></div>
+
+<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">&#8635; Check</button></div>
+<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
+
+<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
+		renderNvidiaSelfHealInline() + `</div></div>
+
+<div class="card"><div class="card-head">Network</div><div class="card-body">` +
+		renderNetworkInline() + `</div></div>
+
+<div class="card"><div class="card-head">Services</div><div class="card-body">` +
+		renderServicesInline() + `</div></div>
+
+` + renderNVMeFormatCard() + `
+
+` + renderSAADMICard() + `
+
+<script>
+function checkTools() {
+  document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
+  fetch('/api/tools/check').then(r=>r.json()).then(tools => {
+    const rows = tools.map(t =>
+      '<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '&#10003; '+t.Path : '&#10007; missing')+'</span></td></tr>'
+    ).join('');
+    document.getElementById('tools-table').innerHTML =
+      '<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
+  }).catch(() => { document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Tool check failed.</p>'; });
+}
+checkTools();
+</script>`
+}
+
+// renderExportIndex returns a minimal standalone HTML page that links every file under exportDir.
+func renderExportIndex(exportDir string) (string, error) {
+	files, err := listExportFiles(exportDir)
+	if err != nil {
+		return "", err
+	}
+	var page strings.Builder
+	page.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body><h1>Bee Export Files</h1><ul>`)
+	if len(files) == 0 {
+		page.WriteString(`<li>No export files found.</li>`)
+	}
+	for _, name := range files {
+		fmt.Fprintf(&page, `<li><a href="/export/file?path=%s">%s</a></li>`, url.QueryEscape(name), html.EscapeString(name))
+	}
+	page.WriteString(`</ul></body></html>`)
+	return page.String(), nil
+}
--- a/audit/internal/webui/page_install_tasks.go
+++ b/audit/internal/webui/page_install_tasks.go
@@ -0,0 +1,314 @@
+package webui
+
+// renderInstallInline returns the install-to-disk UI: disk picker, typed-device confirmation, and live log view.
+func renderInstallInline() string {
+	return `
+    <div class="alert alert-warn" style="margin-bottom:16px">
+      <strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
+      disk and write the live system onto it. All existing data on the target disk will be lost.
+      This operation cannot be undone.
+    </div>
+    <div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
+    <div id="install-disk-section" style="display:none">
+      <div class="card" style="margin-bottom:0">
+        <table id="install-disk-table">
+          <thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
+          <tbody id="install-disk-tbody"></tbody>
+        </table>
+      </div>
+      <div style="margin-top:12px">
+        <button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
+      </div>
+    </div>
+    <div id="install-confirm-section" style="display:none;margin-top:20px">
+      <div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
+      <div class="form-row" style="max-width:360px">
+        <label>Type the device name to confirm (e.g. /dev/sda)</label>
+        <input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
+      </div>
+      <button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
+      <button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
+    </div>
+    <div id="install-progress-section" style="display:none;margin-top:20px">
+      <div class="card-head" style="margin-bottom:8px">Installation Progress</div>
+      <div id="install-terminal" class="terminal" style="max-height:500px"></div>
+      <div id="install-status" style="margin-top:12px;font-size:13px"></div>
+    </div>
+
+<style>
+#install-disk-tbody tr{cursor:pointer}
+#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
+#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
+</style>
+
+<script>
+var _installSelected = null;
+function installEsc(v) { return String(v == null ? '' : v).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); } // disk-supplied text must not reach innerHTML raw
+
+function installRefreshDisks() {
+  document.getElementById('install-loading').style.display = '';
+  document.getElementById('install-disk-section').style.display = 'none';
+  document.getElementById('install-confirm-section').style.display = 'none';
+  _installSelected = null;
+  fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
+    document.getElementById('install-loading').style.display = 'none';
+    var tbody = document.getElementById('install-disk-tbody');
+    tbody.innerHTML = '';
+    if (!disks || disks.length === 0) {
+      tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
+    } else {
+      disks.forEach(function(d) {
+        var warnings = (d.warnings || []);
+        var statusHtml = '<span class="badge badge-ok">OK</span>';
+        if (warnings.length > 0) {
+          var hasSmall = warnings.some(function(w){ return w.indexOf('too small') >= 0; });
+          var cls = hasSmall ? 'badge-err' : 'badge-warn';
+          statusHtml = warnings.map(function(w){
+            return '<span class="badge ' + cls + '" title="' + installEsc(w) + '">' +
+              installEsc(w.length > 40 ? w.substring(0,38)+'…' : w) + '</span>';
+          }).join(' ');
+        }
+        var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
+          ? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
+        var tr = document.createElement('tr');
+        tr.dataset.device = d.device;
+        tr.dataset.model = d.model || 'Unknown';
+        tr.dataset.size = d.size;
+        tr.dataset.warnings = JSON.stringify(warnings);
+        tr.innerHTML =
+          '<td><input type="radio" name="install-disk" value="' + installEsc(d.device) + '"></td>' +
+          '<td><code>' + installEsc(d.device) + '</code>' + mountedNote + '</td>' +
+          '<td>' + installEsc(d.model || '—') + '</td>' +
+          '<td>' + installEsc(d.size) + '</td>' +
+          '<td>' + statusHtml + '</td>';
+        tr.addEventListener('click', function(){ installSelectDisk(this); });
+        tbody.appendChild(tr);
+      });
+    }
+    document.getElementById('install-disk-section').style.display = '';
+  }).catch(function(e){
+    document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
+  });
+}
+
+function installSelectDisk(tr) {
+  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
+  tr.classList.add('selected');
+  var radio = tr.querySelector('input[type=radio]');
+  if (radio) radio.checked = true;
+  _installSelected = {
+    device: tr.dataset.device,
+    model: tr.dataset.model,
+    size: tr.dataset.size,
+    warnings: JSON.parse(tr.dataset.warnings || '[]')
+  };
+  var warnBox = document.getElementById('install-confirm-warn');
+  var warnLines = '<strong>⚠ DANGER:</strong> ' + installEsc(_installSelected.device) +
+    ' (' + installEsc(_installSelected.model) + ', ' + installEsc(_installSelected.size) + ')' +
+    ' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
+  if (_installSelected.warnings.length > 0) {
+    warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + installEsc(w); }).join('<br>');
+  }
+  warnBox.innerHTML = warnLines;
+  document.getElementById('install-confirm-input').value = '';
+  document.getElementById('install-start-btn').disabled = true;
+  document.getElementById('install-confirm-section').style.display = '';
+  document.getElementById('install-progress-section').style.display = 'none';
+}
+
+function installDeselect() {
+  _installSelected = null;
+  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
+  document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
+  document.getElementById('install-confirm-section').style.display = 'none';
+}
+
+function installCheckConfirm() {
+  var val = document.getElementById('install-confirm-input').value.trim();
+  var ok = _installSelected && val === _installSelected.device;
+  document.getElementById('install-start-btn').disabled = !ok;
+}
+
+function installStart() {
+  if (!_installSelected) return;
+  document.getElementById('install-confirm-section').style.display = 'none';
+  document.getElementById('install-disk-section').style.display = 'none';
+  document.getElementById('install-loading').style.display = 'none';
+  var prog = document.getElementById('install-progress-section');
+  var term = document.getElementById('install-terminal');
+  var status = document.getElementById('install-status');
+  prog.style.display = '';
+  term.textContent = '';
+  status.textContent = 'Starting installation…';
+  status.style.color = 'var(--muted)';
+
+  fetch('/api/install/run', {
+    method: 'POST',
+    headers: {'Content-Type': 'application/json'},
+    body: JSON.stringify({device: _installSelected.device})
+  }).then(function(r){
+    return r.json().then(function(j){
+      if (!r.ok) throw new Error(j.error || r.statusText);
+      return j;
+    });
+  }).then(function(j){
+    if (!j.task_id) throw new Error('missing task id');
+    installStreamLog(j.task_id);
+  }).catch(function(e){
+    status.textContent = 'Error: ' + e;
+    status.style.color = 'var(--crit-fg)';
+  });
+}
+
+function installStreamLog(taskId) {
+  var term = document.getElementById('install-terminal');
+  var status = document.getElementById('install-status');
+  var es = new EventSource('/api/tasks/' + taskId + '/stream');
+  es.onmessage = function(e) {
+    term.textContent += e.data + '\n';
+    term.scrollTop = term.scrollHeight;
+  };
+  es.addEventListener('done', function(e) {
+    es.close();
+    if (!e.data) {
+      status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
+      var rebootBtn = document.createElement('button');
+      rebootBtn.className = 'btn btn-primary btn-sm';
+      rebootBtn.style.marginLeft = '12px';
+      rebootBtn.textContent = 'Reboot now';
+      rebootBtn.onclick = function(){
+        fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
+          body: JSON.stringify({name:'', action:'reboot'})});
+      };
+      status.appendChild(rebootBtn);
+    } else {
+      status.textContent = '✗ Installation failed: ' + e.data;
+      status.style.color = 'var(--crit-fg)';
+    }
+  });
+  es.onerror = function() {
+    es.close();
+    status.textContent = '✗ Stream disconnected.';
+    status.style.color = 'var(--crit-fg)';
+  };
+}
+
+installRefreshDisks();
+</script>
+`
+}
+
+// renderInstall wraps the shared install-to-disk UI in a standalone card for the dedicated install page.
+func renderInstall() string {
+	const head = `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">`
+	return head + renderInstallInline() + `</div></div>`
+}
+
+func renderTasks() string { // Tasks page: toolbar, paged task table (50 per page) and the script that polls /api/tasks every 2 s.
+	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
+<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
+<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
+<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
+<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
+</div>
+<div class="card">
+<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
+</div>
+<script>
+var _taskRefreshTimer = null;
+var _tasksAll = [];
+var _taskPage = 1;
+var _taskPageSize = 50;
+
+function loadTasks() {
+  fetch('/api/tasks').then(r=>r.json()).then(tasks => {
+    _tasksAll = Array.isArray(tasks) ? tasks : [];
+    if (_tasksAll.length === 0) {
+      _taskPage = 1;
+      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
+      return;
+    }
+    const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
+    if (_taskPage > totalPages) _taskPage = totalPages;
+    if (_taskPage < 1) _taskPage = 1;
+    const start = (_taskPage - 1) * _taskPageSize;
+    const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
+    const rows = pageTasks.map(t => {
+      const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
+      const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
+      const statusLabel = {running:'&#9654; running',pending:'pending',done:'&#10003; done',failed:'&#10007; failed',cancelled:'cancelled'}[t.status]||escHtml(String(t.status)); // unknown statuses are escaped before reaching innerHTML
+      let actions = '<a class="btn btn-sm btn-secondary" href="/tasks/'+encodeURIComponent(t.id)+'">Open</a>';
+      if (t.status === 'running' || t.status === 'pending') {
+        actions += ' <button class="btn btn-sm btn-danger" onclick="cancelTask(\''+t.id+'\')">Cancel</button>';
+      }
+      if (t.status === 'pending') {
+        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',1)" title="Increase priority">&#8679;</button>';
+        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',-1)" title="Decrease priority">&#8681;</button>';
+      }
+      return '<tr><td><a href="/tasks/'+encodeURIComponent(t.id)+'">'+escHtml(t.name)+'</a></td>' +
+        '<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
+        '<td style="font-size:12px;color:var(--muted)">'+fmtTime(t.created_at)+'</td>' +
+        '<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
+        '<td>'+t.priority+'</td>' +
+        '<td>'+actions+'</td></tr>';
+    }).join('');
+    const showingFrom = start + 1;
+    const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
+    const pager =
+      '<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
+        '<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
+        '<div style="display:flex;align-items:center;gap:8px">' +
+          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
+          '<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
+          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
+        '</div>' +
+      '</div>';
+    document.getElementById('tasks-table').innerHTML =
+      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
+  }).catch(function() {}); // polled every 2 s: keep the last table instead of raising an unhandled rejection per tick
+}
+
+function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
+function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
+function formatDurSec(sec) {
+  sec = Math.max(0, Math.round(sec||0));
+  if (sec < 60) return sec+'s';
+  const m = Math.floor(sec/60), ss = sec%60;
+  return m+'m '+ss+'s';
+}
+function setTaskPage(page) {
+  const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
+  _taskPage = Math.min(totalPages, Math.max(1, page));
+  loadTasks();
+}
+
+function cancelTask(id) {
+  fetch('/api/tasks/'+encodeURIComponent(id)+'/cancel',{method:'POST'}).then(()=>loadTasks());
+}
+function cancelAll() {
+  fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
+}
+function killWorkers() {
+  if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
+  fetch('/api/tasks/kill-workers',{method:'POST'})
+    .then(r=>r.json())
+    .then(d=>{
+      loadTasks();
+      var toast = document.getElementById('kill-toast');
+      var parts = [];
+      if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
+      if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
+      toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
+      toast.style.display = '';
+      setTimeout(()=>{ toast.style.display='none'; }, 5000);
+    });
+}
+function setPriority(id, delta) {
+  fetch('/api/tasks/'+encodeURIComponent(id)+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
+    .then(()=>loadTasks());
+}
+
+loadTasks();
+_taskRefreshTimer = setInterval(loadTasks, 2000);
+</script>`
+}
--- a/audit/internal/webui/page_metrics.go
+++ b/audit/internal/webui/page_metrics.go
@@ -0,0 +1,238 @@
+package webui
+
+func renderMetrics() string { // Metrics page: server and GPU chart cards plus the script that refreshes chart images (3 s) and the GPU layout (5 s).
+	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Server — Load</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
+  </div>
+</div>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Temperature — CPU</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
+  </div>
+</div>
+
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Temperature — Ambient Sensors</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
+  </div>
+</div>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Server — Power</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
+  </div>
+</div>
+
+<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
+  <div class="card-head">Server — Fan RPM</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
+  </div>
+</div>
+
+<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
+  <div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
+    <div>
+      <div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
+      <div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
+    </div>
+    <label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
+      <input id="gpu-chart-toggle" type="checkbox">
+      <span>One chart per GPU</span>
+    </label>
+  </div>
+
+  <div id="gpu-metrics-by-metric">
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Compute Load</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Memory Load</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Core Clock</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Power</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Temperature</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
+      </div>
+    </div>
+  </div>
+
+  <div id="gpu-metrics-by-gpu" style="display:none"></div>
+</section>
+
+<script>
+let gpuChartKey = '';
+const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
+let metricsNvidiaGPUsPromise = null;
+
+function loadMetricsNvidiaGPUs() {
+  if (!metricsNvidiaGPUsPromise) {
+    metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
+      .then(function(r) {
+        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
+        return r.json();
+      })
+      .then(function(list) { return Array.isArray(list) ? list : []; })
+      .catch(function() { metricsNvidiaGPUsPromise = null; return []; }); // drop the cached failure so the next layout poll retries
+  }
+  return metricsNvidiaGPUsPromise;
+}
+
+function metricsGPUNameMap(list) {
+  const out = {};
+  (list || []).forEach(function(gpu) {
+    const idx = Number(gpu.index);
+    if (!Number.isFinite(idx) || !gpu.name) return;
+    out[idx] = gpu.name;
+  });
+  return out;
+}
+
+function metricsGPUDisplayLabel(idx, names) {
+  const name = names && names[idx];
+  return name ? ('GPU ' + idx + ' — ' + name) : ('GPU ' + idx);
+}
+
+function loadGPUChartModePreference() {
+  try {
+    return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
+  } catch (_) {
+    return false;
+  }
+}
+
+function saveGPUChartModePreference(perGPU) {
+  try {
+    sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
+  } catch (_) {}
+}
+
+function refreshChartImage(el) {
+  if (!el || el.dataset.loading === '1') return;
+  if (el.offsetParent === null) return;
+  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
+  const nextSrc = baseSrc + '?t=' + Date.now();
+  const probe = new Image();
+  el.dataset.baseSrc = baseSrc;
+  el.dataset.loading = '1';
+  probe.onload = function() {
+    el.src = nextSrc;
+    el.dataset.loading = '0';
+  };
+  probe.onerror = function() {
+    el.dataset.loading = '0';
+  };
+  probe.src = nextSrc;
+}
+
+function refreshCharts() {
+  document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
+}
+
+function gpuIndices(rows) {
+  const seen = {};
+  const out = [];
+  (rows || []).forEach(function(row) {
+    const idx = Number(row.index);
+    if (!Number.isFinite(idx) || seen[idx]) return;
+    seen[idx] = true;
+    out.push(idx);
+  });
+  return out.sort(function(a, b) { return a - b; });
+}
+
+function renderGPUOverviewCards(indices, names) {
+  const host = document.getElementById('gpu-metrics-by-gpu');
+  if (!host) return;
+  host.innerHTML = indices.map(function(idx) {
+    const label = metricsGPUDisplayLabel(idx, names).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); // the name comes from the API and lands in markup and an attribute
+    return '<div class="card" style="margin-bottom:16px">' +
+      '<div class="card-head">' + label + ' — Overview</div>' +
+      '<div class="card-body" style="padding:8px">' +
+      '<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
+      '</div></div>';
+  }).join('');
+}
+
+function applyGPUChartMode() {
+  const perMetric = document.getElementById('gpu-metrics-by-metric');
+  const perGPU = document.getElementById('gpu-metrics-by-gpu');
+  const toggle = document.getElementById('gpu-chart-toggle');
+  const gpuModePerGPU = !!(toggle && toggle.checked);
+  if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
+  if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
+}
+
+function syncMetricsLayout(d) {
+  const fanCard = document.getElementById('card-server-fans');
+  if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
+  const section = document.getElementById('gpu-metrics-section');
+  const summary = document.getElementById('gpu-metrics-summary');
+  const indices = gpuIndices(d.gpus);
+  loadMetricsNvidiaGPUs().then(function(gpus) {
+    const names = metricsGPUNameMap(gpus);
+    if (section) section.style.display = indices.length > 0 ? '' : 'none';
+    if (summary) {
+      summary.textContent = indices.length > 0
+        ? ('Detected GPUs: ' + indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); }).join(', '))
+        : 'No GPUs detected in live metrics.';
+    }
+    const nextKey = indices.join(',') + '|' + indices.map(function(idx) { return names[idx] || ''; }).join(',');
+    if (nextKey !== gpuChartKey) {
+      renderGPUOverviewCards(indices, names);
+      gpuChartKey = nextKey;
+    }
+    applyGPUChartMode();
+  });
+}
+
+function loadMetricsLayout() {
+  fetch('/api/metrics/latest').then(function(r) { return r.json(); }).then(syncMetricsLayout).catch(function() {});
+}
+
+const gpuChartToggle = document.getElementById('gpu-chart-toggle');
+if (gpuChartToggle) {
+  gpuChartToggle.checked = loadGPUChartModePreference();
+}
+applyGPUChartMode();
+
+if (gpuChartToggle) {
+  gpuChartToggle.addEventListener('change', function() {
+    saveGPUChartModePreference(!!gpuChartToggle.checked);
+    applyGPUChartMode();
+    refreshCharts();
+  });
+}
+
+loadMetricsLayout();
+setInterval(refreshCharts, 3000);
+setInterval(loadMetricsLayout, 5000);
+</script>`
+}
--- a/audit/internal/webui/page_network_services.go
+++ b/audit/internal/webui/page_network_services.go
@@ -0,0 +1,213 @@
+package webui
+
+import "html"
+
+// renderNetworkInline returns the network UI (pending-change banner with countdown, interface table polled every 5 s, DHCP and static IPv4 forms) without a wrapping card, for embedding in Tools.
+func renderNetworkInline() string { // NOTE(review): interface names, states, addresses and the default route are interpolated into innerHTML unescaped — confirm the API only returns safe values.
+	return `<div id="net-pending" style="display:none" class="alert alert-warn">
+<strong>&#9888; Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
+<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
+<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
+</div>
+<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<div class="grid2" style="margin-top:16px">
+<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
+<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
+<button class="btn btn-primary" onclick="runDHCP()">&#9654; Run DHCP</button>
+<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
+</div>
+<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
+<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
+<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
+<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
+<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
+<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
+<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
+<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
+</div>
+</div>
+<script>
+var _netCountdownTimer = null;
+var _netRefreshTimer = null;
+const NET_ROLLBACK_SECS = 60;
+function loadNetwork() {
+  fetch('/api/network').then(r=>r.json()).then(d => {
+    const rows = (d.interfaces||[]).map(i =>
+      '<tr><td style="cursor:pointer" onclick="selectIface(\''+i.Name+'\')" title="Use this interface in the forms below"><span style="text-decoration:underline">'+i.Name+'</span></td>' +
+      '<td style="cursor:pointer" onclick="toggleIface(\''+i.Name+'\',\''+i.State+'\')" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td>' +
+      '<td>'+(i.IPv4||[]).join(', ')+'</td></tr>'
+    ).join('');
+    document.getElementById('iface-table').innerHTML =
+      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
+      (d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
+    if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+    else hideNetPending();
+  }).catch(function() {});
+}
+function selectIface(iface) {
+  document.getElementById('dhcp-iface').value = iface;
+  document.getElementById('st-iface').value = iface;
+}
+function toggleIface(iface, currentState) {
+  showNetPending(NET_ROLLBACK_SECS);
+  fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
+    .then(r=>r.json()).then(d => {
+      if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
+      loadNetwork();
+      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+    }).catch(function() {
+      setTimeout(loadNetwork, 1500);
+    });
+}
+function hideNetPending() {
+  const el = document.getElementById('net-pending');
+  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
+  _netCountdownTimer = null;
+  el.style.display = 'none';
+}
+function showNetPending(secs) {
+  if (!secs || secs < 1) { hideNetPending(); return; }
+  const el = document.getElementById('net-pending');
+  el.style.display = 'block';
+  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
+  let remaining = secs;
+  document.getElementById('net-countdown').textContent = remaining;
+  _netCountdownTimer = setInterval(function() {
+    remaining--;
+    document.getElementById('net-countdown').textContent = remaining;
+    if (remaining <= 0) { hideNetPending(); loadNetwork(); }
+  }, 1000);
+}
+function confirmNetChange() {
+  hideNetPending();
+  fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
+}
+function rollbackNetChange() {
+  hideNetPending();
+  fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
+}
+function runDHCP() {
+  const iface = document.getElementById('dhcp-iface').value.trim();
+  showNetPending(NET_ROLLBACK_SECS);
+  fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
+    .then(r=>r.json()).then(d => {
+      document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
+      if (d.error) { hideNetPending(); return; }
+      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+      loadNetwork();
+    }).catch(function() {
+      setTimeout(loadNetwork, 1500);
+    });
+}
+function setStatic() {
+  const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
+  showNetPending(NET_ROLLBACK_SECS);
+  fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
+    interface: document.getElementById('st-iface').value,
+    address: document.getElementById('st-addr').value,
+    prefix: document.getElementById('st-prefix').value,
+    gateway: document.getElementById('st-gw').value,
+    dns: dns,
+  })}).then(r=>r.json()).then(d => {
+    document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
+    if (d.error) { hideNetPending(); return; }
+    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+    loadNetwork();
+  }).catch(function() {
+    setTimeout(loadNetwork, 1500);
+  });
+}
+loadNetwork();
+if (_netRefreshTimer) clearInterval(_netRefreshTimer);
+_netRefreshTimer = setInterval(loadNetwork, 5000);
+</script>`
+}
+
+// renderNetwork wraps the markup from renderNetworkInline in a card titled "Network Interfaces".
+func renderNetwork() string {
+	const cardOpen = `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">`
+	return cardOpen + renderNetworkInline() + `</div></div>`
+}
+
+func renderServicesInline() string { // Services UI without a wrapping card: unit table loaded from /api/services, start/stop/restart buttons and an output terminal.
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
+<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
+<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<div id="svc-out" style="display:none;margin-top:12px">
+  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
+    <span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
+    <span id="svc-out-status" style="font-size:12px"></span>
+  </div>
+  <div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
+</div>
+<script>
+function loadServices() {
+  fetch('/api/services').then(r=>r.json()).then(svcs => {
+    const rows = svcs.map(s => {
+      const st = s.state||'unknown';
+      const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
+      const id = 'svc-body-'+s.name.replace(/[^a-z0-9]/g,'-');
+      const body = (s.body||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); // & must be escaped first, or entity-like text in unit output is rendered as markup
+      return '<tr>' +
+        '<td style="white-space:nowrap">'+s.name+'</td>' +
+        '<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+st+' ▾</span>' +
+        '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
+        '</td>' +
+        '<td style="white-space:nowrap">' +
+        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-start"   onclick="svcAction(this,\''+s.name+'\',\'start\')">Start</button> ' +
+        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-stop"    onclick="svcAction(this,\''+s.name+'\',\'stop\')">Stop</button> ' +
+        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-restart" onclick="svcAction(this,\''+s.name+'\',\'restart\')">Restart</button>' +
+        '</td></tr>';
+    }).join('');
+    document.getElementById('svc-table').innerHTML =
+      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
+  });
+}
+function toggleBody(id) {
+  const el = document.getElementById(id);
+  if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
+}
+function svcAction(btn, name, action) {
+  var label = btn.textContent;
+  btn.disabled = true;
+  btn.textContent = '...';
+  var out = document.getElementById('svc-out');
+  var term = document.getElementById('svc-terminal');
+  var statusEl = document.getElementById('svc-out-status');
+  var labelEl = document.getElementById('svc-out-label');
+  out.style.display = 'block';
+  labelEl.textContent = action + ' ' + name;
+  term.textContent = 'Running...';
+  statusEl.textContent = '';
+  statusEl.style.color = '';
+  fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
+    .then(r=>r.json()).then(d => {
+      term.textContent = d.output || d.error || '(no output)';
+      term.scrollTop = term.scrollHeight;
+      if (d.status === 'ok') {
+        statusEl.textContent = '✓ done';
+        statusEl.style.color = 'var(--ok-fg, #2c662d)';
+      } else {
+        statusEl.textContent = '✗ failed';
+        statusEl.style.color = 'var(--crit-fg, #9f3a38)';
+      }
+      btn.textContent = label;
+      btn.disabled = false;
+      setTimeout(loadServices, 800);
+    }).catch(e => {
+      term.textContent = 'Request failed: ' + e;
+      statusEl.textContent = '✗ error';
+      statusEl.style.color = 'var(--crit-fg, #9f3a38)';
+      btn.textContent = label;
+      btn.disabled = false;
+    });
+}
+loadServices();
+</script>`
+}
+
+// renderServices wraps the markup from renderServicesInline in a card titled "Bee Services".
+func renderServices() string {
+	const cardOpen = `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">`
+	return cardOpen + renderServicesInline() + `</div></div>`
+}
--- a/audit/internal/webui/page_settings.go
+++ b/audit/internal/webui/page_settings.go
@@ -0,0 +1,77 @@
+package webui
+
+import "html"
+
+func renderSettings(opts HandlerOptions) string { // Settings page: Blackbox logging and NVIDIA recovery controls plus a Build Info table (opts.BuildLabel, opts.Title).
+	version := opts.BuildLabel
+	if version == "" {
+		version = "dev" // shown when no build label was supplied
+	}
+	return `<div class="grid2">
+
+<div class="card">
+  <div class="card-head">Blackbox Logging</div>
+  <div class="card-body">
+    <p style="font-size:13px;color:var(--muted);margin-bottom:14px">Continuous hardware monitoring that writes a rolling log of sensor readings to the export directory. Useful for capturing thermal or power anomalies during long runs.</p>
+    <div style="display:flex;gap:8px;align-items:center">
+      <button class="btn btn-primary btn-sm" onclick="blackboxToggle('enable')">Enable</button>
+      <button class="btn btn-secondary btn-sm" onclick="blackboxToggle('disable')">Disable</button>
+      <span id="blackbox-status" style="font-size:12px;color:var(--muted)">Loading...</span>
+    </div>
+  </div>
+</div>
+
+<div class="card">
+  <div class="card-head">NVIDIA Recovery</div>
+  <div class="card-body">
+    <p style="font-size:13px;color:var(--muted);margin-bottom:14px">Reset NVIDIA GPU driver state. Use when <code>nvidia-smi</code> reports errors or GPUs appear stuck after a failed test.</p>
+    <div style="display:flex;gap:8px;align-items:center">
+      <button class="btn btn-danger btn-sm" onclick="nvidiaReset()">Reset NVIDIA Driver</button>
+      <span id="nvidia-reset-status" style="font-size:12px;color:var(--muted)"></span>
+    </div>
+  </div>
+</div>
+
+</div>
+
+<div class="card" style="margin-top:0">
+  <div class="card-head">Build Info</div>
+  <div class="card-body">
+    <table style="width:auto">
+      <tbody>
+        <tr><td style="color:var(--muted);padding-right:24px">Version</td><td>` + html.EscapeString(version) + `</td></tr>
+        <tr><td style="color:var(--muted);padding-right:24px">Title</td><td>` + html.EscapeString(opts.Title) + `</td></tr>
+      </tbody>
+    </table>
+  </div>
+</div>
+
+<script>
+(function() {
+  fetch('/api/blackbox/status', {cache:'no-store'}).then(r => r.json()).then(d => {
+    var el = document.getElementById('blackbox-status');
+    if (el) el.textContent = d.error ? ('Error: ' + d.error) : (d.enabled ? 'Enabled' : 'Disabled'); // an error payload must not read as "Disabled"
+  }).catch(() => {
+    var el = document.getElementById('blackbox-status');
+    if (el) el.textContent = 'Status unavailable';
+  });
+})();
+function blackboxToggle(action) {
+  var el = document.getElementById('blackbox-status');
+  if (el) el.textContent = 'Updating...';
+  fetch('/api/blackbox/' + action, {method:'POST', cache:'no-store'})
+    .then(r => r.json())
+    .then(d => { if (el) el.textContent = d.error ? ('Error: ' + d.error) : (d.enabled ? 'Enabled' : 'Disabled'); })
+    .catch(err => { if (el) el.textContent = 'Error: ' + err.message; });
+}
+function nvidiaReset() {
+  var el = document.getElementById('nvidia-reset-status');
+  if (!confirm('Reset NVIDIA driver? This will interrupt any running GPU tasks.')) return;
+  if (el) el.textContent = 'Resetting...';
+  fetch('/api/gpu/nvidia-reset', {method:'POST', cache:'no-store'})
+    .then(r => r.json())
+    .then(d => { if (el) el.textContent = d.error ? ('Error: ' + d.error) : 'Done — driver reset.'; })
+    .catch(err => { if (el) el.textContent = 'Error: ' + err.message; });
+}
+</script>`
+}
--- a/audit/internal/webui/page_validate.go
+++ b/audit/internal/webui/page_validate.go
@@ -0,0 +1,952 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"sort"
+	"strings"
+
+	"bee/audit/internal/platform"
+	"bee/audit/internal/schema"
+)
+
+// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz).
+const (
+	pciVendorNvidia = 0x10de // matched by validateIsVendorGPU for vendor "nvidia"
+	pciVendorAMD    = 0x1002 // matched by validateIsVendorGPU for vendor "amd"
+	pciVendorAspeed = 0x1a03 // always rejected by validateIsVendorGPU, whatever the device class
+)
+
+// validateInventory holds the per-category hardware summaries shown on the
+// validate cards. loadValidateInventory fills it from the audit snapshot; when
+// the snapshot cannot be loaded or parsed, every summary string is
+// "Audit snapshot not loaded." and both counts stay zero.
+//
+// The summary strings come from formatValidateDeviceSummary, which HTML-escapes
+// the model names, and are concatenated into the page markup as-is.
+type validateInventory struct {
+	CPU            string // summary line for CPUs
+	Memory         string // summary line for memory modules
+	Storage        string // summary line for storage devices
+	NVIDIA         string // summary line for NVIDIA GPUs
+	AMD            string // summary line for AMD GPUs
+	NvidiaGPUCount int    // number of PCIe devices classified as NVIDIA GPUs
+	AMDGPUCount    int    // number of PCIe devices classified as AMD GPUs
+}
+
+// validateFmtDur renders an estimated duration for display. Values below two
+// minutes are printed in seconds ("~45 s"); anything longer is printed in
+// minutes, computed as (secs+29)/60 with integer division ("~3 min").
+func validateFmtDur(secs int) string {
+	const minuteThreshold = 120
+	if secs >= minuteThreshold {
+		return fmt.Sprintf("~%d min", (secs+29)/60)
+	}
+	return fmt.Sprintf("~%d s", secs)
+}
+
+// validateTotalValidateSec returns the estimated duration, in seconds, of a
+// full Validate-mode run. n is the number of NVIDIA GPUs; it only decides
+// whether the NVIDIA GPU validate estimate is included (n > 0), the total does
+// not grow with n.
+//
+// NOTE(review): the NVIDIA interconnect and bandwidth estimates are counted
+// even when n <= 0 — confirm this is intended.
+func validateTotalValidateSec(n int) int {
+	sum := platform.SATEstimatedCPUValidateSec
+	sum += platform.SATEstimatedMemoryValidateSec
+	sum += platform.SATEstimatedNvidiaInterconnectSec
+	sum += platform.SATEstimatedNvidiaBandwidthSec
+	if n <= 0 {
+		return sum
+	}
+	return sum + platform.SATEstimatedNvidiaGPUValidateSec
+}
+
+// validateTotalStressSec returns the estimated duration, in seconds, of a full
+// Stress-mode run. n is the number of NVIDIA GPUs; it only decides whether the
+// GPU stress, targeted-stress and targeted-power estimates are included
+// (n > 0), the total does not grow with n.
+//
+// NOTE(review): the NVIDIA pulse, interconnect and bandwidth estimates are
+// counted even when n <= 0 — confirm this is intended.
+func validateTotalStressSec(n int) int {
+	sum := 0
+	for _, sec := range []int{
+		platform.SATEstimatedCPUStressSec,
+		platform.SATEstimatedMemoryStressSec,
+		platform.SATEstimatedNvidiaPulseTestSec,
+		platform.SATEstimatedNvidiaInterconnectSec,
+		platform.SATEstimatedNvidiaBandwidthSec,
+	} {
+		sum += sec
+	}
+	if n <= 0 {
+		return sum
+	}
+	for _, sec := range []int{
+		platform.SATEstimatedNvidiaGPUStressSec,
+		platform.SATEstimatedNvidiaTargetedStressSec,
+		platform.SATEstimatedNvidiaTargetedPowerSec,
+	} {
+		sum += sec
+	}
+	return sum
+}
+
+func renderValidate(opts HandlerOptions) string {
+	inv := loadValidateInventory(opts)
+	n := inv.NvidiaGPUCount
+	validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
+	stressTotalStr := validateFmtDur(validateTotalStressSec(n))
+	gpuNote := ""
+	if n > 0 {
+		gpuNote = fmt.Sprintf(" (%d GPU)", n)
+	}
+	return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
+<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+	<div class="card" style="margin-bottom:16px">
+	  <div class="card-head">Validate Profile</div>
+	  <div class="card-body validate-profile-body">
+	    <div class="validate-profile-col">
+	      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
+	    </div>
+	    <div class="validate-profile-col validate-profile-action">
+	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
+	      <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
+	      <div style="margin-top:12px">
+	        <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
+	      </div>
+	    </div>
+	  </div>
+	</div>
+
+<div class="grid3">
+` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
+		inv.CPU,
+		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
+		`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
+		validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
+	)) +
+		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
+			inv.Memory,
+			`Runs a RAM validation pass and records memory state around the test.`,
+			`<code>free</code>, <code>memtester</code>`,
+			validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
+		)) +
+		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
+			inv.Storage,
+			`Scans all storage devices and runs the matching health or self-test path for each device type.`,
+			`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
+			`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
+		)) +
+		`</div>
+<div style="height:1px;background:var(--border);margin:16px 0"></div>
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">NVIDIA GPU Selection</div>
+  <div class="card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
+    <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
+      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
+    </div>
+    <div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+    </div>
+    <p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
+  </div>
+</div>
+
+<div class="grid3">
+` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
+		inv.NVIDIA,
+		`Runs NVIDIA diagnostics and board inventory checks.`,
+		`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
+		fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
+			validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
+			validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
+	)) +
+		`<div id="sat-card-nvidia-targeted-stress">` +
+		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
+			`<code>dcgmi diag targeted_stress</code>`,
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-targeted-power">` +
+		renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
+			`<code>dcgmi diag targeted_power</code>`,
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-pulse">` +
+		renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
+			`<code>dcgmi diag pulse_test</code>`,
+			`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-interconnect">` +
+		renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
+			`<code>all_reduce_perf</code> (NCCL tests)`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-bandwidth">` +
+		renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
+			`<code>nvbandwidth</code>`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
+		)) +
+		`</div>` +
+		`</div>
+<div class="grid3" style="margin-top:16px">
+` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
+		inv.AMD,
+		`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
+		`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
+		`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
+	)) +
+		`</div>
+<div id="sat-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Test Output <span id="sat-title"></span></div>
+  <div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
+</div>
+<style>
+.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
+.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
+.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
+.validate-card-body { padding:0; }
+.validate-card-section { padding:12px 16px 0; }
+.validate-card-section:last-child { padding-bottom:16px; }
+.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
+</style>
+<script>
+let satES = null;
+// satStressMode reports whether the checked radio button of the "sat-mode"
+// group has the value 'stress'.
+function satStressMode() {
+  const picked = document.querySelector('input[name="sat-mode"]:checked');
+  if (!picked) return false;
+  return picked.value === 'stress';
+}
+// satModeChanged dims the stress-only NVIDIA cards and shows their mode hints
+// while Validate mode is selected; in Stress mode the cards are fully opaque
+// and the hints are hidden.
+function satModeChanged() {
+  // card container id -> id of the hint paragraph inside that card
+  const stressOnlyCards = {
+    'sat-card-nvidia-targeted-stress': 'sat-ts-mode-hint',
+    'sat-card-nvidia-targeted-power': 'sat-tp-mode-hint',
+    'sat-card-nvidia-pulse': 'sat-pt-mode-hint'
+  };
+  const isStress = satStressMode();
+  const cardOpacity = isStress ? '1' : '0.5';
+  const hintDisplay = isStress ? 'none' : '';
+  for (const cardId of Object.keys(stressOnlyCards)) {
+    const cardEl = document.getElementById(cardId);
+    if (!cardEl) continue;
+    cardEl.style.opacity = cardOpacity;
+    const hintEl = document.getElementById(stressOnlyCards[cardId]);
+    if (hintEl) hintEl.style.display = hintDisplay;
+  }
+}
+// satLabels returns a fresh map from SAT target id to the display name used
+// for that target (satRequestBody sends it as display_name).
+function satLabels() {
+  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+}
+let satNvidiaGPUsPromise = null;
+// loadSatNvidiaGPUs fetches /api/gpu/nvidia once and returns a promise for the
+// GPU list (an empty array when the reply is not an array). The promise is
+// cached in satNvidiaGPUsPromise so later callers share the same request.
+function loadSatNvidiaGPUs() {
+  if (!satNvidiaGPUsPromise) {
+    satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
+      .then(r => {
+        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
+        return r.json();
+      })
+      .then(list => Array.isArray(list) ? list : [])
+      .catch(err => {
+        // Do not memoize a failure: drop the cached promise so the next call
+        // issues a new request instead of replaying this rejection forever.
+        satNvidiaGPUsPromise = null;
+        throw err;
+      });
+  }
+  return satNvidiaGPUsPromise;
+}
+// satSelectedGPUIndices returns the numeric values of all checked, enabled
+// .sat-nvidia-checkbox inputs, sorted ascending. Values that do not parse as
+// integers are dropped.
+function satSelectedGPUIndices() {
+  const indices = [];
+  document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(box) {
+    if (!box.checked || box.disabled) return;
+    const idx = parseInt(box.value, 10);
+    if (!Number.isNaN(idx)) indices.push(idx);
+  });
+  indices.sort(function(lhs, rhs) { return lhs - rhs; });
+  return indices;
+}
+// satUpdateGPUSelectionNote refreshes #sat-gpu-selection-note with either the
+// list of selected GPU indices or a prompt to select at least one GPU.
+function satUpdateGPUSelectionNote() {
+  const noteEl = document.getElementById('sat-gpu-selection-note');
+  if (!noteEl) return;
+  const picked = satSelectedGPUIndices();
+  noteEl.textContent = picked.length
+    ? 'Selected GPUs: ' + picked.join(', ') + '. Multi-GPU tests will use all selected GPUs.'
+    : 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
+}
+// satRenderGPUList fills #sat-gpu-list with one pre-checked checkbox row per
+// GPU (or a "none detected" message for an empty list) and then refreshes the
+// selection note.
+//
+// Rows are built with DOM nodes and textContent rather than an HTML string, so
+// a GPU name or index containing markup characters is shown literally instead
+// of being parsed as HTML.
+function satRenderGPUList(gpus) {
+  const root = document.getElementById('sat-gpu-list');
+  if (!root) return;
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    satUpdateGPUSelectionNote();
+    return;
+  }
+  root.textContent = '';
+  gpus.forEach(function(gpu) {
+    const row = document.createElement('label');
+    row.className = 'sat-gpu-row';
+
+    const box = document.createElement('input');
+    box.className = 'sat-nvidia-checkbox';
+    box.type = 'checkbox';
+    box.value = String(gpu.index);
+    box.checked = true;
+    box.addEventListener('change', function() { satUpdateGPUSelectionNote(); });
+
+    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
+    const text = document.createElement('span');
+    const title = document.createElement('strong');
+    title.textContent = 'GPU ' + gpu.index;
+    text.appendChild(title);
+    text.appendChild(document.createTextNode(' — ' + gpu.name + mem));
+
+    row.appendChild(box);
+    row.appendChild(text);
+    root.appendChild(row);
+  });
+  satUpdateGPUSelectionNote();
+}
+// satSelectAllGPUs checks every NVIDIA GPU checkbox and refreshes the note.
+function satSelectAllGPUs() {
+  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
+  for (let i = 0; i < boxes.length; i++) {
+    boxes[i].checked = true;
+  }
+  satUpdateGPUSelectionNote();
+}
+// satSelectNoGPUs unchecks every NVIDIA GPU checkbox and refreshes the note.
+function satSelectNoGPUs() {
+  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
+  for (let i = 0; i < boxes.length; i++) {
+    boxes[i].checked = false;
+  }
+  satUpdateGPUSelectionNote();
+}
+// satLoadGPUs loads the NVIDIA GPU list and renders it into #sat-gpu-list. On
+// failure it shows the error message there instead and refreshes the
+// selection note.
+function satLoadGPUs() {
+  loadSatNvidiaGPUs().then(function(gpus) {
+    satRenderGPUList(gpus);
+  }).catch(function(err) {
+    const root = document.getElementById('sat-gpu-list');
+    if (root) {
+      // The message is assigned through textContent so that error text is
+      // never interpreted as markup.
+      const msg = document.createElement('p');
+      msg.style.cssText = 'color:var(--crit-fg);font-size:13px';
+      msg.textContent = 'Error: ' + err.message;
+      root.textContent = '';
+      root.appendChild(msg);
+    }
+    satUpdateGPUSelectionNote();
+  });
+}
+// satGPUDisplayName formats a GPU as "GPU <index> — <name>". A missing or
+// non-numeric index becomes 0, and a missing name falls back to "GPU <index>".
+function satGPUDisplayName(gpu) {
+  let idx = 0;
+  let label = '';
+  if (gpu) {
+    const parsed = Number(gpu.index);
+    if (Number.isFinite(parsed)) idx = parsed;
+    if (gpu.name) label = gpu.name;
+  }
+  if (!label) label = 'GPU ' + idx;
+  return 'GPU ' + idx + ' — ' + label;
+}
+// satRequestBody builds the JSON payload for a SAT run request: display_name,
+// stress_mode and, for the cpu target, a duration (1800 in Stress mode,
+// otherwise 60). Keys in overrides are applied last and replace the defaults.
+function satRequestBody(target, overrides) {
+  const stress = satStressMode();
+  const payload = {
+    display_name: satLabels()[target] || ('Validate ' + target),
+    stress_mode: stress
+  };
+  if (target === 'cpu') {
+    payload.duration = stress ? 1800 : 60;
+  }
+  if (overrides) {
+    for (const key of Object.keys(overrides)) {
+      payload[key] = overrides[key];
+    }
+  }
+  return payload;
+}
+// enqueueSATTarget POSTs the request body for target to /api/sat/<target>/run
+// and returns a promise for the parsed JSON reply.
+function enqueueSATTarget(target, overrides) {
+  const request = {
+    method: 'POST',
+    headers: {'Content-Type': 'application/json'},
+    body: JSON.stringify(satRequestBody(target, overrides))
+  };
+  return fetch('/api/sat/' + target + '/run', request).then(function(resp) {
+    return resp.json();
+  });
+}
+// streamSATTask shows the output card and streams the log of task taskId into
+// #sat-terminal through an EventSource on /api/tasks/<taskId>/stream.
+// When resetTerminal is truthy the terminal is cleared first.
+// Returns a promise that resolves (never rejects) with {ok, error}: from the
+// 'done' event (ok is true when the event carries no data) or with
+// 'stream disconnected' when the stream errors.
+function streamSATTask(taskId, title, resetTerminal) {
+  // Only one stream is kept open at a time; close any previous one.
+  if (satES) { satES.close(); satES = null; }
+  document.getElementById('sat-output').style.display='block';
+  document.getElementById('sat-title').textContent = '— ' + title;
+  const term = document.getElementById('sat-terminal');
+  if (resetTerminal) {
+    term.textContent = '';
+  }
+  term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
+  return new Promise(function(resolve) {
+    satES = new EventSource('/api/tasks/' + taskId + '/stream');
+    // Each message is appended as one log line, keeping the view scrolled to the end.
+    satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+    // 'done' ends the task: non-empty data is treated as the error text.
+    satES.addEventListener('done', function(e) {
+      satES.close();
+      satES = null;
+      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+      term.scrollTop = term.scrollHeight;
+      resolve({ok: !e.data, error: e.data || ''});
+    });
+    satES.onerror = function() {
+      // Close explicitly so the stream is not left open after an error.
+      if (satES) {
+        satES.close();
+        satES = null;
+      }
+      term.textContent += '\nERROR: stream disconnected.\n';
+      term.scrollTop = term.scrollHeight;
+      resolve({ok: false, error: 'stream disconnected'});
+    };
+  });
+}
+// selectedAMDValidateTargets returns the AMD target ids ('amd', 'amd-mem',
+// 'amd-bandwidth', in that order) whose checkbox exists, is checked and is
+// not disabled.
+function selectedAMDValidateTargets() {
+  const options = [
+    ['sat-amd-target', 'amd'],
+    ['sat-amd-mem-target', 'amd-mem'],
+    ['sat-amd-bandwidth-target', 'amd-bandwidth']
+  ];
+  return options.filter(function(opt) {
+    const box = document.getElementById(opt[0]);
+    return box && box.checked && !box.disabled;
+  }).map(function(opt) {
+    return opt[1];
+  });
+}
+// runSAT enqueues and streams a single target with no request overrides.
+function runSAT(target) {
+  return runSATWithOverrides(target, null);
+}
+// runSATWithOverrides enqueues one SAT target (with optional request
+// overrides) and streams its task log into the output terminal.
+// Returns a promise that resolves with the streamSATTask result, or with
+// {ok: false, error} when the task could not be enqueued.
+function runSATWithOverrides(target, overrides) {
+  const title = (overrides && overrides.display_name) || target;
+  const term = document.getElementById('sat-terminal');
+  document.getElementById('sat-output').style.display='block';
+  document.getElementById('sat-title').textContent = '— ' + title;
+  term.textContent = 'Enqueuing ' + title + ' test...\n';
+  return enqueueSATTarget(target, overrides)
+    .then(function(d) {
+      // Without a task id there is nothing to stream; opening
+      // /api/tasks/undefined/stream would only produce a misleading
+      // "stream disconnected" error.
+      if (!d || d.task_id === undefined || d.task_id === null) {
+        throw new Error((d && d.error) || 'no task id returned');
+      }
+      return streamSATTask(d.task_id, title, false);
+    })
+    .catch(function(err) {
+      // Show enqueue failures in the terminal instead of leaving an
+      // unhandled rejection and a terminal stuck on "Enqueuing ...".
+      term.textContent += '\nERROR: ' + err.message + '\n';
+      term.scrollTop = term.scrollHeight;
+      return {ok: false, error: err.message};
+    });
+}
+// Targets that expandSATTarget would split into one task per selected GPU; currently none.
+const nvidiaPerGPUTargets = [];
+// Targets that expandSATTarget turns into a single task covering all selected GPUs.
+const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+// satAllGPUIndicesForMulti returns a promise, already resolved, for the
+// currently selected GPU indices.
+function satAllGPUIndicesForMulti() {
+  return Promise.resolve(satSelectedGPUIndices());
+}
+// expandSATTarget resolves to the list of {target, overrides} items to enqueue
+// for target:
+//   - all-GPU NVIDIA targets: one item carrying every selected GPU index
+//     (rejects when no GPU is selected);
+//   - per-GPU NVIDIA targets: one item per selected GPU (rejects when no GPU
+//     is selected);
+//   - anything else: a single item without overrides.
+function expandSATTarget(target) {
+  const isAllGPU = nvidiaAllGPUTargets.indexOf(target) >= 0;
+  const isPerGPU = nvidiaPerGPUTargets.indexOf(target) >= 0;
+
+  if (isAllGPU) {
+    return satAllGPUIndicesForMulti().then(function(indices) {
+      if (indices.length === 0) {
+        throw new Error('No NVIDIA GPUs available.');
+      }
+      const name = satLabels()[target] || target;
+      return [{target: target, overrides: {gpu_indices: indices, display_name: name}}];
+    });
+  }
+  if (!isPerGPU) {
+    return Promise.resolve([{target: target}]);
+  }
+
+  const picked = satSelectedGPUIndices();
+  if (picked.length === 0) {
+    return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
+  }
+  return loadSatNvidiaGPUs().then(function(gpus) {
+    const items = [];
+    gpus.forEach(function(gpu) {
+      const idx = Number(gpu.index);
+      if (picked.indexOf(idx) < 0) return;
+      const gpuLabel = satGPUDisplayName(gpu);
+      items.push({
+        target: target,
+        overrides: {
+          gpu_indices: [idx],
+          display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + gpuLabel + ')'
+        },
+        label: gpuLabel
+      });
+    });
+    return items;
+  });
+}
+// runNvidiaFabricValidate runs a multi-GPU NVIDIA target on all selected GPUs,
+// or alerts when none is selected. It does not return the run promise.
+function runNvidiaFabricValidate(target) {
+  satAllGPUIndicesForMulti().then(function(gpuIndices) {
+    if (gpuIndices.length === 0) {
+      alert('No NVIDIA GPUs available.');
+      return;
+    }
+    const overrides = {gpu_indices: gpuIndices, display_name: satLabels()[target] || target};
+    runSATWithOverrides(target, overrides);
+  });
+}
+// runNvidiaValidateSet runs an NVIDIA target on the selected GPUs and returns
+// the run promise; with no GPU selected it alerts and returns undefined.
+function runNvidiaValidateSet(target) {
+  const gpuIndices = satSelectedGPUIndices();
+  if (gpuIndices.length > 0) {
+    const name = satLabels()[target] || target;
+    return runSATWithOverrides(target, {gpu_indices: gpuIndices, display_name: name});
+  }
+  alert('Select at least one NVIDIA GPU.');
+}
+// runAMDValidateSet runs the checked AMD targets. A single target is delegated
+// to runSAT; several targets are enqueued and streamed one after another into
+// the shared terminal. A target that cannot be enqueued is reported in the
+// terminal and the sequence moves on to the next one.
+function runAMDValidateSet() {
+  const targets = selectedAMDValidateTargets();
+  if (!targets.length) return;
+  if (targets.length === 1) return runSAT(targets[0]);
+  document.getElementById('sat-output').style.display='block';
+  document.getElementById('sat-title').textContent = '— amd';
+  const term = document.getElementById('sat-terminal');
+  term.textContent = 'Running AMD validate set one by one...\n';
+  const labels = satLabels();
+  const runNext = (idx) => {
+    if (idx >= targets.length) return Promise.resolve();
+    const target = targets[idx];
+    term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
+    return enqueueSATTarget(target)
+      .then(d => {
+        // Without a task id there is no stream to follow.
+        if (!d || d.task_id === undefined || d.task_id === null) {
+          throw new Error((d && d.error) || 'no task id returned');
+        }
+        return streamSATTask(d.task_id, labels[target], false);
+      })
+      .catch(err => {
+        // Report the failure and fall through to the next target, matching
+        // how a failed stream (resolved with ok:false) is already handled.
+        term.textContent += '\nERROR: ' + err.message + '\n';
+        term.scrollTop = term.scrollHeight;
+      })
+      .then(() => runNext(idx + 1));
+  };
+  return runNext(0);
+}
+// runAllSAT enqueues every applicable target in order and reports progress in
+// #sat-all-status. Stress-only targets are skipped outside Stress mode, and
+// targets whose run button is disabled are skipped as well. Tasks are only
+// enqueued here; their logs are not streamed.
+function runAllSAT() {
+  const cycles = 1;
+  const status = document.getElementById('sat-all-status');
+  status.textContent = 'Enqueuing...';
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
+  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
+  const activeTargets = baseTargets.filter(target => {
+    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
+    const btn = document.getElementById('sat-btn-' + target);
+    return !(btn && btn.disabled);
+  });
+  Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
+    const expanded = [];
+    for (let cycle = 0; cycle < cycles; cycle++) {
+      groups.forEach(group => group.forEach(item => expanded.push(item)));
+    }
+    const total = expanded.length;
+    if (!total) {
+      status.textContent = 'No tasks selected.';
+      return;
+    }
+    const runNext = (idx) => {
+      if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
+      const item = expanded[idx];
+      status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
+      return enqueueSATTarget(item.target, item.overrides)
+        .then(d => {
+          // A reply without a task id means the task was not enqueued; stop
+          // and show the error rather than counting it as done.
+          if (!d || d.task_id === undefined || d.task_id === null) {
+            throw new Error((d && d.error) || ('could not enqueue ' + item.target));
+          }
+          return runNext(idx + 1);
+        });
+    };
+    return runNext(0);
+  }).catch(err => {
+    status.textContent = 'Error: ' + err.message;
+  });
+}
+</script>
+<script>
+// Disable the cards of GPU vendors that /api/gpu/presence reports as absent.
+fetch('/api/gpu/presence').then(function(resp) { return resp.json(); }).then(function(gp) {
+    if (!gp.nvidia) {
+        ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'].forEach(function(id) {
+            disableSATCard(id, 'No NVIDIA GPU detected');
+        });
+    }
+    if (!gp.amd) {
+        disableSATCard('amd', 'No AMD GPU detected');
+        disableSATAMDOptions('No AMD GPU detected');
+    }
+});
+satLoadGPUs();
+// disableSATAMDOptions unchecks and disables the three AMD option checkboxes
+// and sets reason as their tooltip.
+function disableSATAMDOptions(reason) {
+    const ids = ['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'];
+    for (const id of ids) {
+        const box = document.getElementById(id);
+        if (box) {
+            box.disabled = true;
+            box.checked = false;
+            box.title = reason;
+        }
+    }
+}
+// disableSATCard disables the run button 'sat-btn-<id>', dims it, and shows
+// reason as its tooltip and as a .sat-unavail note at the top of the
+// enclosing card's body (the note is created on first use).
+function disableSATCard(id, reason) {
+    const button = document.getElementById('sat-btn-' + id);
+    if (!button) return;
+    button.disabled = true;
+    button.title = reason;
+    button.style.opacity = '0.4';
+
+    const cardEl = button.closest('.card');
+    if (!cardEl) return;
+    let noteEl = cardEl.querySelector('.sat-unavail');
+    if (!noteEl) {
+        noteEl = document.createElement('p');
+        noteEl.className = 'sat-unavail';
+        noteEl.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
+        const bodyEl = cardEl.querySelector('.card-body');
+        if (bodyEl) bodyEl.insertBefore(noteEl, bodyEl.firstChild);
+    }
+    noteEl.textContent = reason;
+}
+</script>`
+}
+
+// loadValidateInventory reads the audit snapshot at opts.AuditPath and
+// summarises the CPUs, memory modules, storage devices and NVIDIA/AMD GPUs it
+// lists. Devices whose Present flag is explicitly false are ignored. If the
+// snapshot cannot be loaded or decoded, every summary is the placeholder
+// "Audit snapshot not loaded." and the GPU counts are zero.
+func loadValidateInventory(opts HandlerOptions) validateInventory {
+	const notLoaded = "Audit snapshot not loaded."
+	inv := validateInventory{
+		CPU:     notLoaded,
+		Memory:  notLoaded,
+		Storage: notLoaded,
+		NVIDIA:  notLoaded,
+		AMD:     notLoaded,
+	}
+
+	raw, err := loadSnapshot(opts.AuditPath)
+	if err != nil {
+		return inv
+	}
+	var snap schema.HardwareIngestRequest
+	if json.Unmarshal(raw, &snap) != nil {
+		return inv
+	}
+
+	// absent is true only when the flag is set and false; nil counts as present.
+	absent := func(present *bool) bool { return present != nil && !*present }
+
+	var cpuN, memN, storageN, nvidiaN, amdN int
+	cpuModels := map[string]int{}
+	memModels := map[string]int{}
+	storageModels := map[string]int{}
+	nvidiaModels := map[string]int{}
+	amdModels := map[string]int{}
+
+	for _, cpu := range snap.Hardware.CPUs {
+		if absent(cpu.Present) {
+			continue
+		}
+		cpuN++
+		label := validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown")
+		addValidateModel(cpuModels, label)
+	}
+
+	for _, dimm := range snap.Hardware.Memory {
+		if absent(dimm.Present) {
+			continue
+		}
+		memN++
+		label := validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown")
+		addValidateModel(memModels, label)
+	}
+
+	for _, disk := range snap.Hardware.Storage {
+		if absent(disk.Present) {
+			continue
+		}
+		storageN++
+		label := validateFirstNonEmpty(validateTrimPtr(disk.Model), validateTrimPtr(disk.Manufacturer), "unknown")
+		addValidateModel(storageModels, label)
+	}
+
+	for _, pci := range snap.Hardware.PCIeDevices {
+		if absent(pci.Present) {
+			continue
+		}
+		label := validateFirstNonEmpty(validateTrimPtr(pci.Model), validateTrimPtr(pci.Manufacturer), "unknown")
+		if validateIsVendorGPU(pci, "nvidia") {
+			nvidiaN++
+			addValidateModel(nvidiaModels, label)
+		}
+		if validateIsVendorGPU(pci, "amd") {
+			amdN++
+			addValidateModel(amdModels, label)
+		}
+	}
+
+	inv.CPU = formatValidateDeviceSummary(cpuN, cpuModels, "CPU")
+	inv.Memory = formatValidateDeviceSummary(memN, memModels, "module")
+	inv.Storage = formatValidateDeviceSummary(storageN, storageModels, "device")
+	inv.NVIDIA = formatValidateDeviceSummary(nvidiaN, nvidiaModels, "GPU")
+	inv.AMD = formatValidateDeviceSummary(amdN, amdModels, "GPU")
+	inv.NvidiaGPUCount = nvidiaN
+	inv.AMDGPUCount = amdN
+	return inv
+}
+
+// renderValidateCardBody returns the four stacked sections of a validate card
+// body, in order: devices (muted), description, commands and settings (muted).
+// The arguments are inserted verbatim, so they must already be safe HTML.
+func renderValidateCardBody(devices, description, commands, settings string) string {
+	const muted = ";color:var(--muted)"
+	section := func(extraStyle, content string) string {
+		return `<div class="validate-card-section"><div style="font-size:13px` + extraStyle + `">` + content + `</div></div>`
+	}
+	return section(muted, devices) +
+		section("", description) +
+		section("", commands) +
+		section(muted, settings)
+}
+
+// formatValidateDeviceSummary renders a one-line device summary.
+//
+//   - total == 0:      "0 <unit>s detected."
+//   - a single model:  "<count> x <model> <unit(s)>"
+//   - otherwise:       "<total> <unit(s)>: <count> x <model>, ..."
+//
+// Models are listed in sorted order, model names are HTML-escaped, and the
+// unit is pluralised with "s" whenever total is not 1.
+func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
+	if total == 0 {
+		return "0 " + unit + "s detected."
+	}
+
+	names := make([]string, 0, len(models))
+	for name := range models {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+
+	entries := make([]string, len(names))
+	for i, name := range names {
+		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
+	}
+
+	noun := unit
+	if total != 1 {
+		noun = unit + "s"
+	}
+	if len(entries) != 1 {
+		return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
+	}
+	return entries[0] + " " + noun
+}
+
+// addValidateModel increments the counter for name in counts. The name is
+// trimmed first; a blank name is counted under "unknown".
+func addValidateModel(counts map[string]int, name string) {
+	key := "unknown"
+	if trimmed := strings.TrimSpace(name); trimmed != "" {
+		key = trimmed
+	}
+	counts[key] = counts[key] + 1
+}
+
+// validateTrimPtr returns the string value points to with surrounding
+// whitespace removed, or "" when value is nil.
+func validateTrimPtr(value *string) string {
+	var s string
+	if value != nil {
+		s = *value
+	}
+	return strings.TrimSpace(s)
+}
+
+// validateFirstNonEmpty returns the first argument that is non-empty after
+// trimming whitespace (in its trimmed form), or "" when there is none.
+func validateFirstNonEmpty(values ...string) string {
+	for i := range values {
+		if candidate := strings.TrimSpace(values[i]); candidate != "" {
+			return candidate
+		}
+	}
+	return ""
+}
+
+// validateIsVendorGPU reports whether dev is a GPU of the given vendor
+// ("nvidia" or "amd"). A device qualifies when its PCI vendor ID matches that
+// vendor and its device class is, case-insensitively, "videocontroller",
+// "processingaccelerator" or "displaycontroller". Devices without a vendor ID,
+// ASPEED devices and any other vendor string yield false.
+func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
+	if dev.VendorID == nil {
+		return false
+	}
+	id := *dev.VendorID
+	if id == pciVendorAspeed {
+		return false
+	}
+
+	switch vendor {
+	case "nvidia":
+		if id != pciVendorNvidia {
+			return false
+		}
+	case "amd":
+		if id != pciVendorAMD {
+			return false
+		}
+	default:
+		return false
+	}
+
+	switch strings.ToLower(validateTrimPtr(dev.DeviceClass)) {
+	case "videocontroller", "processingaccelerator", "displaycontroller":
+		return true
+	}
+	return false
+}
+
+// renderCheck renders the non-destructive Check page (step 2).
+// Shows validate-mode tests only: CPU, Memory, Storage, NVIDIA L2, NCCL, NVBandwidth, AMD.
+// Stress-mode tests (targeted-stress, targeted-power, pulse) are on the Load page.
+func renderCheck(opts HandlerOptions) string {
+	inv := loadValidateInventory(opts)
+	n := inv.NvidiaGPUCount
+	validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
+	gpuNote := ""
+	if n > 0 {
+		gpuNote = fmt.Sprintf(" (%d GPU)", n)
+	}
+	return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to <a href="/load">3. Load</a>.</div>
+<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
+  <button type="button" class="btn btn-primary" onclick="runAllCheckSAT()">Run All Checks</button>
+  <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
+  <span style="font-size:12px;color:var(--muted)">est. ` + validateTotalStr + gpuNote + `</span>
+</div>
+
+<div class="grid3">
+` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
+		inv.CPU,
+		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
+		`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
+		validateFmtDur(platform.SATEstimatedCPUValidateSec)+` (stress-ng 60 s).`,
+	)) +
+		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
+			inv.Memory,
+			`Runs a RAM validation pass and records memory state around the test.`,
+			`<code>free</code>, <code>memtester</code>`,
+			validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` (256 MB × 1 pass).`,
+		)) +
+		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
+			inv.Storage,
+			`Scans all storage devices and runs the matching health or self-test path for each.`,
+			`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
+			`Seconds (NVMe: instant device query; SATA/SAS: short self-test).`,
+		)) +
+		`</div>
+<div style="height:1px;background:var(--border);margin:16px 0"></div>
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">NVIDIA GPU Selection</div>
+  <div class="card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
+    <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
+      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
+    </div>
+    <div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+    </div>
+    <p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA check tasks.</p>
+  </div>
+</div>
+
+<div class="grid3">
+` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
+		inv.NVIDIA,
+		`Runs NVIDIA diagnostics and board inventory checks (DCGM Level 2).`,
+		`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
+		validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec)+` (Level 2, all GPUs simultaneously).`,
+	)) +
+		renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs.`,
+			`<code>all_reduce_perf</code> (NCCL tests)`,
+			validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
+		)) +
+		renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
+			`<code>nvbandwidth</code>`,
+			validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously).`,
+		)) +
+		`</div>
+<div class="grid3" style="margin-top:16px">
+` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
+		inv.AMD,
+		`Runs AMD GPU inventory, MEM integrity, and MEM bandwidth checks.`,
+		`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
+		`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
+	)) +
+		`</div>
+<div id="sat-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Test Output <span id="sat-title"></span></div>
+  <div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
+</div>
+<style>
+.validate-card-body { padding:0; }
+.validate-card-section { padding:12px 16px 0; }
+.validate-card-section:last-child { padding-bottom:16px; }
+.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
+.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+</style>
+<script>
+let satES = null;
+function satLabels() {
+  return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+}
+let satNvidiaGPUsPromise = null;
+function loadSatNvidiaGPUs() {
+  if (!satNvidiaGPUsPromise) {
+    satNvidiaGPUsPromise = fetch('/api/gpu/nvidia').then(r => {
+      if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
+      return r.json();
+    }).then(list => Array.isArray(list) ? list : []);
+  }
+  return satNvidiaGPUsPromise;
+}
+function satSelectedGPUIndices() {
+  return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
+    .filter(el => el.checked && !el.disabled)
+    .map(el => parseInt(el.value, 10))
+    .filter(v => !Number.isNaN(v))
+    .sort((a, b) => a - b);
+}
+function satUpdateGPUSelectionNote() {
+  const note = document.getElementById('sat-gpu-selection-note');
+  if (!note) return;
+  const sel = satSelectedGPUIndices();
+  note.textContent = sel.length
+    ? 'Selected GPUs: ' + sel.join(', ') + '. Multi-GPU tests will use all selected GPUs.'
+    : 'Select at least one NVIDIA GPU to enable NVIDIA check tasks.';
+}
+function satRenderGPUList(gpus) {
+  const root = document.getElementById('sat-gpu-list');
+  if (!root) return;
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    satUpdateGPUSelectionNote(); return;
+  }
+  root.innerHTML = gpus.map(gpu => {
+    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
+    return '<label class="sat-gpu-row"><input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()"><span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span></label>';
+  }).join('');
+  satUpdateGPUSelectionNote();
+}
+function satSelectAllGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = true; }); satUpdateGPUSelectionNote(); }
+function satSelectNoGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = false; }); satUpdateGPUSelectionNote(); }
+function satGPULoadInit() {
+  loadSatNvidiaGPUs().then(satRenderGPUList).catch(err => {
+    const root = document.getElementById('sat-gpu-list');
+    if (root) root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
+    satUpdateGPUSelectionNote();
+  });
+}
+function satRequestBody(target, overrides) {
+  const body = {};
+  const labels = satLabels();
+  body.display_name = labels[target] || ('Check ' + target);
+  body.stress_mode = false;
+  if (target === 'cpu') body.duration = 60;
+  if (overrides) Object.keys(overrides).forEach(k => { body[k] = overrides[k]; });
+  return body;
+}
+function enqueueSATTarget(target, overrides) {
+  return fetch('/api/sat/' + target + '/run', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify(satRequestBody(target, overrides))}).then(r => r.json());
+}
+function streamSATTask(taskId, title, resetTerminal) {
+  if (satES) { satES.close(); satES = null; }
+  document.getElementById('sat-output').style.display = 'block';
+  document.getElementById('sat-title').textContent = '— ' + title;
+  const term = document.getElementById('sat-terminal');
+  if (resetTerminal) term.textContent = '';
+  term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
+  return new Promise(resolve => {
+    satES = new EventSource('/api/tasks/' + taskId + '/stream');
+    satES.onmessage = e => { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+    satES.addEventListener('done', e => {
+      satES.close(); satES = null;
+      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+      term.scrollTop = term.scrollHeight;
+      resolve({ok: !e.data, error: e.data || ''});
+    });
+    satES.onerror = () => {
+      if (satES) { satES.close(); satES = null; }
+      term.textContent += '\nERROR: stream disconnected.\n';
+      term.scrollTop = term.scrollHeight;
+      resolve({ok: false, error: 'stream disconnected'});
+    };
+  });
+}
+function selectedAMDValidateTargets() {
+  const targets = [];
+  const gpu = document.getElementById('sat-amd-target');
+  const mem = document.getElementById('sat-amd-mem-target');
+  const bw = document.getElementById('sat-amd-bandwidth-target');
+  if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
+  if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
+  if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
+  return targets;
+}
+function runSAT(target) { return runSATWithOverrides(target, null); }
+function runSATWithOverrides(target, overrides) {
+  const title = (overrides && overrides.display_name) || target;
+  document.getElementById('sat-output').style.display = 'block';
+  document.getElementById('sat-title').textContent = '— ' + title;
+  const term = document.getElementById('sat-terminal');
+  term.textContent = 'Enqueuing ' + title + ' test...\n';
+  return enqueueSATTarget(target, overrides).then(d => streamSATTask(d.task_id, title, false));
+}
+function runNvidiaFabricValidate(target) {
+  const indices = satSelectedGPUIndices();
+  if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
+  runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
+}
+function runNvidiaValidateSet(target) {
+  const sel = satSelectedGPUIndices();
+  if (!sel.length) { alert('Select at least one NVIDIA GPU.'); return; }
+  return runSATWithOverrides(target, {gpu_indices: sel, display_name: satLabels()[target] || target});
+}
+function runAMDValidateSet() {
+  const targets = selectedAMDValidateTargets();
+  if (!targets.length) return;
+  if (targets.length === 1) return runSAT(targets[0]);
+  const term = document.getElementById('sat-terminal');
+  document.getElementById('sat-output').style.display = 'block';
+  document.getElementById('sat-title').textContent = '— amd';
+  term.textContent = 'Running AMD check set...\n';
+  const labels = satLabels();
+  const runNext = idx => {
+    if (idx >= targets.length) return Promise.resolve();
+    const t = targets[idx];
+    term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[t] + '\n';
+    return enqueueSATTarget(t).then(d => streamSATTask(d.task_id, labels[t], false)).then(() => runNext(idx + 1));
+  };
+  return runNext(0);
+}
+function runAllCheckSAT() {
+  const status = document.getElementById('sat-all-status');
+  status.textContent = 'Enqueuing...';
+  const nvidiaIndices = satSelectedGPUIndices();
+  const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
+  const baseTargets = ['cpu', 'memory', 'storage'];
+  const amdTargets = selectedAMDValidateTargets();
+  const expanded = [];
+  baseTargets.forEach(t => expanded.push({target: t}));
+  if (nvidiaIndices.length) {
+    nvidiaAllTargets.forEach(t => {
+      const btn = document.getElementById('sat-btn-' + t);
+      if (!(btn && btn.disabled)) expanded.push({target: t, overrides: {gpu_indices: nvidiaIndices, display_name: satLabels()[t] || t}});
+    });
+  }
+  amdTargets.forEach(t => expanded.push({target: t}));
+  if (!expanded.length) { status.textContent = 'No tasks selected.'; return; }
+  const total = expanded.length;
+  const runNext = idx => {
+    if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
+    const item = expanded[idx];
+    status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
+    return enqueueSATTarget(item.target, item.overrides).then(() => runNext(idx + 1));
+  };
+  runNext(0).catch(err => { status.textContent = 'Error: ' + err.message; });
+}
+function disableSATCard(id, reason) {
+  const btn = document.getElementById('sat-btn-' + id);
+  if (!btn) return;
+  btn.disabled = true; btn.title = reason; btn.style.opacity = '0.4';
+  const card = btn.closest('.card');
+  if (card) {
+    let note = card.querySelector('.sat-unavail');
+    if (!note) {
+      note = document.createElement('p');
+      note.className = 'sat-unavail';
+      note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
+      const body = card.querySelector('.card-body');
+      if (body) body.insertBefore(note, body.firstChild);
+    }
+    note.textContent = reason;
+  }
+}
+fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
+  if (!gp.nvidia) ['nvidia','nvidia-interconnect','nvidia-bandwidth'].forEach(t => disableSATCard(t, 'No NVIDIA GPU detected'));
+  if (!gp.amd) {
+    disableSATCard('amd', 'No AMD GPU detected');
+    ['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(id => {
+      const cb = document.getElementById(id);
+      if (cb) { cb.disabled = true; cb.checked = false; }
+    });
+  }
+});
+satGPULoadInit();
+</script>`
+}
+
+// renderSATCard builds the HTML for one check card: a header holding the
+// label and a "Run" button (element id "sat-btn-<id>", onclick set to
+// runAction) followed by any extra headerActions markup, and below it the
+// supplied body. All arguments are inserted verbatim, without HTML escaping.
+func renderSATCard(id, label, runAction, headerActions, body string) string {
+	buttons := fmt.Sprintf(`<button id="sat-btn-%s" class="btn btn-primary btn-sm" onclick="%s">Run</button>`, id, runAction)
+	// Whitespace-only header actions are treated as absent; anything else is
+	// appended untrimmed.
+	if strings.TrimSpace(headerActions) != "" {
+		buttons += headerActions
+	}
+
+	var card strings.Builder
+	card.WriteString(`<div class="card"><div class="card-head card-head-actions"><span>`)
+	card.WriteString(label)
+	card.WriteString(`</span><div class="card-head-buttons">`)
+	card.WriteString(buttons)
+	card.WriteString(`</div></div><div class="card-body validate-card-body">`)
+	card.WriteString(body)
+	card.WriteString(`</div></div>`)
+	return card.String()
+}
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
--- a/audit/internal/webui/saa_dmi.go
+++ b/audit/internal/webui/saa_dmi.go
@@ -0,0 +1,301 @@
+package webui
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"regexp"
+	"strings"
+	"time"
+)
+
+// dmiField is one DMI item parsed from DMI.txt, in the JSON shape returned to
+// the UI.
+type dmiField struct {
+	Name  string `json:"name"`  // display name: "Section / Item Name"
+	Shn   string `json:"shn"`   // short name taken from the {SHN} marker
+	Value string `json:"value"` // value without trailing comment or surrounding quotes
+}
+
+// saaChange is one requested edit: set the item identified by Shn to Value.
+type saaChange struct {
+	Shn   string `json:"shn"`
+	Value string `json:"value"`
+}
+
+var (
+	// shnRE validates an SHN supplied by a client: 1-16 letters, digits or
+	// underscores. (It is slightly wider than the SHN group of dmiItemRE,
+	// which does not accept underscores.)
+	shnRE = regexp.MustCompile(`^[A-Za-z0-9_]{1,16}$`)
+	// dmiSectionRE matches a whole "[Section]" line and captures the name.
+	dmiSectionRE = regexp.MustCompile(`^\[(.+?)\]$`)
+	// Item Name   {SHN}   = value   // comment
+	// Captures: 1 = item name, 2 = SHN, 3 = everything after "=".
+	dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9]{1,16})\}\s*=\s*(.*)$`)
+	// dmiVersionRE matches the file-level "Version = ..." line (any case). An
+	// item named "Version" does not match because its {SHN} precedes the "=".
+	dmiVersionRE = regexp.MustCompile(`(?i)^version\s*=`)
+)
+
+// parseDMIFile parses the DMI.txt produced by "saa GetDmiInfo".
+// Real format (from SAA User Guide 4.8.1):
+//
+//	[System]
+//	Version         {SYVS} = "A Version"   // string value
+//	Serial Number   {SYSN} = $DEFAULT$     // string value
+//	UUID            {SYUU} = 00112233-...  // hex value
+//
+// Blank lines, lines starting with "//" or "#", the file-level
+// "Version = ..." line, and any line that is neither a section header nor an
+// item are skipped. Each returned field carries the SHN, the value as
+// extracted by dmiItemValue, and a display name of the form
+// "Section / Item Name" (just "Item Name" before the first section header).
+// The result is nil when no item line is found.
+func parseDMIFile(content string) []dmiField {
+	var fields []dmiField
+	currentSection := ""
+	for _, line := range strings.Split(content, "\n") {
+		// TrimSpace also removes the "\r" left over from CRLF line endings.
+		line = strings.TrimSpace(line)
+		if line == "" || strings.HasPrefix(line, "//") || strings.HasPrefix(line, "#") {
+			continue
+		}
+		if dmiVersionRE.MatchString(line) {
+			continue
+		}
+		if m := dmiSectionRE.FindStringSubmatch(line); m != nil {
+			currentSection = strings.TrimSpace(m[1])
+			continue
+		}
+		m := dmiItemRE.FindStringSubmatch(line)
+		if m == nil {
+			continue
+		}
+		itemName := strings.TrimSpace(m[1])
+		displayName := itemName
+		if currentSection != "" {
+			displayName = currentSection + " / " + itemName
+		}
+		fields = append(fields, dmiField{Name: displayName, Shn: m[2], Value: dmiItemValue(m[3])})
+	}
+	return fields
+}
+
+// dmiItemValue extracts the value from the text following "=" on an item
+// line, dropping a trailing "// comment" and the double quotes around string
+// values.
+//
+// Quoted values are handled first so that a "//" inside the quotes is kept:
+// the closing quote is the first one followed only by whitespace and an
+// optional comment. For unquoted values (and quoted ones that are never
+// closed) the comment starts at the first "//" that is at the start of the
+// text or preceded by a space or tab, so "http://x" is left intact.
+func dmiItemValue(raw string) string {
+	raw = strings.TrimSpace(raw)
+	if strings.HasPrefix(raw, `"`) {
+		for i := 1; i < len(raw); i++ {
+			if raw[i] != '"' {
+				continue
+			}
+			rest := strings.TrimSpace(raw[i+1:])
+			if rest == "" || strings.HasPrefix(rest, "//") {
+				return raw[1:i]
+			}
+		}
+	}
+	for i := 0; i+1 < len(raw); i++ {
+		if raw[i] == '/' && raw[i+1] == '/' && (i == 0 || raw[i-1] == ' ' || raw[i-1] == '\t') {
+			return strings.TrimSpace(raw[:i])
+		}
+	}
+	return raw
+}
+
+// handleAPISAADMIRead responds with the machine's current DMI fields as JSON.
+//
+// It runs "saa -c GetDmiInfo" into a throwaway temp directory, bounded by a
+// 30-second timeout derived from the request context, parses the resulting
+// DMI.txt with parseDMIFile and writes the fields with writeJSON. Every
+// failure, including a dump that yields no fields, is reported through
+// writeError with status 500.
+func (h *handler) handleAPISAADMIRead(w http.ResponseWriter, r *http.Request) {
+	ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
+	defer cancel()
+
+	// fail reports msg as an internal server error.
+	fail := func(msg string) {
+		writeError(w, http.StatusInternalServerError, msg)
+	}
+
+	workDir, err := os.MkdirTemp("", "bee-saa-*")
+	if err != nil {
+		fail("create temp dir: " + err.Error())
+		return
+	}
+	defer os.RemoveAll(workDir)
+
+	dumpPath := filepath.Join(workDir, "DMI.txt")
+	saaCmd := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dumpPath, "--overwrite")
+	if output, runErr := saaCmd.CombinedOutput(); runErr != nil {
+		// Prefer what saa printed; fall back to the exec error when it
+		// printed nothing.
+		detail := strings.TrimSpace(string(output))
+		if detail == "" {
+			detail = runErr.Error()
+		}
+		fail("saa GetDmiInfo: " + detail)
+		return
+	}
+
+	dump, err := os.ReadFile(dumpPath)
+	if err != nil {
+		fail("read DMI file: " + err.Error())
+		return
+	}
+
+	parsed := parseDMIFile(string(dump))
+	if len(parsed) == 0 {
+		fail("no DMI fields found (file may be empty — reboot the server and try again)")
+		return
+	}
+	writeJSON(w, parsed)
+}
+
+// handleAPISAADMIWrite validates a JSON body of the form
+// {"changes":[{"shn":"...","value":"..."}]} and enqueues a "saa-dmi-write"
+// task carrying those changes; the response is {"task_id": <id>}.
+//
+// A body that cannot be decoded is answered with 400. The request is answered
+// with 422 when it has no changes, or when a change has an SHN that does not
+// match shnRE, a value whose byte length is outside 1..64, or a value with a
+// character outside printable ASCII (0x20-0x7E). Validation stops at the
+// first offending change.
+func (h *handler) handleAPISAADMIWrite(w http.ResponseWriter, r *http.Request) {
+	var payload struct {
+		Changes []saaChange `json:"changes"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
+		writeError(w, http.StatusBadRequest, "invalid request body")
+		return
+	}
+	if len(payload.Changes) == 0 {
+		writeError(w, http.StatusUnprocessableEntity, "no changes provided")
+		return
+	}
+
+	notPrintableASCII := func(c rune) bool { return c < 0x20 || c > 0x7E }
+	for _, change := range payload.Changes {
+		// The checks run in a fixed order; problem holds the message prefix
+		// of the first one that fails.
+		problem := ""
+		switch {
+		case !shnRE.MatchString(change.Shn):
+			problem = "invalid shn: "
+		case len(change.Value) == 0 || len(change.Value) > 64:
+			problem = "value length out of range for shn: "
+		case strings.IndexFunc(change.Value, notPrintableASCII) >= 0:
+			problem = "value contains non-printable character for shn: "
+		}
+		if problem != "" {
+			writeError(w, http.StatusUnprocessableEntity, problem+change.Shn)
+			return
+		}
+	}
+
+	const target = "saa-dmi-write"
+	task := &Task{
+		ID:        newJobID(target),
+		Name:      fmt.Sprintf("SAA DMI Write (%d field(s))", len(payload.Changes)),
+		Target:    target,
+		Priority:  defaultTaskPriority(target, taskParams{}),
+		Status:    TaskPending,
+		CreatedAt: time.Now(),
+		params: taskParams{
+			SAADmiChanges: payload.Changes,
+		},
+	}
+	globalQueue.enqueue(task)
+	writeJSON(w, map[string]string{"task_id": task.ID})
+}
+
+// runSAADMIWriteTask applies p.SAADmiChanges to the machine's DMI data with
+// the saa CLI, appending progress messages to j.
+//
+// Steps, in order: dump the current DMI to a temporary DMI.txt (GetDmiInfo),
+// copy that dump to <exportDir>/dmi-backups/dmi-<UTC timestamp>.txt, apply
+// each change to the temporary file (EditDmiInfo), then apply the edited file
+// (ChangeDmiInfo). The first failing step ends the task and its error is
+// returned wrapped with the step name. The temporary directory is removed on
+// return; the backup file is left in place.
+func runSAADMIWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
+	workDir, err := os.MkdirTemp("", "bee-saa-*")
+	if err != nil {
+		return fmt.Errorf("create temp dir: %w", err)
+	}
+	defer os.RemoveAll(workDir)
+	dumpPath := filepath.Join(workDir, "DMI.txt")
+
+	// saa runs one saa invocation through streamCmdJob, bound to ctx.
+	saa := func(args ...string) error {
+		return streamCmdJob(j, exec.CommandContext(ctx, "saa", args...))
+	}
+
+	j.append("Reading current DMI configuration...")
+	if err := saa("-c", "GetDmiInfo", "--file", dumpPath, "--overwrite"); err != nil {
+		return fmt.Errorf("GetDmiInfo: %w", err)
+	}
+
+	// Keep a copy of the unmodified dump before anything is edited.
+	backupDir := filepath.Join(exportDir, "dmi-backups")
+	if err := os.MkdirAll(backupDir, 0o755); err != nil {
+		return fmt.Errorf("create backup dir: %w", err)
+	}
+	backupName := fmt.Sprintf("dmi-%s.txt", time.Now().UTC().Format("20060102-150405"))
+	dump, err := os.ReadFile(dumpPath)
+	if err != nil {
+		return fmt.Errorf("read DMI file: %w", err)
+	}
+	if err := os.WriteFile(filepath.Join(backupDir, backupName), dump, 0o644); err != nil {
+		return fmt.Errorf("write backup: %w", err)
+	}
+	j.append("Backup saved: dmi-backups/" + backupName)
+
+	for _, change := range p.SAADmiChanges {
+		j.append("Setting " + change.Shn + " = " + change.Value)
+		if err := saa("-c", "EditDmiInfo", "--file", dumpPath, "--shn", change.Shn, "--value", change.Value); err != nil {
+			return fmt.Errorf("EditDmiInfo %s: %w", change.Shn, err)
+		}
+	}
+
+	j.append("Applying changes to hardware...")
+	if err := saa("-c", "ChangeDmiInfo", "--file", dumpPath); err != nil {
+		return fmt.Errorf("ChangeDmiInfo: %w", err)
+	}
+
+	j.append("Done. Reboot the server for changes to take effect.")
+	return nil
+}
+
+// renderSAADMICard returns the HTML and inline JavaScript for the
+// "SAA — DMI" card.
+//
+// The card's Read button calls saaDMIRead, which fetches /api/tools/saa-dmi
+// and renders one table row per field with an editable text input; field
+// names, SHNs and values are HTML-escaped with saaDMIEsc before being placed
+// into the markup. Inputs whose value differs from the loaded one are marked
+// "changed" and counted on the Save button, which is hidden while nothing is
+// changed. saaDMISave asks for confirmation, POSTs the changed fields to
+// /api/tools/saa-dmi/write, and saaDMIWaitTask then polls /api/tasks every
+// 1.5 s until the returned task reports done, failed or cancelled.
+func renderSAADMICard() string {
+	return `<div class="card"><div class="card-head">SAA &#8212; DMI <button class="btn btn-sm btn-secondary" onclick="saaDMIRead()" style="margin-left:auto">Read</button></div><div class="card-body">
+<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits DMI fields via SAA (In-Band). Requires <code>saa</code> on PATH.</p>
+<div id="saa-dmi-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
+<div id="saa-dmi-table"></div>
+<div id="saa-dmi-save-row" style="display:none;margin-top:12px">
+  <button class="btn btn-primary" id="saa-dmi-save-btn" onclick="saaDMISave()">Save</button>
+  <span id="saa-dmi-save-msg" style="font-size:13px;color:var(--muted);margin-left:10px"></span>
+</div>
+<script>
+function saaDMIEsc(s) {
+  return String(s==null?'':s).replace(/[&<>"']/g,function(c){return{'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];});
+}
+function saaDMIUpdateSaveBtn() {
+  var inputs = document.querySelectorAll('#saa-dmi-table input[data-original]');
+  var dirty = [];
+  inputs.forEach(function(inp){if(inp.value!==inp.dataset.original)dirty.push(inp);});
+  var row = document.getElementById('saa-dmi-save-row');
+  var btn = document.getElementById('saa-dmi-save-btn');
+  if(dirty.length>0){row.style.display='';btn.textContent='Save ('+dirty.length+' changed)';}
+  else{row.style.display='none';}
+}
+function saaDMIRead() {
+  var status = document.getElementById('saa-dmi-status');
+  var table = document.getElementById('saa-dmi-table');
+  var saveRow = document.getElementById('saa-dmi-save-row');
+  status.textContent = 'Reading...';
+  status.style.color = 'var(--muted)';
+  table.innerHTML = '';
+  saveRow.style.display = 'none';
+  fetch('/api/tools/saa-dmi').then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});}).then(function(fields){
+    status.textContent = fields.length+' field(s) loaded.';
+    var rows = fields.map(function(f){
+      return '<tr>'
+        +'<td style="font-size:13px;white-space:nowrap;padding-right:8px">'+saaDMIEsc(f.name)+'</td>'
+        +'<td style="font-family:monospace;font-size:13px;white-space:nowrap;padding-right:8px">'+saaDMIEsc(f.shn)+'</td>'
+        +'<td><input type="text" value="'+saaDMIEsc(f.value)+'" data-shn="'+saaDMIEsc(f.shn)+'" data-original="'+saaDMIEsc(f.value)+'" oninput="saaDMIMarkDirty(this)" style="width:100%;font-family:monospace;font-size:13px;border:1px solid var(--line);padding:3px 6px;border-radius:3px"></td>'
+        +'<td id="saa-dmi-dirty-'+saaDMIEsc(f.shn)+'" style="font-size:12px;color:var(--warn,#b45309);width:50px;padding-left:6px"></td>'
+        +'</tr>';
+    }).join('');
+    table.innerHTML = '<table style="width:100%;border-collapse:collapse"><tr><th style="text-align:left;font-size:13px;padding-bottom:6px">Field</th><th style="text-align:left;font-size:13px;padding-bottom:6px">Shn</th><th style="text-align:left;font-size:13px;padding-bottom:6px">Value</th><th></th></tr>'+rows+'</table>';
+  }).catch(function(e){
+    status.textContent = 'Error: '+e.message;
+    status.style.color = 'var(--crit-fg,#9f3a38)';
+  });
+}
+function saaDMIMarkDirty(inp) {
+  var shn = inp.dataset.shn;
+  var cell = document.getElementById('saa-dmi-dirty-'+shn);
+  if(cell)cell.textContent = inp.value!==inp.dataset.original?'changed':'';
+  saaDMIUpdateSaveBtn();
+}
+function saaDMIWaitTask(taskID) {
+  var msg = document.getElementById('saa-dmi-save-msg');
+  msg.textContent = 'Task '+taskID+' queued...';
+  msg.style.color = 'var(--muted)';
+  var timer = setInterval(function(){
+    fetch('/api/tasks').then(function(r){return r.json();}).then(function(tasks){
+      var task = (tasks||[]).find(function(t){return t.id===taskID;});
+      if(!task)return;
+      if(task.status==='done'||task.status==='failed'||task.status==='cancelled'){
+        clearInterval(timer);
+        msg.textContent = task.status==='done'?'Saved. Reboot to apply.':'Failed: '+(task.error||task.status);
+        msg.style.color = task.status==='done'?'var(--ok,green)':'var(--crit-fg,#9f3a38)';
+        document.getElementById('saa-dmi-save-btn').disabled = false;
+      }
+    }).catch(function(){});
+  }, 1500);
+}
+function saaDMISave() {
+  var inputs = document.querySelectorAll('#saa-dmi-table input[data-original]');
+  var changes = [];
+  inputs.forEach(function(inp){if(inp.value!==inp.dataset.original)changes.push({shn:inp.dataset.shn,value:inp.value});});
+  if(!changes.length)return;
+  var names = changes.map(function(c){return c.shn;}).join(', ');
+  if(!window.confirm('Apply DMI changes for: '+names+'?\n\nThe server will need to be rebooted for changes to take effect.'))return;
+  var btn = document.getElementById('saa-dmi-save-btn');
+  var msg = document.getElementById('saa-dmi-save-msg');
+  btn.disabled = true;
+  msg.textContent = 'Submitting...';
+  msg.style.color = 'var(--muted)';
+  fetch('/api/tools/saa-dmi/write',{
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body:JSON.stringify({changes:changes})
+  }).then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});}).then(function(d){
+    saaDMIWaitTask(d.task_id);
+  }).catch(function(e){
+    msg.textContent = 'Error: '+e.message;
+    msg.style.color = 'var(--crit-fg,#9f3a38)';
+    btn.disabled = false;
+  });
+}
+</script>
+</div></div>`
+}
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
 // At metricsCollectInterval = 5 s this covers 30 minutes of live history.
 const metricsChartWindow = 360

+// metricsDownsampleAge is the age after which old metrics rows are downsampled
+// to 1 sample per minute. Data fresher than this is kept at full resolution.
+const metricsDownsampleAge = 2 * time.Hour
+
+// metricsRetainWindow is the total retention period for metrics rows.
+// Rows older than this are deleted entirely by the background compactor.
+const metricsRetainWindow = 48 * time.Hour
+
 var metricsCollectInterval = 5 * time.Second

 // pendingNetChange tracks a network state change awaiting confirmation.
@@ -213,6 +221,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
 		h.kmsg = newKmsgWatcher(opts.App.StatusDB)
 		h.kmsg.start()
 		globalQueue.kmsgWatcher = h.kmsg
+
+		// Start periodic health poller for components that don't emit kernel log events (e.g. PSU).
+		if opts.App.StatusDB != nil {
+			newHealthPoller(opts.App.StatusDB).start()
+		}
 	}

 	globalQueue.startWorker(&opts)
@@ -263,6 +276,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
 	mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
 	mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
+	mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
+	mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
 	mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)

 	// Tasks
@@ -291,11 +306,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// Export
 	mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
 	mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
-	mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
-	mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
+	mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
+	mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
+	mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)

 	// Tools
 	mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
+	mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
+	mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
+	mux.HandleFunc("GET /api/tools/saa-dmi", h.handleAPISAADMIRead)
+	mux.HandleFunc("POST /api/tools/saa-dmi/write", h.handleAPISAADMIWrite)

 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
@@ -315,6 +335,10 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
 	mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)

+	// Hardware component detail (fragment for modal in Hardware Summary card)
+	mux.HandleFunc("GET /api/hardware-summary", h.handleAPIHardwareSummary)
+	mux.HandleFunc("GET /api/components/{type}", h.handleAPIComponentDetail)
+
 	// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
 	mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
 	mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
@@ -335,13 +359,24 @@ func (h *handler) startMetricsCollector() {
 	goRecoverLoop("metrics collector", 2*time.Second, func() {
 		ticker := time.NewTicker(metricsCollectInterval)
 		defer ticker.Stop()
-		for range ticker.C {
-			sample := platform.SampleLiveMetrics()
-			if h.metricsDB != nil {
-				_ = h.metricsDB.Write(sample)
+		pruneTicker := time.NewTicker(time.Hour)
+		defer pruneTicker.Stop()
+		for {
+			select {
+			case <-ticker.C:
+				sample := platform.SampleLiveMetrics()
+				if h.metricsDB != nil {
+					_ = h.metricsDB.Write(sample)
+				}
+				h.feedRings(sample)
+				h.setLatestMetric(sample)
+			case <-pruneTicker.C:
+				if h.metricsDB != nil {
+					now := time.Now().UTC()
+					_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
+					_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
+				}
 			}
-			h.feedRings(sample)
-			h.setLatestMetric(sample)
 		}
 	})
 }
@@ -550,6 +585,7 @@ func (h *handler) handleExportIndex(w http.ResponseWriter, r *http.Request) {

 func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
 	snapshot, _ := loadSnapshot(h.opts.AuditPath)
+	snapshot = enrichSnapshotForViewer(snapshot)
 	body, err := viewer.RenderHTML(snapshot, h.opts.Title)
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
@@ -575,12 +611,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	}
 	timeline := metricsTimelineSegments(samples, time.Now())
 	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
-		buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
+		var overviewOk bool
+		var buf []byte
+		buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
 		if err != nil {
 			http.Error(w, err.Error(), http.StatusInternalServerError)
 			return
 		}
-		if !ok {
+		if !overviewOk {
 			http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 			return
 		}
@@ -589,23 +627,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 		_, _ = w.Write(buf)
 		return
 	}
-	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 		return
 	}

-	buf, err := renderMetricChartSVG(
-		title,
-		labels,
-		sampleTimes(samples),
-		datasets,
-		names,
-		yMin,
-		yMax,
-		chartCanvasHeightForPath(path, len(names)),
-		timeline,
-	)
+	var buf []byte
+	if stacked {
+		buf, err = renderStackedMetricChartSVG(
+			title,
+			labels,
+			sampleTimes(samples),
+			datasets,
+			names,
+			yMax,
+			chartCanvasHeightForPath(path, len(names)),
+			timeline,
+		)
+	} else {
+		buf, err = renderMetricChartSVG(
+			title,
+			labels,
+			sampleTimes(samples),
+			datasets,
+			names,
+			yMin,
+			yMax,
+			chartCanvasHeightForPath(path, len(names)),
+			timeline,
+		)
+	}
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -615,12 +667,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	_, _ = w.Write(buf)
 }

-func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
-	var datasets [][]float64
-	var names []string
-	var title string
-	var yMin, yMax *float64
-	labels := sampleTimeLabels(samples)
+func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
+	labels = sampleTimeLabels(samples)

 	switch {
 	case path == "server-load":
@@ -657,12 +705,19 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	case path == "server-power":
 		title = "System Power"
 		power := make([]float64, len(samples))
+		label := "Power W"
 		for i, s := range samples {
 			power[i] = s.PowerW
+			if strings.TrimSpace(s.PowerSource) != "" {
+				label = fmt.Sprintf("Power W · %s", s.PowerSource)
+				if strings.TrimSpace(s.PowerMode) != "" {
+					label += fmt.Sprintf(" (%s)", s.PowerMode)
+				}
+			}
 		}
 		power = normalizePowerSeries(power)
 		datasets = [][]float64{power}
-		names = []string{"Power W"}
+		names = []string{label}
 		yMin = floatPtr(0)
 		yMax = autoMax120(power)

@@ -707,7 +762,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	case strings.HasPrefix(path, "gpu/"):
 		idx, sub, ok := parseGPUChartPath(path)
 		if !ok {
-			return nil, nil, nil, "", nil, nil, false
+			return nil, nil, nil, "", nil, nil, false, false
 		}
 		switch sub {
 		case "load":
@@ -715,7 +770,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
 			mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
 			if util == nil && mem == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
 			names = []string{"Load %", "Mem %"}
@@ -725,7 +780,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Temperature"
 			temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
 			if temp == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{temp}
 			names = []string{"Temp °C"}
@@ -735,7 +790,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Core Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
 			if clock == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Core Clock MHz"}
@@ -744,7 +799,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Memory Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
 			if clock == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Memory Clock MHz"}
@@ -753,7 +808,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Power"
 			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
 			if power == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{power}
 			names = []string{"Power W"}
@@ -761,10 +816,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 		}

 	default:
-		return nil, nil, nil, "", nil, nil, false
+		return nil, nil, nil, "", nil, nil, false, false
 	}

-	return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
+	return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
 }

 func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
@@ -930,6 +985,37 @@ func normalizePowerSeries(ds []float64) []float64 {
 	return out
 }

+// psuSlotsFromSamples collects every distinct PSU slot number found in samples, in ascending order.
+func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
+	slots := make([]int, 0)
+	known := make(map[int]bool)
+	for i := range samples {
+		for _, psu := range samples[i].PSUs {
+			if !known[psu.Slot] {
+				known[psu.Slot] = true
+				slots = append(slots, psu.Slot)
+			}
+		}
+	}
+	sort.Ints(slots)
+	return slots
+}
+
+// psuStackedTotal returns the point-by-point sum of all PSU datasets (for scale calculation).
+// The sum is as long as the first dataset; points beyond that in longer datasets are dropped instead of panicking.
+func psuStackedTotal(datasets [][]float64) []float64 {
+	if len(datasets) == 0 {
+		return nil
+	}
+	total := make([]float64, len(datasets[0]))
+	for _, ds := range datasets {
+		for i := 0; i < len(ds) && i < len(total); i++ {
+			total[i] += ds[i]
+		}
+	}
+	return total
+}
+
 func normalizeFanSeries(ds []float64) []float64 {
 	if len(ds) == 0 {
 		return nil
@@ -1219,8 +1305,8 @@ const loadingPageHTML = `<!DOCTYPE html>
 *{margin:0;padding:0;box-sizing:border-box}
 html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
 .wrap{text-align:center;width:420px}
-.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
-.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
+.brand{font-size:22px;letter-spacing:.18em;color:#f6c90e;margin-bottom:6px;text-align:left}
+.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px}
 .spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
 .spinner.hidden{display:none}
@keyframes spin{to{transform:rotate(360deg)}}
@@ -1238,12 +1324,7 @@ td:first-child{color:#718096;width:55%}
 </head>
 <body>
 <div class="wrap">
-  <div class="logo">  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗
-  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝
-  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗
-  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝
-  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗
-  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝</div>
+  <div class="brand">EASY BEE</div>
  <div class="subtitle">Hardware Audit LiveCD</div>
  <div class="spinner" id="spin"></div>
  <div class="status" id="st">Connecting to bee-web...</div>
@@ -1253,8 +1334,20 @@ td:first-child{color:#718096;width:55%}
 <script>
 (function(){
 var gone = false;
+var pollStarted = false;
+var fallbackOpenTimer = null;
+var AUTO_OPEN_DELAY_MS = 15000;
 function go(){ if(!gone){gone=true;window.location.replace('/');} }

+function scheduleFallbackOpen(){
+  if(fallbackOpenTimer!==null) return;
+  fallbackOpenTimer=setTimeout(function(){
+    document.getElementById('spin').className='spinner hidden';
+    document.getElementById('st').textContent='Startup checks are taking too long — opening app...';
+    go();
+  },AUTO_OPEN_DELAY_MS);
+}
+
 function icon(s){
  if(s==='active')   return '<span class="ok">&#9679; active</span>';
  if(s==='failed')   return '<span class="fail">&#10005; failed</span>';
@@ -1286,6 +1379,7 @@ function pollServices(){
      tbl.innerHTML=html;
      if(allSettled(svcs)){
        clearInterval(pollTimer);
+        if(fallbackOpenTimer!==null) clearTimeout(fallbackOpenTimer);
        document.getElementById('spin').className='spinner hidden';
        document.getElementById('st').textContent='Ready \u2014 opening...';
        setTimeout(go,800);
@@ -1300,8 +1394,12 @@ function probe(){
      if(r.ok){
        document.getElementById('st').textContent='bee-web running \u2014 checking services...';
        document.getElementById('btn').style.display='';
-        pollServices();
-        pollTimer=setInterval(pollServices,1500);
+        scheduleFallbackOpen();
+        if(!pollStarted){
+          pollStarted=true;
+          pollServices();
+          pollTimer=setInterval(pollServices,1500);
+        }
      } else {
        document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
        setTimeout(probe,500);
@@ -1323,13 +1421,16 @@ func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
 	if page == "" {
 		page = "dashboard"
 	}
-	// Redirect old routes to new names
+	// Redirect legacy routes to new named pages
 	switch page {
-	case "tests":
-		http.Redirect(w, r, "/validate", http.StatusMovedPermanently)
+	case "validate", "tests":
+		http.Redirect(w, r, "/check", http.StatusMovedPermanently)
 		return
-	case "burn-in":
-		http.Redirect(w, r, "/burn", http.StatusMovedPermanently)
+	case "burn", "burn-in":
+		http.Redirect(w, r, "/load", http.StatusMovedPermanently)
+		return
+	case "benchmark":
+		http.Redirect(w, r, "/speed", http.StatusMovedPermanently)
 		return
 	}
 	body := renderPage(page, h.opts)
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 		},
 	}

-	datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
 	if !ok {
 		t.Fatal("chartDataFromSamples returned ok=false")
 	}
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
 		},
 	}

-	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
 	if !ok {
 		t.Fatal("chartDataFromSamples returned ok=false")
 	}
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
 		},
 	}

-	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
+	datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
 	if !ok {
 		t.Fatal("gpu-all-clock returned ok=false")
 	}
@@ -420,6 +420,49 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
 	}
 }

+// TestChartDataFromSamplesServerPowerUsesResolvedSystemPower checks that "server-power" yields one
+// non-stacked "System Power" series whose name carries the samples' power source and mode.
+func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
+	t0 := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
+	const source, mode = "sdr_psu_input", "autotuned"
+	history := []platform.LiveMetricSample{
+		{
+			Timestamp: t0,
+			PSUs: []platform.PSUReading{
+				{Slot: 1, PowerW: 120},
+				{Slot: 2, PowerW: 130},
+			},
+			PowerW:      250,
+			PowerSource: source,
+			PowerMode:   mode,
+		},
+		{
+			Timestamp: t0.Add(time.Minute),
+			PSUs: []platform.PSUReading{
+				{Slot: 1, PowerW: 140},
+				{Slot: 2, PowerW: 135},
+			},
+			PowerW:      275,
+			PowerSource: source,
+			PowerMode:   mode,
+		},
+	}
+
+	datasets, names, _, title, _, _, stacked, ok := chartDataFromSamples("server-power", history)
+	switch {
+	case !ok:
+		t.Fatal("expected server-power chart data")
+	case title != "System Power":
+		t.Fatalf("title=%q", title)
+	case stacked:
+		t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
+	case len(datasets) != 1 || len(names) != 1:
+		t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
+	case names[0] != "Power W · sdr_psu_input (autotuned)":
+		t.Fatalf("names=%v", names)
+	}
+}
+
 func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
 	want := []float64{4200, 4200, 4200, 4300, 4300}
@@ -561,6 +604,25 @@ func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
 	}
 }

+// TestLoadingPageHasFallbackAutoOpen verifies that /loading ships the timed fallback-open script.
+func TestLoadingPageHasFallbackAutoOpen(t *testing.T) {
+	srv, resp := NewHandler(HandlerOptions{}), httptest.NewRecorder()
+	srv.ServeHTTP(resp, httptest.NewRequest(http.MethodGet, "/loading", nil))
+	page := resp.Body.String()
+	if resp.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", resp.Code, page)
+	}
+	for _, snippet := range []string{
+		`var AUTO_OPEN_DELAY_MS = 15000;`,
+		`function scheduleFallbackOpen(){`,
+		`Startup checks are taking too long — opening app...`,
+	} {
+		if !strings.Contains(page, snippet) {
+			t.Fatalf("loading page missing %q: %s", snippet, page)
+		}
+	}
+}
+
 func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
@@ -628,31 +690,40 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
 	if !strings.Contains(body, `id="boot-source-text"`) {
 		t.Fatalf("tools page missing boot source field: %s", body)
 	}
-	if !strings.Contains(body, `Export to USB`) {
-		t.Fatalf("tools page missing export to usb section: %s", body)
+	if !strings.Contains(body, `USB Black-Box`) {
+		t.Fatalf("tools page missing usb black-box section: %s", body)
 	}
-	if !strings.Contains(body, `Support Bundle</button>`) {
-		t.Fatalf("tools page missing support bundle usb button: %s", body)
+	if !strings.Contains(body, `/api/blackbox/status`) {
+		t.Fatalf("tools page missing black-box status api usage: %s", body)
+	}
+	if !strings.Contains(body, `NVMe Block Format`) {
+		t.Fatalf("tools page missing nvme block format section: %s", body)
+	}
+	if !strings.Contains(body, `/api/tools/nvme-formats`) || !strings.Contains(body, `/api/tools/nvme-format/run`) {
+		t.Fatalf("tools page missing nvme format api usage: %s", body)
 	}
 }

 func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
-	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/speed", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	for _, needle := range []string{
-		`href="/benchmark"`,
+		`href="/speed"`,
 		`id="benchmark-gpu-list"`,
 		`/api/gpu/nvidia`,
 		`/api/bee-bench/nvidia/perf/run`,
 		`/api/bee-bench/nvidia/power/run`,
+		`/api/bee-bench/nvidia/autotune/run`,
+		`/api/bee-bench/nvidia/autotune/status`,
 		`benchmark-run-nccl`,
 		`Run Performance Benchmark`,
 		`Run Power / Thermal Fit`,
+		`Autotune`,
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("benchmark page missing %q: %s", needle, body)
@@ -698,7 +769,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {

 	handler := NewHandler(HandlerOptions{ExportDir: exportDir})
 	rec := httptest.NewRecorder()
-	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/speed", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
@@ -720,34 +791,53 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
 	}
 }

-func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
+func TestCheckPageRendersGPUSelectionAndNvidiaCards(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
-	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	for _, needle := range []string{
-		`NVIDIA GPU Targeted Stress`,
-		`nvidia-targeted-stress`,
-		`controlled NVIDIA DCGM load`,
-		`<code>dcgmi diag targeted_stress</code>`,
 		`NVIDIA GPU Selection`,
-		`All NVIDIA validate tasks use only the GPUs selected here.`,
-		`Select All`,
 		`id="sat-gpu-list"`,
+		`Select All`,
+		`id="sat-btn-nvidia"`,
+		`NVIDIA Interconnect (NCCL)`,
+		`NVIDIA Bandwidth (NVBandwidth)`,
+		`Non-destructive`,
 	} {
 		if !strings.Contains(body, needle) {
-			t.Fatalf("validate page missing %q: %s", needle, body)
+			t.Fatalf("check page missing %q: %s", needle, body)
 		}
 	}
 }

-func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
+func TestCheckPageRendersNvidiaFabricCards(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
-	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	for _, needle := range []string{
+		`NVIDIA Interconnect (NCCL)`,
+		`NVIDIA Bandwidth (NVBandwidth)`,
+		`nvbandwidth`,
+		`all_reduce_perf`,
+	} {
+		if !strings.Contains(body, needle) {
+			t.Fatalf("check page missing %q: %s", needle, body)
+		}
+	}
+}
+
+func TestLoadPageRendersGoalBasedNVIDIACards(t *testing.T) {
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/load", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
@@ -756,7 +846,6 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 		`NVIDIA Max Compute Load`,
 		`dcgmproftester`,
 		`NCCL`,
-		`Validate → Stress mode`,
 		`id="burn-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
@@ -950,6 +1039,39 @@ func TestViewerRendersLatestSnapshot(t *testing.T) {
 	}
 }

+// TestViewerRendersDerivedStorageBlockFormat expects /viewer to contain "512&#43;8" for a disk with 512-byte logical blocks and 8 metadata bytes per block.
+func TestViewerRendersDerivedStorageBlockFormat(t *testing.T) {
+	auditPath := filepath.Join(t.TempDir(), "audit.json")
+	snapshot := `{
+	  "collected_at":"2026-04-29T00:05:00Z",
+	  "hardware":{
+	    "board":{"serial_number":"SERIAL-NEW"},
+	    "storage":[
+	      {
+	        "serial_number":"DISK-1",
+	        "model":"Test NVMe",
+	        "logical_block_size_bytes":512,
+	        "physical_block_size_bytes":4096,
+	        "metadata_bytes_per_block":8
+	      }
+	    ]
+	  }
+	}`
+	if err := os.WriteFile(auditPath, []byte(snapshot), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	srv := NewHandler(HandlerOptions{AuditPath: auditPath})
+	resp := httptest.NewRecorder()
+	srv.ServeHTTP(resp, httptest.NewRequest(http.MethodGet, "/viewer", nil))
+	if resp.Code != http.StatusOK {
+		t.Fatalf("status=%d", resp.Code)
+	}
+	if html := resp.Body.String(); !strings.Contains(html, "512&#43;8") {
+		t.Fatalf("viewer body missing derived block format: %s", html)
+	}
+}
+
 func TestAuditJSONServesLatestSnapshot(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
@@ -972,6 +1094,36 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
 	}
 }

+// TestAuditJSONDoesNotInjectDerivedStorageBlockFormat asserts that the /audit.json response never contains "block_format".
+func TestAuditJSONDoesNotInjectDerivedStorageBlockFormat(t *testing.T) {
+	auditPath := filepath.Join(t.TempDir(), "audit.json")
+	snapshot := `{
+	  "hardware":{
+	    "board":{"serial_number":"SERIAL-API"},
+	    "storage":[
+	      {
+	        "serial_number":"DISK-1",
+	        "logical_block_size_bytes":512,
+	        "metadata_bytes_per_block":8
+	      }
+	    ]
+	  }
+	}`
+	if err := os.WriteFile(auditPath, []byte(snapshot), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	srv := NewHandler(HandlerOptions{AuditPath: auditPath})
+	resp := httptest.NewRecorder()
+	srv.ServeHTTP(resp, httptest.NewRequest(http.MethodGet, "/audit.json", nil))
+	if resp.Code != http.StatusOK {
+		t.Fatalf("status=%d", resp.Code)
+	}
+	if served := resp.Body.String(); strings.Contains(served, "block_format") {
+		t.Fatalf("audit.json should remain contract-only: %s", served)
+	}
+}
+
 func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
 	handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"})
 	rec := httptest.NewRecorder()
--- a/audit/internal/webui/stability.go
+++ b/audit/internal/webui/stability.go
@@ -7,14 +7,43 @@ import (
 	"time"
 )

+const (
+	recoverLoopMaxDelay   = 60 * time.Second
+	recoverLoopResetAfter = 30 * time.Second
+)
+
+// goRecoverLoop starts fn in a goroutine, restarting after panics.
+// restartDelay is the initial delay; successive panics double it up to
+// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
+// successfully for recoverLoopResetAfter without panicking.
 func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
 	go func() {
+		delay := restartDelay
+		consecutive := 0
 		for {
-			if !runRecoverable(name, fn) {
+			start := time.Now()
+			panicked := runRecoverable(name, fn)
+			if !panicked {
 				return
 			}
-			if restartDelay > 0 {
-				time.Sleep(restartDelay)
+			consecutive++
+			if time.Since(start) >= recoverLoopResetAfter {
+				delay = restartDelay
+				consecutive = 1
+			}
+			slog.Warn("goroutine restarting after panic",
+				"component", name,
+				"consecutive_panics", consecutive,
+				"next_delay", delay,
+			)
+			if delay > 0 {
+				time.Sleep(delay)
+			}
+			if delay < recoverLoopMaxDelay {
+				delay *= 2
+				if delay > recoverLoopMaxDelay {
+					delay = recoverLoopMaxDelay
+				}
 			}
 		}
 	}()
--- a/audit/internal/webui/task_report.go
+++ b/audit/internal/webui/task_report.go
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
 		}
 		return gpuDisplayLabel(idx) + " Overview", buf, true
 	}
-	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		return "", nil, false
 	}
-	buf, err := renderMetricChartSVG(
-		title,
-		labels,
-		sampleTimes(samples),
-		datasets,
-		names,
-		yMin,
-		yMax,
-		chartCanvasHeightForPath(path, len(names)),
-		timeline,
-	)
+	var buf []byte
+	var err error
+	if stacked {
+		buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
+	} else {
+		buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
+	}
 	if err != nil {
 		return "", nil, false
 	}
--- a/audit/internal/webui/task_runner.go
+++ b/audit/internal/webui/task_runner.go
@@ -0,0 +1,517 @@
+package webui
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+	"bee/audit/internal/runtimeenv"
+)
+
+type taskRunnerState struct { // JSON record stored as runner-state.json in a task's artifacts directory
+	PID       int       `json:"pid"`             // presumably the runner's process ID; confirm against the code that fills it
+	Status    string    `json:"status"`          // status text; the set of values is not visible in this file section
+	Error     string    `json:"error,omitempty"` // left out of the JSON when empty
+	UpdatedAt time.Time `json:"updated_at"`      // presumably the time of the latest write; confirm with the writer
+}
+
+func taskRunnerStatePath(t *Task) string {
+	if t != nil && strings.TrimSpace(t.ArtifactsDir) != "" {
+		return filepath.Join(t.ArtifactsDir, "runner-state.json")
+	}
+	return "" // nil task or blank ArtifactsDir: there is no state file location
+}
+
+// writeTaskRunnerState saves state as indented JSON via a ".tmp" sibling renamed into place; it is a no-op when t has no state path.
+func writeTaskRunnerState(t *Task, state taskRunnerState) error {
+	dst := taskRunnerStatePath(t)
+	if dst == "" {
+		return nil
+	}
+	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
+		return err
+	}
+	payload, err := json.MarshalIndent(state, "", "  ")
+	if err != nil {
+		return err
+	}
+	if err = os.WriteFile(dst+".tmp", payload, 0644); err != nil {
+		return err
+	}
+	return os.Rename(dst+".tmp", dst)
+}
+
+// readTaskRunnerState loads the runner state saved for t. The bool result is false when t has
+// no state path or when the file cannot be read, is empty, or does not decode as JSON.
+func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
+	var none, loaded taskRunnerState
+	src := taskRunnerStatePath(t)
+	if src == "" {
+		return none, false
+	}
+	raw, err := os.ReadFile(src)
+	// A failed decode may leave loaded partly filled, so every failure returns the zero value.
+	if err != nil || len(raw) == 0 || json.Unmarshal(raw, &loaded) != nil {
+		return none, false
+	}
+	return loaded, true
+}
+
+func processAlive(pid int) bool {
+	if pid > 0 {
+		probeErr := syscall.Kill(pid, 0) // signal 0 delivers nothing; it only reports whether pid can be signalled
+		return probeErr == nil || probeErr == syscall.EPERM
+	}
+	return false
+}
+
+// finalizeTaskForResult stamps t.DoneAt with the current time and sets the final Status/ErrMsg:
+// cancelled takes priority over a non-blank errMsg, which takes priority over success.
+func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
+	finished := time.Now()
+	t.DoneAt = &finished
+	if cancelled {
+		t.Status, t.ErrMsg = TaskCancelled, "aborted"
+		return
+	}
+	if strings.TrimSpace(errMsg) != "" {
+		t.Status, t.ErrMsg = TaskFailed, errMsg
+		return
+	}
+	t.Status, t.ErrMsg = TaskDone, ""
+}
+
+func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
+	if opts == nil {
+		j.append("ERROR: handler options not configured")
+		j.finish("handler options not configured")
+		return
+	}
+	a := opts.App
+
+	recovered := len(j.lines) > 0
+	j.append(fmt.Sprintf("Starting %s...", t.Name))
+	if recovered {
+		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
+	}
+
+	var (
+		archive string
+		err     error
+	)
+
+	switch t.Target {
+	case "nvidia":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		diagLevel := 2
+		if t.params.StressMode {
+			diagLevel = 3
+		}
+		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
+			result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
+			if e != nil {
+				err = e
+			} else {
+				archive = result.Body
+			}
+		} else {
+			archive, err = a.RunNvidiaAcceptancePack("", j.append)
+		}
+	case "nvidia-targeted-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if dur <= 0 {
+			dur = 300
+		}
+		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-bench-perf":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
+			Profile:           t.params.BenchmarkProfile,
+			SizeMB:            t.params.SizeMB,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			RunNCCL:           t.params.RunNCCL,
+			ParallelGPUs:      t.params.ParallelGPUs,
+			RampStep:          t.params.RampStep,
+			RampTotal:         t.params.RampTotal,
+			RampRunID:         t.params.RampRunID,
+		}, j.append)
+	case "nvidia-bench-power":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
+			Profile:           t.params.BenchmarkProfile,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			RampStep:          t.params.RampStep,
+			RampTotal:         t.params.RampTotal,
+			RampRunID:         t.params.RampRunID,
+		}, j.append)
+	case "nvidia-bench-autotune":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
+			Profile: t.params.BenchmarkProfile,
+			SizeMB:  t.params.SizeMB,
+		}, t.params.BenchmarkKind, j.append)
+	case "nvidia-compute":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
+		if planErr != nil {
+			err = planErr
+			break
+		}
+		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
+			dur = rampPlan.DurationSec
+		}
+		if rampPlan.StaggerSeconds > 0 {
+			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
+		}
+		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
+	case "nvidia-targeted-power":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-pulse":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-bandwidth":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
+	case "nvidia-interconnect":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
+	case "nvidia-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
+		if planErr != nil {
+			err = planErr
+			break
+		}
+		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
+			dur = rampPlan.DurationSec
+		}
+		if rampPlan.StaggerSeconds > 0 {
+			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
+		}
+		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
+			DurationSec:       dur,
+			Loader:            t.params.Loader,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			StaggerSeconds:    rampPlan.StaggerSeconds,
+		}, j.append)
+	case "memory":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
+		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
+		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
+	case "storage":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
+	case "cpu":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		if dur <= 0 {
+			if t.params.StressMode {
+				dur = 1800
+			} else {
+				dur = 60
+			}
+		}
+		j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
+		archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
+	case "amd":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
+	case "amd-mem":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
+	case "amd-bandwidth":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
+	case "amd-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
+	case "memory-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
+	case "sat-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
+	case "platform-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
+		runOpts.Components = t.params.PlatformComponents
+		archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
+	case "audit":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		result, e := a.RunAuditNow(opts.RuntimeMode)
+		if e != nil {
+			err = e
+		} else {
+			for _, line := range splitLines(result.Body) {
+				j.append(line)
+			}
+		}
+	case "support-bundle":
+		j.append("Building support bundle...")
+		archive, err = buildSupportBundle(opts.ExportDir)
+	case "install":
+		if strings.TrimSpace(t.params.Device) == "" {
+			err = fmt.Errorf("device is required")
+			break
+		}
+		installLogPath := platform.InstallLogPath(t.params.Device)
+		j.append("Install log: " + installLogPath)
+		err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
+	case "install-to-ram":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		err = a.RunInstallToRAM(ctx, j.append)
+	case "nvme-format":
+		if strings.TrimSpace(t.params.Device) == "" {
+			err = fmt.Errorf("device is required")
+			break
+		}
+		err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
+	case "saa-dmi-write":
+		if len(t.params.SAADmiChanges) == 0 {
+			err = fmt.Errorf("no changes provided")
+			break
+		}
+		err = runSAADMIWriteTask(ctx, j, opts.ExportDir, t.params)
+	default:
+		j.append("ERROR: unknown target: " + t.Target)
+		j.finish("unknown target")
+		return
+	}
+
+	if archive != "" {
+		archivePath := app.ExtractArchivePath(archive)
+		if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
+			err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
+		}
+		if opts.App != nil && opts.App.StatusDB != nil {
+			app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
+		}
+	}
+
+	if err != nil {
+		if ctx.Err() != nil {
+			j.append("Aborted.")
+			j.finish("aborted")
+		} else {
+			j.append("ERROR: " + err.Error())
+			j.finish(err.Error())
+		}
+		return
+	}
+	if archive != "" {
+		j.append("Archive: " + archive)
+	}
+	j.finish("")
+}
+
+// loadPersistedTask reads the JSON task list stored at statePath and rebuilds
+// the Task whose ID equals taskID, including its saved run parameters.
+// It returns an error when the file cannot be read or decoded, or when no
+// entry with that ID is present.
+func loadPersistedTask(statePath, taskID string) (*Task, error) {
+	data, err := os.ReadFile(statePath)
+	if err != nil {
+		return nil, err
+	}
+	var persisted []persistedTask
+	if err := json.Unmarshal(data, &persisted); err != nil {
+		return nil, err
+	}
+	for _, pt := range persisted {
+		if pt.ID != taskID {
+			continue
+		}
+		// Copy the persisted fields into a fresh runtime Task; the job
+		// state is not part of the persisted form and is left unset here.
+		t := &Task{
+			ID:             pt.ID,
+			Name:           pt.Name,
+			Target:         pt.Target,
+			Priority:       pt.Priority,
+			Status:         pt.Status,
+			CreatedAt:      pt.CreatedAt,
+			StartedAt:      pt.StartedAt,
+			DoneAt:         pt.DoneAt,
+			ErrMsg:         pt.ErrMsg,
+			LogPath:        pt.LogPath,
+			ArtifactsDir:   pt.ArtifactsDir,
+			ReportJSONPath: pt.ReportJSONPath,
+			ReportHTMLPath: pt.ReportHTMLPath,
+			params:         pt.Params,
+		}
+		ensureTaskReportPaths(t)
+		return t, nil
+	}
+	return nil, fmt.Errorf("task %s not found", taskID)
+}
+
+// RunPersistedTask executes one persisted task inside the current process.
+//
+// The task is looked up by taskID in <exportDir>/tasks-state.json. A runner
+// state record carrying this process's PID is written before execution and
+// again, with the final status and error, afterwards. Execution uses a context
+// that is cancelled on SIGINT or SIGTERM. Progress messages for usage and I/O
+// problems go to stderr.
+//
+// Return value: 2 when exportDir or taskID is blank, 1 when the task cannot be
+// loaded, the initial runner state cannot be written, or the task ended with a
+// non-empty error message, and 0 otherwise.
+func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
+	if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
+		fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
+		return 2
+	}
+
+	// NOTE(review): a Detect failure is only logged and runtimeInfo.Mode is
+	// still used below — confirm Detect returns a usable value on error.
+	runtimeInfo, err := runtimeenv.Detect("auto")
+	if err != nil {
+		slog.Warn("resolve runtime for task-run", "err", err)
+	}
+	opts := &HandlerOptions{
+		ExportDir:   exportDir,
+		App:         app.New(platform.New()),
+		RuntimeMode: runtimeInfo.Mode,
+	}
+	statePath := filepath.Join(exportDir, "tasks-state.json")
+	task, err := loadPersistedTask(statePath, taskID)
+	if err != nil {
+		fmt.Fprintln(stderr, err.Error())
+		return 1
+	}
+	// Fill in start time and status when the persisted record lacks them.
+	if task.StartedAt == nil || task.StartedAt.IsZero() {
+		now := time.Now()
+		task.StartedAt = &now
+	}
+	if task.Status == "" {
+		task.Status = TaskRunning
+	}
+	// Publish PID and "running" status before doing any work.
+	if err := writeTaskRunnerState(task, taskRunnerState{
+		PID:       os.Getpid(),
+		Status:    TaskRunning,
+		UpdatedAt: time.Now().UTC(),
+	}); err != nil {
+		fmt.Fprintln(stderr, err.Error())
+		return 1
+	}
+
+	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
+	defer cancel()
+
+	j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
+	executeTaskWithOptions(opts, task, j, ctx)
+	// A cancelled context means the run was interrupted by a signal.
+	finalizeTaskForResult(task, j.err, ctx.Err() != nil)
+	// Report generation failures are logged as a warning, not treated as fatal.
+	if err := writeTaskReportArtifacts(task); err != nil {
+		appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
+	}
+	j.closeLog()
+	// Record the final outcome; a failure here is reported but does not
+	// change the exit code.
+	if err := writeTaskRunnerState(task, taskRunnerState{
+		PID:       os.Getpid(),
+		Status:    task.Status,
+		Error:     task.ErrMsg,
+		UpdatedAt: time.Now().UTC(),
+	}); err != nil {
+		fmt.Fprintln(stderr, err.Error())
+	}
+	if task.ErrMsg != "" {
+		return 1
+	}
+	return 0
+}
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"io"
 	"log/slog"
 	"net/http"
 	"os"
@@ -13,6 +14,7 @@ import (
 	"sort"
 	"strings"
 	"sync"
+	"syscall"
 	"time"

 	"bee/audit/internal/app"
@@ -34,6 +36,7 @@ var taskNames = map[string]string{
 	"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
 	"nvidia-bench-perf":      "NVIDIA Bee Bench Perf",
 	"nvidia-bench-power":     "NVIDIA Bee Bench Power",
+	"nvidia-bench-autotune":  "NVIDIA Bee Bench Power Source Autotune",
 	"nvidia-compute":         "NVIDIA Max Compute Load (dcgmproftester)",
 	"nvidia-targeted-power":  "NVIDIA Targeted Power (dcgmi diag targeted_power)",
 	"nvidia-pulse":           "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -54,6 +57,7 @@ var taskNames = map[string]string{
 	"support-bundle":         "Support Bundle",
 	"install":                "Install to Disk",
 	"install-to-ram":         "Install to RAM",
+	"nvme-format":            "NVMe Block Format Change",
 }

 // burnNames maps target → human-readable name when a burn profile is set.
@@ -109,8 +113,9 @@ type Task struct {
 	ReportHTMLPath string     `json:"report_html_path,omitempty"`

 	// runtime fields (not serialised)
-	job    *jobState
-	params taskParams
+	job       *jobState
+	runnerPID int
+	params    taskParams
 }

 // taskParams holds optional parameters parsed from the run request.
@@ -125,14 +130,17 @@ type taskParams struct {
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
+	BenchmarkKind      string   `json:"benchmark_kind,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	RampStep           int      `json:"ramp_step,omitempty"`
 	RampTotal          int      `json:"ramp_total,omitempty"`
 	RampRunID          string   `json:"ramp_run_id,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
-	Device             string   `json:"device,omitempty"` // for install
-	PlatformComponents []string `json:"platform_components,omitempty"`
+	Device             string      `json:"device,omitempty"` // for install
+	LBAF               int         `json:"lbaf,omitempty"`
+	PlatformComponents []string    `json:"platform_components,omitempty"`
+	SAADmiChanges      []saaChange `json:"saa_dmi_changes,omitempty"`
 }

 type persistedTask struct {
@@ -162,6 +170,32 @@ type nvidiaRampSpec struct {
 	TotalDurationSec int
 }

+// resolveMemoryValidatePreset picks the memory test size (in MB) and pass count
+// for a memory validate run. The profile name is matched case-insensitively
+// after trimming whitespace: "overnight" gives 1024 MB x 2, "acceptance" gives
+// 1024 MB x 1 and "smoke" gives 256 MB x 1. For any other profile the stress
+// flag decides: 512 MB x 1 when set, 256 MB x 1 otherwise.
+func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
+	switch strings.TrimSpace(strings.ToLower(profile)) {
+	case "overnight":
+		return 1024, 2
+	case "acceptance":
+		return 1024, 1
+	case "smoke":
+		return 256, 1
+	}
+	// No recognised profile: fall back to the stress flag.
+	if stress {
+		return 512, 1
+	}
+	return 256, 1
+}
+
+// taskMayLeaveOrphanWorkers reports whether target is one of the listed test
+// targets. Callers use the result to decide whether to invoke
+// platform.KillTestWorkers when such a task is killed or found interrupted.
+// The target is compared case-insensitively after trimming whitespace.
+func taskMayLeaveOrphanWorkers(target string) bool {
+	switch strings.TrimSpace(strings.ToLower(target)) {
+	case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
+		"nvidia-bandwidth", "nvidia-stress", "nvidia-compute", "nvidia-bench-perf",
+		"memory", "memory-stress", "cpu", "sat-stress", "platform-stress":
+		return true
+	default:
+		return false
+	}
+}
+
 func resolveBurnPreset(profile string) burnPreset {
 	switch profile {
 	case "overnight":
@@ -300,6 +334,13 @@ var (
 	installCommand     = func(ctx context.Context, device string, logPath string) *exec.Cmd {
 		return exec.CommandContext(ctx, "bee-install", device, logPath)
 	}
+	externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
+		exe, err := os.Executable()
+		if err != nil {
+			return nil, err
+		}
+		return exec.Command(exe, "bee-worker", "--export-dir", exportDir, "--task-id", taskID), nil
+	}
 )

 // enqueue adds a task to the queue and notifies the worker.
@@ -337,6 +378,11 @@ func (q *taskQueue) prune() {

 // nextPending returns the highest-priority pending task (nil if none).
 func (q *taskQueue) nextPending() *Task {
+	for _, t := range q.tasks {
+		if t.Status == TaskRunning {
+			return nil
+		}
+	}
 	var best *Task
 	for _, t := range q.tasks {
 		if t.Status != TaskPending {
@@ -456,6 +502,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 	if !q.started {
 		q.loadLocked()
 		q.started = true
+		q.resumeRunningTasksLocked()
 		goRecoverLoop("task worker", 2*time.Second, q.worker)
 	}
 	hasPending := q.nextPending() != nil
@@ -489,15 +536,12 @@ func (q *taskQueue) worker() {
 				t.StartedAt = &now
 				t.DoneAt = nil
 				t.ErrMsg = ""
-				j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
+				j := newTaskJobState(t.LogPath)
 				t.job = j
 				q.persistLocked()
 				q.mu.Unlock()

-				taskCtx, taskCancel := context.WithCancel(context.Background())
-				j.cancel = taskCancel
-				q.executeTask(t, j, taskCtx)
-				taskCancel()
+				q.runTaskExternal(t, j)

 				q.mu.Lock()
 				q.prune()
@@ -509,6 +553,218 @@ func (q *taskQueue) worker() {
 	}
 }

+// resumeRunningTasksLocked re-attaches to every task that is still marked as
+// running: it makes sure the task has a job state, installs the cancel hook
+// that signals the external runner, and starts a monitor for the runner.
+// NOTE(review): the "Locked" suffix suggests q.mu must be held — confirm.
+func (q *taskQueue) resumeRunningTasksLocked() {
+	for _, t := range q.tasks {
+		if t.Status != TaskRunning {
+			continue
+		}
+		if t.job == nil {
+			t.job = newTaskJobState(t.LogPath)
+		}
+		q.attachExternalTaskControlsLocked(t, t.job)
+		q.startRecoveredTaskMonitorLocked(t, t.job)
+	}
+}
+
+// attachExternalTaskControlsLocked installs j.cancel so that cancelling the job
+// sends SIGTERM to the task's external runner process. It does nothing when t
+// or j is nil.
+func (q *taskQueue) attachExternalTaskControlsLocked(t *Task, j *jobState) {
+	if t == nil || j == nil {
+		return
+	}
+	j.cancel = func() {
+		// Prefer the PID held in memory; otherwise fall back to the PID
+		// recorded in the runner state.
+		pid := t.runnerPID
+		if pid <= 0 {
+			if state, ok := readTaskRunnerState(t); ok {
+				pid = state.PID
+			}
+		}
+		if pid > 0 {
+			// Best effort: the error from Kill is deliberately ignored.
+			_ = syscall.Kill(pid, syscall.SIGTERM)
+		}
+	}
+}
+
+// startRecoveredTaskMonitorLocked starts a background monitor for a task whose
+// external runner is still known by PID. The monitor tails the task log into
+// j, polls every 500 ms until the runner process is gone, drains the tailer
+// and then finalises the task through finishExternalTask. It is a no-op when
+// t or j is nil or when no runner PID is recorded.
+// NOTE(review): the "Locked" suffix suggests q.mu must be held — confirm.
+func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
+	if t == nil || j == nil || t.runnerPID <= 0 {
+		return
+	}
+	// Snapshot the PID before starting the goroutine: finishExternalTask
+	// resets t.runnerPID to 0 under q.mu, and the goroutine below runs without
+	// that lock. Polling the field directly would be an unsynchronised read
+	// and could end up probing PID 0 instead of the runner.
+	pid := t.runnerPID
+	goRecoverOnce("task runner monitor", func() {
+		stopTail := make(chan struct{})
+		doneTail := make(chan struct{})
+		go q.followTaskLog(t, j, stopTail, doneTail)
+		for processAlive(pid) {
+			time.Sleep(500 * time.Millisecond)
+		}
+		// Stop the tailer and wait for its final flush before the job is
+		// marked finished, so the last log lines are not lost.
+		close(stopTail)
+		<-doneTail
+		q.finishExternalTask(t, j, nil)
+	})
+}
+
+// runTaskExternal runs t in a separate runner process created by
+// externalTaskRunnerCommand and blocks until that process exits.
+//
+// While the runner is alive the task log is tailed into j. For SAT targets the
+// kmsg watcher is notified when the task starts and again when this function
+// returns. The final task state is resolved by finishExternalTask, which is
+// always called after the log tailer has been drained so that the last log
+// lines reach the job before it is marked finished.
+func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
+	startedKmsgWatch := false
+	if q.kmsgWatcher != nil && isSATTarget(t.Target) {
+		q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
+		startedKmsgWatch = true
+	}
+	defer func() {
+		if startedKmsgWatch && q.kmsgWatcher != nil {
+			q.kmsgWatcher.NotifyTaskFinished(t.ID)
+		}
+	}()
+
+	stopTail := make(chan struct{})
+	doneTail := make(chan struct{})
+	go q.followTaskLog(t, j, stopTail, doneTail)
+	// drainTail stops the tailer and waits for its final flush. It is called
+	// explicitly before every finishExternalTask, matching the order used by
+	// the recovered-task monitor; the deferred call only covers early exits
+	// such as a panic. sync.Once makes the second call harmless.
+	var drainOnce sync.Once
+	drainTail := func() {
+		drainOnce.Do(func() {
+			close(stopTail)
+			<-doneTail
+		})
+	}
+	defer drainTail()
+
+	cmd, err := externalTaskRunnerCommand(q.opts.ExportDir, t.ID)
+	if err != nil {
+		j.appendFromLog("ERROR: " + err.Error())
+		drainTail()
+		q.finishExternalTask(t, j, err)
+		return
+	}
+	if err := cmd.Start(); err != nil {
+		j.appendFromLog("ERROR: " + err.Error())
+		drainTail()
+		q.finishExternalTask(t, j, err)
+		return
+	}
+
+	// Record the runner PID and install the cancel hook under the queue lock.
+	q.mu.Lock()
+	t.runnerPID = cmd.Process.Pid
+	q.attachExternalTaskControlsLocked(t, j)
+	q.persistLocked()
+	q.mu.Unlock()
+
+	waitErr := cmd.Wait()
+	// Brief grace period after the runner exits, kept from the original flow.
+	time.Sleep(200 * time.Millisecond)
+	drainTail()
+	q.finishExternalTask(t, j, waitErr)
+}
+
+// followTaskLog tails the task's log file and forwards each new non-empty line
+// to j via appendFromLog. It polls every 250 ms until stop is closed, then
+// performs one last read, emits any remaining unterminated text, and closes
+// done. If t is nil or has a blank LogPath it closes done and returns at once.
+func (q *taskQueue) followTaskLog(t *Task, j *jobState, stop <-chan struct{}, done chan<- struct{}) {
+	defer close(done)
+	path := ""
+	if t != nil {
+		path = t.LogPath
+	}
+	if strings.TrimSpace(path) == "" {
+		return
+	}
+	// Start at the current end of the file so only content written after
+	// tailing begins is forwarded.
+	offset := int64(0)
+	if info, err := os.Stat(path); err == nil {
+		offset = info.Size()
+	}
+	// partial holds text after the last newline seen so far; it is prepended
+	// to the next chunk so lines split across reads are reassembled.
+	var partial string
+	ticker := time.NewTicker(250 * time.Millisecond)
+	defer ticker.Stop()
+	flush := func() {
+		data, newOffset, err := readTaskLogDelta(path, offset)
+		if err != nil || len(data) == 0 {
+			offset = newOffset
+			return
+		}
+		offset = newOffset
+		// Normalise CRLF to LF before splitting into lines.
+		text := partial + strings.ReplaceAll(string(data), "\r\n", "\n")
+		lines := strings.Split(text, "\n")
+		partial = lines[len(lines)-1]
+		for _, line := range lines[:len(lines)-1] {
+			if line == "" {
+				continue
+			}
+			j.appendFromLog(line)
+		}
+	}
+	for {
+		select {
+		case <-ticker.C:
+			flush()
+		case <-stop:
+			// Final read, then emit whatever unterminated text is left.
+			flush()
+			if strings.TrimSpace(partial) != "" {
+				j.appendFromLog(partial)
+			}
+			return
+		}
+	}
+}
+
+// readTaskLogDelta reads the bytes of the file at path that follow offset,
+// capped at 1 MiB per call. It returns the bytes read, the offset to use for
+// the next call, and any error. When the file is shorter than offset, reading
+// restarts from the beginning of the file.
+func readTaskLogDelta(path string, offset int64) ([]byte, int64, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, offset, err
+	}
+	defer f.Close()
+	info, err := f.Stat()
+	if err != nil {
+		return nil, offset, err
+	}
+	// The file shrank below the saved offset: start over from byte 0.
+	if info.Size() < offset {
+		offset = 0
+	}
+	if _, err := f.Seek(offset, io.SeekStart); err != nil {
+		return nil, offset, err
+	}
+	// 1<<20 bytes = 1 MiB upper bound per read.
+	data, err := io.ReadAll(io.LimitReader(f, 1<<20))
+	return data, offset + int64(len(data)), err
+}
+
+// finishExternalTask records the final outcome of an externally run task while
+// holding q.mu, finishes the job state, and nudges the worker via q.trigger.
+//
+// The final status is chosen in this order: the runner state, when it can be
+// read and reports a status other than running; otherwise failure with
+// waitErr's message when waitErr is non-nil; otherwise failure with
+// "task runner exited without final state". A task that already has a terminal
+// status is left unchanged apart from finishing its job.
+func (q *taskQueue) finishExternalTask(t *Task, j *jobState, waitErr error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	// Already terminal: only make sure the job is closed and wake the worker.
+	if t.Status == TaskDone || t.Status == TaskFailed || t.Status == TaskCancelled {
+		if j != nil && !j.isDone() {
+			j.finish(t.ErrMsg)
+			j.closeLog()
+		}
+		// Non-blocking send: skipped when the channel cannot accept it now.
+		select {
+		case q.trigger <- struct{}{}:
+		default:
+		}
+		return
+	}
+
+	state, ok := readTaskRunnerState(t)
+	switch {
+	case ok && state.Status != TaskRunning:
+		// The runner reported its own final state; trust it.
+		t.Status = state.Status
+		t.ErrMsg = state.Error
+		now := state.UpdatedAt
+		if now.IsZero() {
+			now = time.Now()
+		}
+		t.DoneAt = &now
+	case waitErr != nil:
+		now := time.Now()
+		t.Status = TaskFailed
+		t.ErrMsg = waitErr.Error()
+		t.DoneAt = &now
+	default:
+		// The runner is gone but never wrote a final state.
+		now := time.Now()
+		t.Status = TaskFailed
+		t.ErrMsg = "task runner exited without final state"
+		t.DoneAt = &now
+	}
+	t.runnerPID = 0
+	q.finalizeTaskArtifactPathsLocked(t)
+	q.persistLocked()
+
+	if j != nil && !j.isDone() {
+		j.finish(t.ErrMsg)
+		j.closeLog()
+	}
+	if t.ErrMsg != "" {
+		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
+	} else {
+		taskSerialEvent(t, "finished with status="+t.Status)
+	}
+	// Non-blocking send: skipped when the channel cannot accept it now.
+	select {
+	case q.trigger <- struct{}{}:
+	default:
+	}
+}
+
 func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
 	startedKmsgWatch := false
 	defer q.finalizeTaskRun(t, j)
@@ -559,6 +815,7 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
 	if err := writeTaskReportArtifacts(t); err != nil {
 		appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
 	}
+	j.closeLog()
 	if t.ErrMsg != "" {
 		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
 		return
@@ -587,8 +844,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 	}
 	a := q.opts.App

+	recovered := len(j.lines) > 0
 	j.append(fmt.Sprintf("Starting %s...", t.Name))
-	if len(j.lines) > 0 {
+	if recovered {
 		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
 	}

@@ -658,6 +916,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			RampTotal:         t.params.RampTotal,
 			RampRunID:         t.params.RampRunID,
 		}, j.append)
+	case "nvidia-bench-autotune":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
+			Profile: t.params.BenchmarkProfile,
+			SizeMB:  t.params.SizeMB,
+		}, t.params.BenchmarkKind, j.append)
 	case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -710,15 +977,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		dur := t.params.Duration
-		if t.params.BurnProfile != "" && dur <= 0 {
-			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
-		}
-		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
-			DurationSec: dur,
-			Loader:      platform.NvidiaStressLoaderNCCL,
-			GPUIndices:  t.params.GPUIndices,
-		}, j.append)
+		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -751,10 +1010,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		sizeMB, passes := 256, 1
-		if t.params.StressMode {
-			sizeMB, passes = 1024, 3
-		}
+		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
+		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
 		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
 	case "storage":
 		if a == nil {
@@ -956,15 +1213,11 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
 		taskSerialEvent(t, "finished with status="+t.Status)
 		writeJSON(w, map[string]string{"status": "cancelled"})
 	case TaskRunning:
-		if t.job != nil {
-			t.job.abort()
+		if t.job == nil || !t.job.abort() {
+			writeError(w, http.StatusConflict, "task is not cancellable")
+			return
 		}
-		t.Status = TaskCancelled
-		now := time.Now()
-		t.DoneAt = &now
-		globalQueue.persistLocked()
-		taskSerialEvent(t, "finished with status="+t.Status)
-		writeJSON(w, map[string]string{"status": "cancelled"})
+		writeJSON(w, map[string]string{"status": "aborting"})
 	default:
 		writeError(w, http.StatusConflict, "task is not running or pending")
 	}
@@ -1010,9 +1263,6 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 			if t.job != nil {
 				t.job.abort()
 			}
-			t.Status = TaskCancelled
-			t.DoneAt = &now
-			taskSerialEvent(t, "finished with status="+t.Status)
 			n++
 		}
 	}
@@ -1037,6 +1287,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
@@ -1140,15 +1393,29 @@ func (q *taskQueue) loadLocked() {
 		}
 		q.assignTaskLogPathLocked(t)
 		if t.Status == TaskRunning {
-			// The task was interrupted by a bee-web restart. Child processes
-			// (e.g. bee-gpu-burn-worker) survive the restart in their own
-			// process groups and cannot be cancelled retroactively. Mark the
-			// task as failed so the user can decide whether to re-run it
-			// rather than blindly re-launching duplicate workers.
-			now := time.Now()
-			t.Status = TaskFailed
-			t.DoneAt = &now
-			t.ErrMsg = "interrupted by bee-web restart"
+			state, ok := readTaskRunnerState(t)
+			switch {
+			case ok && state.Status == TaskRunning && processAlive(state.PID):
+				t.runnerPID = state.PID
+				t.job = newTaskJobState(t.LogPath)
+			case ok && state.Status != TaskRunning:
+				t.runnerPID = state.PID
+				t.Status = state.Status
+				t.ErrMsg = state.Error
+				now := state.UpdatedAt
+				if now.IsZero() {
+					now = time.Now()
+				}
+				t.DoneAt = &now
+			default:
+				if taskMayLeaveOrphanWorkers(t.Target) {
+					_ = platform.KillTestWorkers()
+				}
+				now := time.Now()
+				t.Status = TaskFailed
+				t.DoneAt = &now
+				t.ErrMsg = "interrupted by bee-web restart"
+			}
 		} else if t.Status == TaskPending {
 			t.StartedAt = nil
 			t.DoneAt = nil
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -126,6 +126,23 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
 	}
 }

+// TestJobAppendFlushesTaskLogImmediately checks that a line passed to append
+// is readable from the task log file before closeLog is called.
+func TestJobAppendFlushesTaskLogImmediately(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "task.log")
+	j := newTaskJobState(path)
+
+	j.append("live-line")
+
+	// Read the file while the log is still open.
+	data, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if string(data) != "live-line\n" {
+		t.Fatalf("log=%q want live-line newline", string(data))
+	}
+	j.closeLog()
+}
+
 func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
 	now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
 	q := &taskQueue{
@@ -672,6 +689,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
 	}
 }

+// TestRunTaskUsesQuickPresetForMemoryValidate checks that a "memory" task with
+// StressMode set and no burn profile runs the memory pack with 512 MB x 1 pass.
+func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
+	var gotSizeMB, gotPasses int
+	q := &taskQueue{
+		opts: &HandlerOptions{App: &app.App{}},
+	}
+	tk := &Task{
+		ID:        "mem-validate-1",
+		Name:      "Memory SAT",
+		Target:    "memory",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{StressMode: true},
+	}
+	j := &jobState{}
+
+	// Replace the memory pack runner with a stub that records its arguments.
+	orig := runMemoryAcceptancePackCtx
+	runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
+		gotSizeMB = sizeMB
+		gotPasses = passes
+		return "/tmp/memory-validate.tar.gz", nil
+	}
+	defer func() { runMemoryAcceptancePackCtx = orig }()
+
+	q.runTask(tk, j, context.Background())
+
+	if gotSizeMB != 512 || gotPasses != 1 {
+		t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
+	}
+}
+
 func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
 	dir := t.TempDir()
 	q := &taskQueue{
@@ -819,3 +866,82 @@ func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
 		t.Fatalf("expected kmsg window to be cleared, got %+v", window)
 	}
 }
+
+// TestRunTaskExternalOpensAndClosesKmsgWindow checks that runTaskExternal keeps
+// a kmsg watch window open for a "cpu" task while the external runner is alive
+// and clears it once the runner exits. The real runner is replaced by a shell
+// script that creates a "ready" file and then waits for a "release" file.
+func TestRunTaskExternalOpensAndClosesKmsgWindow(t *testing.T) {
+	dir := t.TempDir()
+	releasePath := filepath.Join(dir, "release")
+	readyPath := filepath.Join(dir, "ready")
+	q := &taskQueue{
+		opts:        &HandlerOptions{ExportDir: dir},
+		logsDir:     filepath.Join(dir, "tasks"),
+		kmsgWatcher: newKmsgWatcher(nil),
+		trigger:     make(chan struct{}, 1),
+	}
+	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	tk := &Task{
+		ID:        "cpu-external-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+	}
+	q.assignTaskLogPathLocked(tk)
+	j := newTaskJobState(tk.LogPath)
+
+	// Stub runner: signal readiness, then block until the test releases it.
+	orig := externalTaskRunnerCommand
+	externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
+		script := "printf ready > \"$1\"; while [ ! -f \"$2\" ]; do sleep 0.05; done"
+		return exec.Command("sh", "-c", script, "sh", readyPath, releasePath), nil
+	}
+	defer func() { externalTaskRunnerCommand = orig }()
+
+	done := make(chan struct{})
+	go func() {
+		q.runTaskExternal(tk, j)
+		close(done)
+	}()
+
+	// Wait up to two seconds for the stub runner to report that it started.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		if _, err := os.Stat(readyPath); err == nil {
+			break
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	if _, err := os.Stat(readyPath); err != nil {
+		t.Fatalf("external runner did not start: %v", err)
+	}
+
+	// While the runner is alive exactly one window targeting "cpu" is expected.
+	q.kmsgWatcher.mu.Lock()
+	activeCount := q.kmsgWatcher.activeCount
+	window := q.kmsgWatcher.window
+	q.kmsgWatcher.mu.Unlock()
+	if activeCount != 1 {
+		t.Fatalf("activeCount while running=%d want 1", activeCount)
+	}
+	if window == nil || len(window.targets) != 1 || window.targets[0] != "cpu" {
+		t.Fatalf("window while running=%+v", window)
+	}
+
+	// Let the stub runner exit and wait for runTaskExternal to return.
+	if err := os.WriteFile(releasePath, []byte("1\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("runTaskExternal did not return")
+	}
+
+	// After the run the watcher must be idle again.
+	q.kmsgWatcher.mu.Lock()
+	activeCount = q.kmsgWatcher.activeCount
+	window = q.kmsgWatcher.window
+	q.kmsgWatcher.mu.Unlock()
+	if activeCount != 0 {
+		t.Fatalf("activeCount after finish=%d want 0", activeCount)
+	}
+	if window != nil {
+		t.Fatalf("expected kmsg window to be cleared, got %+v", window)
+	}
+}
--- a/audit/internal/webui/viewer_snapshot.go
+++ b/audit/internal/webui/viewer_snapshot.go
@@ -0,0 +1,62 @@
+package webui
+
+import (
+	"encoding/json"
+	"strconv"
+)
+
+// enrichSnapshotForViewer adds a "block_format" string to every entry of
+// hardware.storage in the JSON snapshot that does not already have one. The
+// value is "<logical_block_size_bytes>+<metadata_bytes_per_block>" and is only
+// added when both fields are numeric, the logical size is positive and the
+// metadata size is not negative.
+//
+// The input is returned unchanged when it is empty, is not a JSON object, has
+// no hardware.storage entries, needs no additions, or cannot be re-encoded.
+func enrichSnapshotForViewer(snapshot []byte) []byte {
+	if len(snapshot) == 0 {
+		return snapshot
+	}
+	var root map[string]any
+	if err := json.Unmarshal(snapshot, &root); err != nil {
+		return snapshot
+	}
+	// A failed type assertion yields a nil map or slice, which the length
+	// checks below treat the same as "absent".
+	hardware, _ := root["hardware"].(map[string]any)
+	if len(hardware) == 0 {
+		return snapshot
+	}
+	storage, _ := hardware["storage"].([]any)
+	if len(storage) == 0 {
+		return snapshot
+	}
+	changed := false
+	for _, item := range storage {
+		row, _ := item.(map[string]any)
+		if len(row) == 0 {
+			continue
+		}
+		// Never overwrite a block_format that is already present.
+		if _, exists := row["block_format"]; exists {
+			continue
+		}
+		logical, okLogical := jsonNumberToInt64(row["logical_block_size_bytes"])
+		metadata, okMetadata := jsonNumberToInt64(row["metadata_bytes_per_block"])
+		if !okLogical || !okMetadata || logical <= 0 || metadata < 0 {
+			continue
+		}
+		row["block_format"] = strconv.FormatInt(logical, 10) + "+" + strconv.FormatInt(metadata, 10)
+		changed = true
+	}
+	// Skip re-encoding when nothing was added so the original bytes are kept.
+	if !changed {
+		return snapshot
+	}
+	out, err := json.Marshal(root)
+	if err != nil {
+		return snapshot
+	}
+	return out
+}
+
+// jsonNumberToInt64 converts a decoded JSON number to int64.
+//
+// It accepts float64 (the type encoding/json produces for numbers decoded into
+// an any value) as well as int64 and int. A float64 is accepted only when it
+// holds an exact integer within the int64 range; fractional, NaN, infinite and
+// out-of-range values report false instead of being silently truncated.
+// Any other type reports false.
+func jsonNumberToInt64(v any) (int64, bool) {
+	switch x := v.(type) {
+	case float64:
+		// Both bounds are exactly representable as float64: -2^63 is the
+		// smallest int64 and 2^63 is the first value past the largest.
+		if x < -(1<<63) || x >= 1<<63 {
+			return 0, false
+		}
+		n := int64(x)
+		// The round trip fails for fractional values and for NaN.
+		if float64(n) != x {
+			return 0, false
+		}
+		return n, true
+	case int64:
+		return x, true
+	case int:
+		return int64(x), true
+	default:
+		return 0, false
+	}
+}
--- a/2
+++ b/2
--- a/bible-local/README.md
+++ b/bible-local/README.md
@@ -9,5 +9,62 @@ Generic engineering rules live in `bible/rules/patterns/`.
 |---|---|
 | `architecture/system-overview.md` | What bee does, scope, tech stack |
 | `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
+| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list |
 | `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
-| `decisions/` | Architectural decision log |
+| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
+| `decisions/` | Architectural decision log, including read-only submodule policy |
+
+## Validate Test Matrix
+
+### Validate
+
+- CPU check
+  - `lscpu`
+  - `sensors`
+  - `stress-ng`
+- Memory check
+  - `free`
+  - `timeout <timeout_sec> memtester`
+  - `free`
+- NVMe storage check
+  - `nvme id-ctrl`
+  - `nvme smart-log`
+  - `nvme device-self-test`
+- SATA/SAS storage check
+  - `smartctl -H -A`
+  - `smartctl -t short`
+- Basic NVIDIA GPU check
+  - `nvidia-smi -pm 1`
+  - `nvidia-smi -q`
+  - `dmidecode -t baseboard`
+  - `dmidecode -t system`
+  - `dcgmi diag -r 2`
+- Inter-GPU communication check
+  - `all_reduce_perf`
+- GPU bandwidth check
+  - `dcgmi diag -r nvbandwidth`
+
+### Validate -> Stress
+
+- Extended NVIDIA GPU check
+  - `nvidia-smi -pm 1`
+  - `nvidia-smi -q`
+  - `dmidecode -t baseboard`
+  - `dmidecode -t system`
+  - `dcgmi diag -r 3`
+- NVIDIA targeted stress
+  - `nvidia-smi -pm 1`
+  - `nvidia-smi -q`
+  - `dcgmi diag -r targeted_stress`
+- NVIDIA targeted power
+  - `nvidia-smi -pm 1`
+  - `nvidia-smi -q`
+  - `dcgmi diag -r targeted_power`
+- NVIDIA pulse test
+  - `nvidia-smi -pm 1`
+  - `nvidia-smi -q`
+  - `dcgmi diag -r pulse_test`
+- Inter-GPU communication check
+  - `all_reduce_perf`
+- GPU bandwidth check
+  - `dcgmi diag -r nvbandwidth`
--- a/bible-local/architecture/api-surface.md
+++ b/bible-local/architecture/api-surface.md
@@ -0,0 +1,185 @@
+# API Surface
+
+HTTP endpoints exposed by `bee web` (binds `0.0.0.0:80`).
+Handler registration: `audit/internal/webui/server.go` → `NewHandler()`.
+
+---
+
+## Health & readiness
+
+| Method | Path           | Description                                         |
+|--------|----------------|-----------------------------------------------------|
+| GET    | `/healthz`     | Always 200. Used by load balancers / boot scripts.  |
+| GET    | `/api/ready`   | 200 when audit JSON exists and is readable.         |
+| GET    | `/loading`     | HTML loading page shown before first audit.        |
+
+---
+
+## Audit
+
+| Method | Path                  | Description                                                  |
+|--------|-----------------------|--------------------------------------------------------------|
+| GET    | `/audit.json`         | Latest audit JSON with SAT overlay applied.                  |
+| GET    | `/runtime-health.json`| Latest runtime preflight JSON.                               |
+| POST   | `/api/audit/run`      | Enqueue a full `bee audit` run. Returns task ID.             |
+| GET    | `/api/audit/stream`   | SSE: audit run log lines (`data:` + newline per line).       |
+| GET    | `/api/preflight`      | Run runtime preflight check (synchronous, returns JSON).     |
+| GET    | `/api/hardware-summary` | Hardware health summary (status counts + failures).        |
+| GET    | `/api/components/{type}` | HTML fragment for component detail dialog (e.g. `cpu`, `memory`, `storage`, `pcie`). |
+
+---
+
+## SAT (System Acceptance Testing)
+
+All SAT run endpoints enqueue an async task. Response: `{"task_id": "..."}`.
+
+| Method | Path                                       | Description                       |
+|--------|--------------------------------------------|-----------------------------------|
+| POST   | `/api/sat/nvidia/run`                      | NVIDIA DCGM SAT                   |
+| POST   | `/api/sat/nvidia-targeted-stress/run`      | NVIDIA targeted stress validate   |
+| POST   | `/api/sat/nvidia-compute/run`              | NVIDIA max compute load           |
+| POST   | `/api/sat/nvidia-targeted-power/run`       | NVIDIA targeted power             |
+| POST   | `/api/sat/nvidia-pulse/run`                | NVIDIA pulse test                 |
+| POST   | `/api/sat/nvidia-interconnect/run`         | NCCL all_reduce_perf              |
+| POST   | `/api/sat/nvidia-bandwidth/run`            | NVBandwidth test                  |
+| POST   | `/api/sat/nvidia-stress/run`               | NVIDIA stress pack                |
+| POST   | `/api/sat/memory/run`                      | Memory acceptance                 |
+| POST   | `/api/sat/storage/run`                     | Storage acceptance (smartctl)     |
+| POST   | `/api/sat/cpu/run`                         | CPU acceptance (stress-ng)        |
+| POST   | `/api/sat/amd/run`                         | AMD GPU SAT (ROCm)                |
+| POST   | `/api/sat/amd-mem/run`                     | AMD memory integrity + bandwidth  |
+| POST   | `/api/sat/amd-bandwidth/run`               | AMD memory bandwidth              |
+| POST   | `/api/sat/amd-stress/run`                  | AMD GPU stress                    |
+| POST   | `/api/sat/memory-stress/run`               | Memory stress                     |
+| POST   | `/api/sat/sat-stress/run`                  | Combined storage+memory stress    |
+| POST   | `/api/sat/platform-stress/run`             | Fan + thermal stress              |
+| GET    | `/api/sat/stream`                          | SSE: live SAT log stream          |
+| POST   | `/api/sat/abort`                           | Abort the running SAT task        |
+
+---
+
+## Benchmarks
+
+| Method | Path                                    | Description                                  |
+|--------|-----------------------------------------|----------------------------------------------|
+| POST   | `/api/bee-bench/nvidia/perf/run`        | NVIDIA performance benchmark                 |
+| POST   | `/api/bee-bench/nvidia/power/run`       | NVIDIA power benchmark                       |
+| POST   | `/api/bee-bench/nvidia/autotune/run`    | Power source autotune (prerequisite for benchmarks) |
+| GET    | `/api/bee-bench/nvidia/autotune/status` | Current autotune result / status             |
+| GET    | `/api/benchmark/results`               | List completed benchmark result archives      |
+
+---
+
+## Tasks (async job queue)
+
+| Method | Path                        | Description                                        |
+|--------|-----------------------------|----------------------------------------------------|
+| GET    | `/api/tasks`                | List all tasks with status                         |
+| POST   | `/api/tasks/cancel-all`     | Cancel all pending/running tasks                   |
+| POST   | `/api/tasks/kill-workers`   | Force-kill worker processes                        |
+| POST   | `/api/tasks/{id}/cancel`    | Cancel a specific task                             |
+| POST   | `/api/tasks/{id}/priority`  | Elevate task priority                              |
+| GET    | `/api/tasks/{id}/stream`    | SSE: live log stream for a task                    |
+| GET    | `/api/tasks/{id}/charts`    | List chart names for a task                        |
+| GET    | `/api/tasks/{id}/chart/`    | SVG chart for a task result                        |
+| GET    | `/tasks/{id}`               | HTML task detail page                              |
+
+---
+
+## Services
+
+| Method | Path                      | Description                                      |
+|--------|---------------------------|--------------------------------------------------|
+| GET    | `/api/services`           | List bee-* systemd services and their states     |
+| POST   | `/api/services/action`    | start/stop/restart a service                     |
+
+---
+
+## Network
+
+| Method | Path                       | Description                                         |
+|--------|----------------------------|-----------------------------------------------------|
+| GET    | `/api/network`             | List interfaces with state and IPv4 addresses       |
+| POST   | `/api/network/dhcp`        | Run dhclient on one or all interfaces               |
+| POST   | `/api/network/static`      | Set static IPv4 address                             |
+| POST   | `/api/network/toggle`      | Bring interface up or down                          |
+| POST   | `/api/network/confirm`     | Confirm pending network change (clears rollback)   |
+| POST   | `/api/network/rollback`    | Restore pre-change network snapshot                 |
+
+---
+
+## Export
+
+| Method | Path                          | Description                                       |
+|--------|-------------------------------|---------------------------------------------------|
+| GET    | `/export/support.tar.gz`      | Download support bundle (live-generated)          |
+| GET    | `/export/file`                | Download a file from the export dir by path param |
+| GET    | `/export/`                    | Browse export dir (HTML index)                    |
+| GET    | `/api/export/list`            | JSON list of files in export dir                  |
+| GET    | `/api/export/usb`             | List removable USB targets available for export   |
+
+---
+
+## GPU
+
+| Method | Path                       | Description                                        |
+|--------|----------------------------|----------------------------------------------------|
+| GET    | `/api/gpu/presence`        | `{"nvidia": bool, "amd": bool}`                    |
+| GET    | `/api/gpu/nvidia`          | List NVIDIA GPUs from nvidia-smi                   |
+| GET    | `/api/gpu/nvidia-status`   | Per-GPU status (ECC, power, throttle)              |
+| POST   | `/api/gpu/nvidia-reset`    | GPU reset by index                                 |
+| GET    | `/api/gpu/tools`           | nvidia-smi / rocm-smi tool availability            |
+
+---
+
+## System
+
+| Method | Path                         | Description                                       |
+|--------|------------------------------|---------------------------------------------------|
+| GET    | `/api/system/ram-status`     | toram boot state and ISO copy status              |
+| POST   | `/api/system/install-to-ram` | Copy ISO to RAM (background task)                 |
+| GET    | `/api/install/disks`         | List block devices suitable for disk installation |
+| POST   | `/api/install/run`           | Install bee to disk (background task)             |
+
+---
+
+## Tools & NVMe
+
+| Method | Path                          | Description                                      |
+|--------|-------------------------------|--------------------------------------------------|
+| GET    | `/api/tools/check`            | Check availability of required CLI tools         |
+| GET    | `/api/tools/nvme-formats`     | List NVMe format options for a device            |
+| POST   | `/api/tools/nvme-format/run`  | Run nvme-format on a device                      |
+
+---
+
+## Live metrics
+
+| Method | Path                         | Description                                       |
+|--------|------------------------------|---------------------------------------------------|
+| GET    | `/api/metrics/stream`        | SSE: live metrics (GPU power, temp, utilization)  |
+| GET    | `/api/metrics/latest`        | Latest metrics snapshot (JSON)                    |
+| GET    | `/api/metrics/chart/`        | SVG chart for a metric over time                  |
+| GET    | `/api/metrics/export.csv`    | Download metrics history as CSV                   |
+
+---
+
+## Blackbox logging
+
+| Method | Path                       | Description                                   |
+|--------|----------------------------|-----------------------------------------------|
+| GET    | `/api/blackbox/status`     | Blackbox log state (enabled, size, path)      |
+| POST   | `/api/blackbox/enable`     | Start recording blackbox log                  |
+| POST   | `/api/blackbox/disable`    | Stop recording, flush to disk                 |
+
+---
+
+## UI pages
+
+| Method | Path       | Description                                   |
+|--------|------------|-----------------------------------------------|
+| GET    | `/`        | Main dashboard (serves all page routes)        |
+| GET    | `/viewer`  | Standalone JSON viewer for uploaded audit files |
+
+All pages are rendered server-side as HTML. The `/` route handles sub-paths such as
+`/network`, `/services`, `/sat`, `/benchmark`, `/install`, `/validate`, `/export`.
--- a/bible-local/architecture/data-model.md
+++ b/bible-local/architecture/data-model.md
@@ -0,0 +1,137 @@
+# Data Model
+
+The canonical output of `bee audit` is a `HardwareIngestRequest` JSON document accepted
+by the Reanimator `/api/ingest/hardware` endpoint. The ingest endpoint uses a strict
+decoder — unknown fields cause `400 Bad Request`.
+
+Source of truth: `audit/internal/schema/hardware.go`
+
+---
+
+## Top-level: HardwareIngestRequest
+
+```
+HardwareIngestRequest
+├── collected_at      string          RFC3339 UTC timestamp of collection
+├── hardware          HardwareSnapshot
+├── runtime           RuntimeHealth?  from bee-runtime-preflight service
+├── filename          string?
+├── source_type       string?
+├── protocol          string?
+└── target_host       string?
+```
+
+`collected_at` is the primary sort key used by Reanimator to deduplicate ingests.
+
+---
+
+## HardwareSnapshot
+
+All component arrays are `omitempty` — absent when the collector finds nothing.
+
+| JSON key          | Go type                    | Source                       |
+|-------------------|----------------------------|------------------------------|
+| `board`           | HardwareBoard              | dmidecode type 1/2           |
+| `firmware`        | []HardwareFirmwareRecord   | dmidecode type 0/13          |
+| `cpus`            | []HardwareCPU              | dmidecode type 4             |
+| `memory`          | []HardwareMemory           | dmidecode type 17            |
+| `storage`         | []HardwareStorage          | lsblk + nvme-cli + smartctl  |
+| `pcie_devices`    | []HardwarePCIeDevice       | lspci                        |
+| `power_supplies`  | []HardwarePowerSupply      | ipmitool fru + sdr           |
+| `sensors`         | *HardwareSensors           | sensors -j                   |
+| `event_logs`      | []HardwareEventLog         | ipmitool sel + journald      |
+| `platform_config` | *json.RawMessage           | reserved, nil until used     |
+| `vroc_license`    | *string                    | vroc-cli                     |
+
+---
+
+## Identity keys
+
+Reanimator uses these fields to match components across successive audits:
+
+| Component      | Identity key                                    |
+|----------------|------------------------------------------------|
+| Board          | `board.serial_number` (required, never empty)  |
+| CPU            | `serial_number` if present; else generated key |
+| Memory DIMM    | `serial_number` — absent DIMMs have `present: false` |
+| Storage        | `serial_number` if present; else `linux_device` from `telemetry` |
+| PCIe device    | `bdf` (Bus:Device.Function address)            |
+| PSU            | `slot`                                          |
+
+Components without a stable identity are still emitted but may not be matched across runs.
+
+---
+
+## HardwareComponentStatus (embedded in all components)
+
+```go
+type HardwareComponentStatus struct {
+    Status           *string `json:"status,omitempty"`            // OK | Warning | Critical | Unknown
+    ErrorDescription *string `json:"error_description,omitempty"`
+}
+```
+
+Status is set by collectors and overwritten at render time by `ApplySATOverlay`
+(latest SAT run results are always merged on top before display).
+
+---
+
+## HardwarePCIeDevice
+
+The most enriched component type. Key fields:
+
+| JSON key             | Meaning                                        |
+|----------------------|------------------------------------------------|
+| `bdf`                | PCI address (identity key), e.g. `0000:4b:00.0` |
+| `vendor_id`          | Numeric PCI vendor ID (hex). Use this for classification — not `manufacturer`. |
+| `device_id`          | Numeric PCI device ID (hex)                    |
+| `device_class`       | Human-readable class, e.g. `VideoController`   |
+| `manufacturer`       | String label from lspci — for display only     |
+| `model`              | From nvidia-smi / rocm-smi — display name      |
+| `link_speed`         | Current PCIe link speed, e.g. `Gen4`           |
+| `max_link_speed`     | Max supported speed (link capability)          |
+| `link_width`         | Current lane count                             |
+| `max_link_width`     | Max lane count                                 |
+| `temperature_c`      | From nvidia-smi / rocm-smi                     |
+| `power_w`            | Current power draw                             |
+| `ecc_uncorrected_total` | Cumulative ECC uncorrected errors (NVIDIA)  |
+| `ecc_corrected_total`   | Cumulative ECC corrected errors (NVIDIA)    |
+| `hw_slowdown`        | HW throttle active (NVIDIA)                    |
+| `telemetry`          | Free-form map for vendor-specific extras       |
+
+**Classification rule**: use `vendor_id` (numeric PCI ID), never `manufacturer` string.
+
+| Vendor    | vendor_id |
+|-----------|-----------|
+| NVIDIA    | `0x10de`  |
+| AMD       | `0x1002`  |
+| Mellanox  | `0x15b3`  |
+| Aspeed    | `0x1a03`  |
+| Intel     | `0x8086`  |
+
+Constants live in `audit/internal/collector/pci_vendors.go`.
+
+---
+
+## HardwareMemory
+
+`location` field exists in the Go struct with `json:"-"` — it is intentionally excluded
+from JSON output because the Reanimator schema does not include it. It is used internally
+for DIMM telemetry matching only (`collector/memory_telemetry.go`).
+
+---
+
+## HardwareSensors
+
+Sensor structs (`HardwareFanSensor`, `HardwareTemperatureSensor`,
+`HardwarePowerSensor`, `HardwareOtherSensor`) do **not** have a `location` field.
+Location was removed in contract v2.8. The Go types mirror the schema exactly.
+
+---
+
+## JSON naming convention
+
+All JSON keys are `snake_case`. Go field names are `CamelCase`. The mapping is
+maintained by struct tags in `audit/internal/schema/hardware.go`.
+
+All pointer fields use `omitempty` — absent means not collected (not zero).
--- a/bible-local/architecture/runtime-flows.md
+++ b/bible-local/architecture/runtime-flows.md
@@ -149,7 +149,6 @@ Current validation state:
  6. psu collector     (ipmitool fru + sdr — silent if no /dev/ipmi0)
  7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
  8. output JSON → /var/log/bee-audit.json
-  9. QR summary to stdout (qrencode if available)
 ```

 Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
--- a/bible-local/architecture/system-overview.md
+++ b/bible-local/architecture/system-overview.md
@@ -58,6 +58,8 @@ Fills gaps where Redfish/logpile is blind:
 - `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
 - Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
 - Contract fields that have no honest local source on a generic Linux host may remain empty.
+- Embedded submodules such as `internal/chart/` and `bible/` are read-only for `bee` feature work.
+- If the UI needs extra information, `bee` must emit it through the standard audit JSON contract rather than patching `chart`.

 ## Tech stack

@@ -101,7 +103,7 @@ Fills gaps where Redfish/logpile is blind:
 | `iso/builder/` | ISO build scripts and `live-build` profile |
 | `iso/overlay/` | Source overlay copied into a staged build overlay |
 | `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
-| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
+| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web`; update by submodule pointer only, never by local `bee`-specific edits |
 | `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
 | `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
 | `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
--- a/bible-local/decisions/2026-04-29-read-only-embedded-submodules.md
+++ b/bible-local/decisions/2026-04-29-read-only-embedded-submodules.md
@@ -0,0 +1,39 @@
+# Decision: Treat embedded submodules as read-only
+
+## Context
+
+`bee` embeds external git submodules such as:
+
+- `internal/chart/` — `reanimator/chart`, a generic read-only viewer for Reanimator JSON snapshots
+- `bible/` — shared engineering rules and contracts
+
+These repositories are reused by other projects. A local feature request in `bee`
+must not be solved by silently changing shared submodule behavior.
+
+The concrete failure mode here was attempting to add project-specific storage
+telemetry presentation by editing `internal/chart/`. That couples a shared viewer
+to one host application's needs and creates hidden cross-project regressions.
+
+## Decision
+
+Embedded submodules are read-only from the point of view of `bee`.
+
+- Do not implement `bee`-specific behavior by editing `internal/chart/`.
+- Do not implement `bee`-specific behavior by editing `bible/`.
+- If `bee` needs new data in the report, produce it in the standard audit JSON
+  emitted by `bee` itself.
+- `chart` must continue to consume the canonical snapshot as an external viewer,
+  without host-specific forks.
+- Updating a submodule pointer to an upstream commit is allowed.
+- Carrying local unmerged submodule commits as part of a `bee` feature is forbidden.
+
+## Consequences
+
+- Audit/report features must be expressed through the contract in
+  `bible-local/docs/hardware-ingest-contract.md`.
+- `bee` owns collection, normalization, and serialization of storage telemetry in
+  `hardware.storage[]`.
+- `chart` remains a pure visualization module that reads the snapshot it is given.
+- If a capability is genuinely missing in a shared submodule, it must be proposed
+  and landed upstream as a generic change first, then pulled into `bee` via a
+  normal submodule update.
--- a/bible-local/decisions/2026-06-12-pcie-disabled-device-link-warning.md
+++ b/bible-local/decisions/2026-06-12-pcie-disabled-device-link-warning.md
@@ -0,0 +1,41 @@
+# Decision: Skip PCIe link-speed warnings for disabled devices
+
+**Date:** 2026-06-12
+**Status:** active
+
+## Context
+
+On HGX H100 SXM5 baseboards, the Microchip Switchtec PM41028 PSX PCIe switch
+(vendor 11F8, device 4128, NVIDIA subsystem 10DE:1643) appears in `lspci` as a
+"Memory controller". Its upstream link trains at Gen3 x2 while the device is
+capable of Gen4 x16. The device is permanently in a disabled state: memory access
+and bus-mastering are both off (Mem-, BusMaster-); `/sys/bus/pci/devices/<bdf>/enable`
+reads `0`.
+
+This chip is the PCIe fabric management endpoint for the NVSwitch interconnect — it
+carries only management traffic at low bandwidth and is intentionally not activated
+by any Linux driver. The bee audit was reporting a `statusWarning` with message
+"PCIe link speed degraded" for this device, which is misleading because the device
+is not in the data path.
+
+## Decision
+
+`applyPCIeLinkSpeedWarning` reads `/sys/bus/pci/devices/<bdf>/enable` via the
+existing `readPCIIntAttribute` helper. If the value is `0`, the function returns
+early without setting any warning status.
+
+The check is vendor-agnostic: it applies to any PCIe device that Linux has not
+activated, regardless of make or model. This is consistent with the
+`no-hardcoded-vendors` contract — no vendor ID, device ID, or name string is
+used as a condition.
+
+## Consequences
+
+- PCIe fabric management endpoints, IPMI virtual devices, and other permanently
+  disabled PCIe functions no longer produce spurious link-degradation warnings.
+- Real link degradation on active devices (GPUs, NICs, NVMe, NVLink bridges)
+  continues to be detected and reported as before.
+- NVLink bridge cards retain their existing `statusCritical` path (they are always
+  enabled, so the early return is never taken for them).
+- The Switchtec device on HGX H100 boards shows `statusOK` with no
+  `error_description` in the audit JSON.
--- a/Show More
+++ b/Show More