Fix webui streaming recovery regressions

Add NVIDIA benchmark reporting flow
Add stability hardening and self-heal recovery
2026-04-05 10:39:09 +03:00 · 2026-04-05 10:30:56 +03:00 · 2026-04-05 10:29:37 +03:00 · 2026-04-05 09:57:38 +03:00 · 2026-04-04 15:23:15 +03:00 · 2026-04-04 15:18:43 +03:00
112 changed files with 14634 additions and 1135 deletions
--- a/PLAN.md
+++ b/PLAN.md
@@ -343,9 +343,9 @@ Planned code shape:
 - `bee tui` can rerun the audit manually
 - `bee tui` can export the latest audit JSON to removable media
 - `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
+- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
 - SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
+- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
 - removable export requires explicit target selection, mount, confirmation, copy, and cleanup

 ### 2.6 — Vendor utilities and optional assets
--- a/audit/Makefile
+++ b/audit/Makefile
@@ -1,5 +1,7 @@
 LISTEN ?= :8080
 AUDIT_PATH ?=
+VERSION ?= $(shell sh ./scripts/resolve-version.sh)
+GO_LDFLAGS := -X main.Version=$(VERSION)

 RUN_ARGS := web --listen $(LISTEN)
 ifneq ($(AUDIT_PATH),)
@@ -9,10 +11,10 @@ endif
 .PHONY: run build test

 run:
-	go run ./cmd/bee $(RUN_ARGS)
+	go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)

 build:
-	go build -o bee ./cmd/bee
+	go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee

 test:
 	go test ./...
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -1,11 +1,14 @@
 package main

 import (
+	"context"
 	"flag"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
+	"runtime/debug"
+	"strconv"
 	"strings"

 	"bee/audit/internal/app"
@@ -16,14 +19,31 @@ import (

 var Version = "dev"

+// buildLabel returns the build identifier handed to the web UI: the
+// whitespace-trimmed Version, or "dev" when Version is blank.
+func buildLabel() string {
+	if trimmed := strings.TrimSpace(Version); trimmed != "" {
+		return trimmed
+	}
+	return "dev"
+}
+
 func main() {
 	os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
 }

-func run(args []string, stdout, stderr io.Writer) int {
+func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level: slog.LevelInfo,
 	})))
+	defer func() {
+		if rec := recover(); rec != nil {
+			slog.Error("fatal panic",
+				"panic", fmt.Sprint(rec),
+				"stack", string(debug.Stack()),
+			)
+			exitCode = 1
+		}
+	}()

 	if len(args) == 0 {
 		printRootUsage(stderr)
@@ -49,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
 		return runWeb(args[1:], stdout, stderr)
 	case "sat":
 		return runSAT(args[1:], stdout, stderr)
+	case "benchmark":
+		return runBenchmark(args[1:], stdout, stderr)
 	case "version", "--version", "-version":
 		fmt.Fprintln(stdout, Version)
 		return 0
@@ -67,6 +89,7 @@ func printRootUsage(w io.Writer) {
  bee support-bundle --output stdout|file:<path>
  bee web     --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
  bee sat nvidia|memory|storage|cpu [--duration <seconds>]
+  bee benchmark nvidia [--profile standard|stability|overnight]
  bee version
  bee help [command]`)
 }
@@ -85,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 		return runWeb([]string{"--help"}, stdout, stdout)
 	case "sat":
 		return runSAT([]string{"--help"}, stdout, stderr)
+	case "benchmark":
+		return runBenchmark([]string{"--help"}, stdout, stderr)
 	case "version":
 		fmt.Fprintln(stdout, "usage: bee version")
 		return 0
@@ -139,7 +164,6 @@ func runAudit(args []string, stdout, stderr io.Writer) int {
 	return 0
 }

-
 func runExport(args []string, stdout, stderr io.Writer) int {
 	fs := flag.NewFlagSet("export", flag.ContinueOnError)
 	fs.SetOutput(stderr)
@@ -299,6 +323,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {

 	if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
 		Title:       *title,
+		BuildLabel:  buildLabel(),
 		AuditPath:   *auditPath,
 		ExportDir:   *exportDir,
 		App:         app.New(platform.New()),
@@ -323,6 +348,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	fs := flag.NewFlagSet("sat", flag.ContinueOnError)
 	fs.SetOutput(stderr)
 	duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
+	diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
 	if err := fs.Parse(args[1:]); err != nil {
 		if err == flag.ErrHelp {
 			return 0
@@ -337,7 +363,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	target := args[0]
 	if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
 		fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
-		fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
+		fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
 		return 2
 	}

@@ -346,19 +372,25 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 		archive string
 		err     error
 	)
+	logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
 	switch target {
 	case "nvidia":
-		archive, err = application.RunNvidiaAcceptancePack("")
+		level := *diagLevel
+		if level > 0 {
+			_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
+		} else {
+			archive, err = application.RunNvidiaAcceptancePack("", logLine)
+		}
 	case "memory":
-		archive, err = application.RunMemoryAcceptancePack("")
+		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
 	case "storage":
-		archive, err = application.RunStorageAcceptancePack("")
+		archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
 	case "cpu":
 		dur := *duration
 		if dur <= 0 {
 			dur = 60
 		}
-		archive, err = application.RunCPUAcceptancePack("", dur)
+		archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
 	}
 	if err != nil {
 		slog.Error("run sat", "target", target, "err", err)
@@ -367,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	slog.Info("sat archive written", "target", target, "path", archive)
 	return 0
 }
+
+// runBenchmark implements the `bee benchmark` subcommand.
+//
+// args[0] selects the target (only "nvidia" is accepted); the remaining
+// arguments are parsed as flags. Usage and validation errors are written to
+// stderr and yield exit code 2, help requests yield 0, and a failed benchmark
+// run yields 1. Progress lines emitted by the benchmark go to the injected
+// stderr writer so callers that capture output see them too.
+func runBenchmark(args []string, stdout, stderr io.Writer) int {
+	// Single source of truth for the usage line shared by every early exit.
+	const usage = "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]"
+
+	if len(args) == 0 {
+		fmt.Fprintln(stderr, usage)
+		return 2
+	}
+	if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
+		fmt.Fprintln(stdout, usage)
+		return 0
+	}
+	target := args[0]
+	if target != "nvidia" {
+		fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
+		fmt.Fprintln(stderr, usage)
+		return 2
+	}
+
+	fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+	profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
+	devices := fs.String("devices", "", "comma-separated GPU indices to include")
+	exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
+	sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
+	skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
+	if err := fs.Parse(args[1:]); err != nil {
+		if err == flag.ErrHelp {
+			return 0
+		}
+		return 2
+	}
+	if fs.NArg() != 0 {
+		fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
+		return 2
+	}
+
+	includeIndices, err := parseBenchmarkIndexCSV(*devices)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
+		return 2
+	}
+	excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
+		return 2
+	}
+
+	application := app.New(platform.New())
+	// Write progress to the injected writer rather than os.Stderr directly,
+	// matching how usage and validation errors are reported above.
+	logLine := func(s string) { fmt.Fprintln(stderr, s) }
+	archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
+		Profile:           *profile,
+		SizeMB:            *sizeMB,
+		GPUIndices:        includeIndices,
+		ExcludeGPUIndices: excludeIndices,
+		RunNCCL:           !*skipNCCL,
+	}, logLine)
+	if err != nil {
+		slog.Error("run benchmark", "target", target, "err", err)
+		return 1
+	}
+	slog.Info("benchmark archive written", "target", target, "path", archive)
+	return 0
+}
+
+// parseBenchmarkIndexCSV converts a comma-separated list of GPU indices into
+// a slice of ints. Surrounding whitespace is ignored and empty elements are
+// skipped; blank input yields a nil slice. Any element that is not a
+// non-negative integer produces an error naming that element.
+func parseBenchmarkIndexCSV(raw string) ([]int, error) {
+	var indices []int
+	for rest := strings.TrimSpace(raw); rest != ""; {
+		var token string
+		token, rest, _ = strings.Cut(rest, ",")
+		token = strings.TrimSpace(token)
+		if token == "" {
+			continue
+		}
+		index, convErr := strconv.Atoi(token)
+		if convErr != nil {
+			return nil, fmt.Errorf("bad gpu index %q", token)
+		}
+		if index < 0 {
+			return nil, fmt.Errorf("bad gpu index %q", token)
+		}
+		indices = append(indices, index)
+	}
+	return indices, nil
+}
--- a/audit/cmd/bee/main_test.go
+++ b/audit/cmd/bee/main_test.go
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
 }

 func TestRunVersion(t *testing.T) {
-	t.Parallel()
-
 	old := Version
 	Version = "test-version"
 	t.Cleanup(func() { Version = old })
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
 	}
 }

+// TestBuildLabelUsesVersionAsIs checks that buildLabel returns a non-blank
+// Version unchanged. It mutates the package-level Version and restores it on
+// cleanup, so it is not marked parallel.
+func TestBuildLabelUsesVersionAsIs(t *testing.T) {
+	const want = "1.2.3"
+
+	saved := Version
+	t.Cleanup(func() { Version = saved })
+	Version = want
+
+	got := buildLabel()
+	if got != want {
+		t.Fatalf("buildLabel=%q want %q", got, want)
+	}
+}
+
 func TestRunExportRequiresTarget(t *testing.T) {
 	t.Parallel()

--- a/audit/go.mod
+++ b/audit/go.mod
@@ -1,6 +1,6 @@
 module bee/audit

-go 1.24.0
+go 1.25.0

 replace reanimator/chart => ../internal/chart

@@ -13,5 +13,14 @@ require (
 	github.com/dustin/go-humanize v1.0.1 // indirect
 	github.com/go-analyze/bulk v0.1.3 // indirect
 	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
+	github.com/google/uuid v1.6.0 // indirect
+	github.com/mattn/go-isatty v0.0.20 // indirect
+	github.com/ncruces/go-strftime v1.0.0 // indirect
+	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
 	golang.org/x/image v0.24.0 // indirect
+	golang.org/x/sys v0.42.0 // indirect
+	modernc.org/libc v1.70.0 // indirect
+	modernc.org/mathutil v1.7.1 // indirect
+	modernc.org/memory v1.11.0 // indirect
+	modernc.org/sqlite v1.48.0 // indirect
 )
--- a/audit/go.sum
+++ b/audit/go.sum
@@ -8,11 +8,30 @@ github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00
 github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
 github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
 github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
+github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
+github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
 github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
 github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
 golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
 golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
+golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
+modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
+modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
+modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
+modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
+modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
+modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
+modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -19,17 +19,18 @@ import (
 )

 var (
-	DefaultExportDir       = "/appdata/bee/export"
-	DefaultAuditJSONPath   = DefaultExportDir + "/bee-audit.json"
-	DefaultAuditLogPath    = DefaultExportDir + "/bee-audit.log"
-	DefaultWebLogPath      = DefaultExportDir + "/bee-web.log"
-	DefaultNetworkLogPath  = DefaultExportDir + "/bee-network.log"
-	DefaultNvidiaLogPath   = DefaultExportDir + "/bee-nvidia.log"
-	DefaultSSHLogPath      = DefaultExportDir + "/bee-sshsetup.log"
-	DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
-	DefaultRuntimeLogPath  = DefaultExportDir + "/runtime-health.log"
-	DefaultTechDumpDir     = DefaultExportDir + "/techdump"
-	DefaultSATBaseDir      = DefaultExportDir + "/bee-sat"
+	DefaultExportDir        = "/appdata/bee/export"
+	DefaultAuditJSONPath    = DefaultExportDir + "/bee-audit.json"
+	DefaultAuditLogPath     = DefaultExportDir + "/bee-audit.log"
+	DefaultWebLogPath       = DefaultExportDir + "/bee-web.log"
+	DefaultNetworkLogPath   = DefaultExportDir + "/bee-network.log"
+	DefaultNvidiaLogPath    = DefaultExportDir + "/bee-nvidia.log"
+	DefaultSSHLogPath       = DefaultExportDir + "/bee-sshsetup.log"
+	DefaultRuntimeJSONPath  = DefaultExportDir + "/runtime-health.json"
+	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
+	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
+	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
+	DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
 )

 type App struct {
@@ -40,6 +41,8 @@ type App struct {
 	sat       satRunner
 	runtime   runtimeChecker
 	installer installer
+	// StatusDB is the unified component health store (nil if unavailable).
+	StatusDB *ComponentStatusDB
 }

 type ActionResult struct {
@@ -53,6 +56,10 @@ type networkManager interface {
 	DHCPOne(iface string) (string, error)
 	DHCPAll() (string, error)
 	SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
+	SetInterfaceState(iface string, up bool) error
+	GetInterfaceState(iface string) (bool, error)
+	CaptureNetworkSnapshot() (platform.NetworkSnapshot, error)
+	RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error
 }

 type serviceManager interface {
@@ -75,20 +82,56 @@ type toolManager interface {
 type installer interface {
 	ListInstallDisks() ([]platform.InstallDisk, error)
 	InstallToDisk(ctx context.Context, device string, logFile string) error
+	IsLiveMediaInRAM() bool
+	LiveBootSource() platform.LiveBootSource
+	RunInstallToRAM(ctx context.Context, logFunc func(string)) error
+}
+
+// GPUPresenceResult reports which GPU vendor, if any, was detected.
+type GPUPresenceResult struct {
+	Nvidia bool
+	AMD    bool
+}
+
+// DetectGPUPresence asks the SAT runner for the GPU vendor and maps it onto
+// per-vendor flags. Any vendor string other than "nvidia" or "amd" leaves
+// both flags false.
+func (a *App) DetectGPUPresence() GPUPresenceResult {
+	var presence GPUPresenceResult
+	switch a.sat.DetectGPUVendor() {
+	case "nvidia":
+		presence.Nvidia = true
+	case "amd":
+		presence.AMD = true
+	}
+	return presence
+}
+
+// IsLiveMediaInRAM returns the installer's IsLiveMediaInRAM result unchanged.
+func (a *App) IsLiveMediaInRAM() bool {
+	return a.installer.IsLiveMediaInRAM()
+}
+
+// LiveBootSource returns the installer's LiveBootSource result unchanged.
+func (a *App) LiveBootSource() platform.LiveBootSource {
+	return a.installer.LiveBootSource()
+}
+
+func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
+	return a.installer.RunInstallToRAM(ctx, logFunc)
 }

 type satRunner interface {
-	RunNvidiaAcceptancePack(baseDir string) (string, error)
-	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error)
-	RunMemoryAcceptancePack(baseDir string) (string, error)
-	RunStorageAcceptancePack(baseDir string) (string, error)
-	RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
+	RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
+	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
+	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
+	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
 	DetectGPUVendor() string
 	ListAMDGPUs() ([]platform.AMDGPUInfo, error)
-	RunAMDAcceptancePack(baseDir string) (string, error)
+	RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
+	RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
+	RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
-	RunNCCLTests(ctx context.Context, baseDir string) (string, error)
+	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
+	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 }

 type runtimeChecker interface {
@@ -97,7 +140,7 @@ type runtimeChecker interface {
 }

 func New(platform *platform.System) *App {
-	return &App{
+	a := &App{
 		network:   platform,
 		services:  platform,
 		exports:   platform,
@@ -106,6 +149,30 @@ func New(platform *platform.System) *App {
 		runtime:   platform,
 		installer: platform,
 	}
+	if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
+		a.StatusDB = db
+	}
+	return a
+}
+
+// ApplySATOverlay decodes and normalizes raw audit JSON, overlays the latest
+// SAT results found under DefaultSATBaseDir (no status DB is passed), and
+// returns the snapshot re-encoded as indented JSON. A decode failure is
+// returned as-is with a nil payload.
+// Used by the web UI to serve always-fresh status.
+func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
+	snapshot, err := readAuditSnapshot(auditJSON)
+	if err != nil {
+		return nil, err
+	}
+	hardware := &snapshot.Hardware
+	applyLatestSATStatuses(hardware, DefaultSATBaseDir, nil)
+	return json.MarshalIndent(snapshot, "", "  ")
+}
+
+func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
+	var snap schema.HardwareIngestRequest
+	if err := json.Unmarshal(auditJSON, &snap); err != nil {
+		return schema.HardwareIngestRequest{}, err
+	}
+	collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt)
+	return snap, nil
 }

 func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
@@ -115,7 +182,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		}
 	}
 	result := collector.Run(runtimeMode)
-	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
@@ -130,10 +197,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
-			return "", err
-		}
-		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -158,10 +222,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
-			return "", err
-		}
-		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -231,6 +292,9 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
 	if err != nil {
 		return "", err
 	}
+	if normalized, normErr := ApplySATOverlay(data); normErr == nil {
+		data = normalized
+	}
 	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
 		return "", err
 	}
@@ -301,6 +365,22 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
 	return a.network.SetStaticIPv4(cfg)
 }

+// SetInterfaceState forwards iface and up to the network manager's
+// SetInterfaceState and returns its error.
+func (a *App) SetInterfaceState(iface string, up bool) error {
+	return a.network.SetInterfaceState(iface, up)
+}
+
+// GetInterfaceState forwards iface to the network manager's
+// GetInterfaceState and returns its result unchanged.
+func (a *App) GetInterfaceState(iface string) (bool, error) {
+	return a.network.GetInterfaceState(iface)
+}
+
+// CaptureNetworkSnapshot returns the network manager's
+// CaptureNetworkSnapshot result unchanged.
+func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
+	return a.network.CaptureNetworkSnapshot()
+}
+
+// RestoreNetworkSnapshot forwards snapshot to the network manager's
+// RestoreNetworkSnapshot and returns its error.
+func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
+	return a.network.RestoreNetworkSnapshot(snapshot)
+}
+
 func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
 	body, err := a.network.SetStaticIPv4(cfg)
 	return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
@@ -416,15 +496,15 @@ func (a *App) AuditLogTailResult() ActionResult {
 	return ActionResult{Title: "Audit log tail", Body: body}
 }

-func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) {
+func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunNvidiaAcceptancePack(baseDir)
+	return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
 }

 func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
-	path, err := a.RunNvidiaAcceptancePack(baseDir)
+	path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
 	body := "Archive written."
 	if path != "" {
 		body = "Archive written to " + path
@@ -436,11 +516,11 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return a.sat.ListNvidiaGPUs()
 }

-func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (ActionResult, error) {
+func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices)
+	path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
 	body := "Archive written."
 	if path != "" {
 		body = "Archive written to " + path
@@ -448,39 +528,73 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
 	return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
 }

-func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
+// RunNvidiaStressPack calls RunNvidiaStressPackCtx with context.Background(),
+// so the run has no caller-controlled cancellation.
+func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
+	return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
+}
+
+// RunNvidiaBenchmark calls RunNvidiaBenchmarkCtx with context.Background(),
+// so the run has no caller-controlled cancellation.
+func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
+}
+
+// RunNvidiaBenchmarkCtx runs the SAT runner's NVIDIA benchmark, forwarding
+// ctx, opts and logFunc. A blank baseDir (empty or whitespace only) is
+// replaced with DefaultBenchmarkBaseDir.
+func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultBenchmarkBaseDir
+	}
+	return a.sat.RunNvidiaBenchmark(ctx, dir, opts, logFunc)
+}
+
+func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunMemoryAcceptancePack(baseDir)
+	return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
+}
+
+// RunMemoryAcceptancePack calls RunMemoryAcceptancePackCtx with
+// context.Background(), so the run has no caller-controlled cancellation.
+func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
+}
+
+func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
 }

 func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
-	path, err := a.RunMemoryAcceptancePack(baseDir)
+	path, err := a.RunMemoryAcceptancePack(baseDir, nil)
 	return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
 }

-func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
+// RunCPUAcceptancePack calls RunCPUAcceptancePackCtx with
+// context.Background(), so the run has no caller-controlled cancellation.
+func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
+}
+
+func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunCPUAcceptancePack(baseDir, durationSec)
+	return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
 }

 func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
-	path, err := a.RunCPUAcceptancePack(baseDir, durationSec)
+	path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
 	return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
 }

-func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
+// RunStorageAcceptancePack calls RunStorageAcceptancePackCtx with
+// context.Background(), so the run has no caller-controlled cancellation.
+func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
+}
+
+func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunStorageAcceptancePack(baseDir)
+	return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
 }

 func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
-	path, err := a.RunStorageAcceptancePack(baseDir)
+	path, err := a.RunStorageAcceptancePack(baseDir, nil)
 	return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
 }

@@ -492,18 +606,63 @@ func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
 	return a.sat.ListAMDGPUs()
 }

-func (a *App) RunAMDAcceptancePack(baseDir string) (string, error) {
+// RunAMDAcceptancePack calls RunAMDAcceptancePackCtx with
+// context.Background(), so the run has no caller-controlled cancellation.
+func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
+}
+
+func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunAMDAcceptancePack(baseDir)
+	return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
 }

 func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
-	path, err := a.RunAMDAcceptancePack(baseDir)
+	path, err := a.RunAMDAcceptancePack(baseDir, nil)
 	return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
 }

+// RunAMDMemIntegrityPackCtx runs the SAT runner's AMD memory-integrity pack,
+// forwarding ctx and logFunc. A blank baseDir (empty or whitespace only) is
+// replaced with DefaultSATBaseDir.
+func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDMemIntegrityPack(ctx, dir, logFunc)
+}
+
+// RunAMDMemBandwidthPackCtx runs the SAT runner's AMD memory-bandwidth pack,
+// forwarding ctx and logFunc. A blank baseDir (empty or whitespace only) is
+// replaced with DefaultSATBaseDir.
+func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDMemBandwidthPack(ctx, dir, logFunc)
+}
+
+// RunMemoryStressPack calls RunMemoryStressPackCtx with
+// context.Background(), so the run has no caller-controlled cancellation.
+func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
+}
+
+// RunSATStressPack calls RunSATStressPackCtx with context.Background(), so
+// the run has no caller-controlled cancellation.
+func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
+}
+
+// RunAMDStressPack calls RunAMDStressPackCtx with context.Background(), so
+// the run has no caller-controlled cancellation.
+func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
+}
+
+// RunMemoryStressPackCtx runs the SAT runner's memory stress pack, forwarding
+// ctx, durationSec and logFunc. A blank baseDir (empty or whitespace only) is
+// replaced with DefaultSATBaseDir, matching the other *Ctx pack runners on
+// App.
+func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunSATStressPackCtx runs the SAT runner's SAT stress pack, forwarding ctx,
+// durationSec and logFunc. A blank baseDir (empty or whitespace only) is
+// replaced with DefaultSATBaseDir, matching the other *Ctx pack runners on
+// App.
+func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunAMDStressPackCtx runs the SAT runner's AMD stress pack, forwarding ctx,
+// durationSec and logFunc. A blank baseDir (empty or whitespace only) is
+// replaced with DefaultSATBaseDir.
+func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDStressPack(ctx, dir, durationSec, logFunc)
+}
+
 func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -511,8 +670,15 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
 	return a.sat.RunFanStressTest(ctx, baseDir, opts)
 }

+// RunPlatformStress runs the SAT runner's platform stress test, forwarding
+// ctx, opts and logFunc. A blank baseDir (empty or whitespace only) is
+// replaced with DefaultSATBaseDir.
+func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunPlatformStress(ctx, dir, opts, logFunc)
+}
+
 func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
-	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir)
+	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
 	body := "Results: " + path
 	if err != nil && err != context.Canceled {
 		body += "\nERROR: " + err.Error()
@@ -597,6 +763,7 @@ func (a *App) HealthSummaryResult() ActionResult {
 	if err := json.Unmarshal(raw, &snapshot); err != nil {
 		return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
 	}
+	collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)

 	summary := collector.BuildHealthSummary(snapshot.Hardware)
 	var body strings.Builder
@@ -631,6 +798,7 @@ func (a *App) MainBanner() string {
 	if err := json.Unmarshal(raw, &snapshot); err != nil {
 		return ""
 	}
+	collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)

 	var lines []string
 	if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -43,6 +43,13 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error
 	return f.setStaticIPv4Fn(cfg)
 }

+// The methods below are inert stubs: they ignore their arguments and always
+// succeed, with GetInterfaceState reporting true and CaptureNetworkSnapshot
+// returning an empty snapshot.
+func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
+func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
+func (f fakeNetwork) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
+	return platform.NetworkSnapshot{}, nil
+}
+func (f fakeNetwork) RestoreNetworkSnapshot(platform.NetworkSnapshot) error { return nil }
+
 type fakeServices struct {
 	serviceStatusFn func(string) (string, error)
 	serviceDoFn     func(string, platform.ServiceAction) (string, error)
@@ -113,21 +120,37 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 }

 type fakeSAT struct {
-	runNvidiaFn      func(string) (string, error)
-	runMemoryFn      func(string) (string, error)
-	runStorageFn     func(string) (string, error)
-	runCPUFn         func(string, int) (string, error)
-	detectVendorFn   func() string
-	listAMDGPUsFn    func() ([]platform.AMDGPUInfo, error)
-	runAMDPackFn     func(string) (string, error)
-	listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
+	runNvidiaFn          func(string) (string, error)
+	runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaStressFn    func(string, platform.NvidiaStressOptions) (string, error)
+	runMemoryFn          func(string) (string, error)
+	runStorageFn         func(string) (string, error)
+	runCPUFn             func(string, int) (string, error)
+	detectVendorFn       func() string
+	listAMDGPUsFn        func() ([]platform.AMDGPUInfo, error)
+	runAMDPackFn         func(string) (string, error)
+	listNvidiaGPUsFn     func() ([]platform.NvidiaGPU, error)
 }

-func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
+func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
 	return f.runNvidiaFn(baseDir)
 }

-func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int) (string, error) {
+func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int, _ func(string)) (string, error) {
+	return f.runNvidiaFn(baseDir)
+}
+
+// RunNvidiaBenchmark calls runNvidiaBenchmarkFn when the test supplied one and
+// otherwise falls back to runNvidiaFn; the context and log callback are ignored.
+func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
+	if f.runNvidiaBenchmarkFn != nil {
+		return f.runNvidiaBenchmarkFn(baseDir, opts)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
+func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
+	if f.runNvidiaStressFn != nil {
+		return f.runNvidiaStressFn(baseDir, opts)
+	}
 	return f.runNvidiaFn(baseDir)
 }

@@ -138,15 +161,15 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return nil, nil
 }

-func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
+func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
 	return f.runMemoryFn(baseDir)
 }

-func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
+func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
 	return f.runStorageFn(baseDir)
 }

-func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
+func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
 	if f.runCPUFn != nil {
 		return f.runCPUFn(baseDir, durationSec)
 	}
@@ -167,18 +190,40 @@ func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
 	return nil, nil
 }

-func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
+func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
 	if f.runAMDPackFn != nil {
 		return f.runAMDPackFn(baseDir)
 	}
 	return "", nil
 }

+// The AMD memory/stress and generic stress-pack methods below are inert stubs:
+// each ignores all arguments and returns an empty path with a nil error.
+func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
+	return "", nil
+}
+
+func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
+	return "", nil
+}
+
+func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
+	return "", nil
+}
+func (f fakeSAT) RunMemoryStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
+	return "", nil
+}
+func (f fakeSAT) RunSATStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
+	return "", nil
+}
+
 func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
 	return "", nil
 }

-func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
+// RunPlatformStress is an inert stub: it ignores all arguments and returns an
+// empty path with a nil error.
+func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
+	return "", nil
+}
+
+func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
 	return "", nil
 }

@@ -574,13 +619,13 @@ func TestRunSATDefaultsToExportDir(t *testing.T) {
 		},
 	}

-	if _, err := a.RunNvidiaAcceptancePack(""); err != nil {
+	if _, err := a.RunNvidiaAcceptancePack("", nil); err != nil {
 		t.Fatal(err)
 	}
-	if _, err := a.RunMemoryAcceptancePack(""); err != nil {
+	if _, err := a.RunMemoryAcceptancePack("", nil); err != nil {
 		t.Fatal(err)
 	}
-	if _, err := a.RunStorageAcceptancePack(""); err != nil {
+	if _, err := a.RunStorageAcceptancePack("", nil); err != nil {
 		t.Fatal(err)
 	}
 }
@@ -623,13 +668,50 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
 	}
 }

+// TestApplySATOverlayFiltersIgnoredLegacyDevices feeds ApplySATOverlay an audit
+// containing a virtual disk and a co-processor next to a real disk and a real GPU,
+// and checks that only the real devices survive in the overlaid JSON.
+func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
+	tmp := t.TempDir()
+	// Point the SAT base dir at an empty temp location so no SAT summaries from
+	// the host influence the overlay; restored when the test ends.
+	oldSATBaseDir := DefaultSATBaseDir
+	DefaultSATBaseDir = filepath.Join(tmp, "sat")
+	t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
+
+	raw := `{
+	  "collected_at": "2026-03-15T10:00:00Z",
+	  "hardware": {
+	    "board": {"serial_number": "SRV123"},
+	    "storage": [
+	      {"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
+	      {"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
+	    ],
+	    "pcie_devices": [
+	      {"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
+	      {"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
+	    ]
+	  }
+	}`
+
+	got, err := ApplySATOverlay([]byte(raw))
+	if err != nil {
+		t.Fatalf("ApplySATOverlay error: %v", err)
+	}
+	text := string(got)
+	if contains(text, "Virtual HDisk0") {
+		t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
+	}
+	// Match on the bare class and model strings rather than an exact
+	// `"key": "value"` spelling, so the check cannot pass vacuously when the
+	// output JSON is compact instead of indented.
+	if contains(text, "Co-processor") || contains(text, "402xx Series QAT") {
+		t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
+	}
+	if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
+		t.Fatalf("overlaid audit should keep real devices:\n%s", text)
+	}
+}
+
 func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	tmp := t.TempDir()
 	exportDir := filepath.Join(tmp, "export")
 	if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
 		t.Fatal(err)
 	}
-	if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
+	if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
 		t.Fatal(err)
 	}
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
@@ -661,6 +743,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {

 	tr := tar.NewReader(gzr)
 	var names []string
+	var auditJSON string
 	for {
 		hdr, err := tr.Next()
 		if errors.Is(err, io.EOF) {
@@ -670,6 +753,33 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 			t.Fatalf("read tar entry: %v", err)
 		}
 		names = append(names, hdr.Name)
+		if contains(hdr.Name, "/export/bee-audit.json") {
+			body, err := io.ReadAll(tr)
+			if err != nil {
+				t.Fatalf("read audit entry: %v", err)
+			}
+			auditJSON = string(body)
+		}
+	}
+
+	for _, want := range []string{
+		"/system/ip-link.txt",
+		"/system/ip-link-stats.txt",
+		"/system/ethtool-info.txt",
+		"/system/ethtool-link.txt",
+		"/system/ethtool-module.txt",
+		"/system/mstflint-query.txt",
+	} {
+		var found bool
+		for _, name := range names {
+			if contains(name, want) {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Fatalf("support bundle missing %s, names=%v", want, names)
+		}
 	}

 	var foundRaw bool
@@ -684,6 +794,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if !foundRaw {
 		t.Fatalf("support bundle missing raw SAT log, names=%v", names)
 	}
+	if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
+		t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
+	}
+	if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
+		t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
+	}
 }

 func TestMainBanner(t *testing.T) {
@@ -697,6 +813,10 @@ func TestMainBanner(t *testing.T) {
 	product := "PowerEdge R760"
 	cpuModel := "Intel Xeon Gold 6430"
 	memoryType := "DDR5"
+	memorySerialA := "DIMM-A"
+	memorySerialB := "DIMM-B"
+	storageSerialA := "DISK-A"
+	storageSerialB := "DISK-B"
 	gpuClass := "VideoController"
 	gpuModel := "NVIDIA H100"

@@ -712,12 +832,12 @@ func TestMainBanner(t *testing.T) {
 				{Model: &cpuModel},
 			},
 			Memory: []schema.HardwareMemory{
-				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
-				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
+				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
+				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
 			},
 			Storage: []schema.HardwareStorage{
-				{Present: &trueValue, SizeGB: intPtr(3840)},
-				{Present: &trueValue, SizeGB: intPtr(3840)},
+				{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
+				{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
 			},
 			PCIeDevices: []schema.HardwarePCIeDevice{
 				{DeviceClass: &gpuClass, Model: &gpuModel},
--- a/audit/internal/app/atomic_write.go
+++ b/audit/internal/app/atomic_write.go
@@ -0,0 +1,48 @@
+package app
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// atomicWriteFile replaces the file at path with data without ever exposing a
+// partially written target: the bytes go to "<path>.tmp", are synced and closed,
+// and the temp file is then renamed over path. The parent directory is created
+// if needed (mode 0755) and synced on a best-effort basis after the rename.
+// perm is the mode passed when creating the temp file. On any failure after the
+// temp file was opened, the temp file is removed and a wrapped error is returned.
+func atomicWriteFile(path string, data []byte, perm os.FileMode) (err error) {
+	parent := filepath.Dir(path)
+	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
+		return fmt.Errorf("mkdir %s: %w", parent, mkErr)
+	}
+
+	staging := path + ".tmp"
+	out, openErr := os.OpenFile(staging, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
+	if openErr != nil {
+		return fmt.Errorf("open temp %s: %w", staging, openErr)
+	}
+
+	// Cleanup keyed on the returned error: every failure path below returns
+	// non-nil, and only then is the staging file removed.
+	defer func() {
+		// On the success path the file is already closed; the second Close
+		// only yields an error that is deliberately discarded.
+		_ = out.Close()
+		if err != nil {
+			_ = os.Remove(staging)
+		}
+	}()
+
+	if _, wErr := out.Write(data); wErr != nil {
+		return fmt.Errorf("write temp %s: %w", staging, wErr)
+	}
+	if sErr := out.Sync(); sErr != nil {
+		return fmt.Errorf("sync temp %s: %w", staging, sErr)
+	}
+	if cErr := out.Close(); cErr != nil {
+		return fmt.Errorf("close temp %s: %w", staging, cErr)
+	}
+	if rErr := os.Rename(staging, path); rErr != nil {
+		return fmt.Errorf("rename %s -> %s: %w", staging, path, rErr)
+	}
+
+	// Best effort: flush the directory entry so the rename itself is durable.
+	if d, dErr := os.Open(parent); dErr == nil {
+		_ = d.Sync()
+		_ = d.Close()
+	}
+	return nil
+}
--- a/audit/internal/app/atomic_write_test.go
+++ b/audit/internal/app/atomic_write_test.go
@@ -0,0 +1,71 @@
+package app
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"bee/audit/internal/schema"
+)
+
+// TestAtomicWriteFileReplacesTargetWithoutLeavingTmp seeds an existing target
+// file, overwrites it through atomicWriteFile, and checks both that the new
+// content is in place and that the "<path>.tmp" staging file is gone.
+func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "bee-audit.json")
+	if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
+		t.Fatalf("seed file: %v", err)
+	}
+
+	if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
+		t.Fatalf("atomicWriteFile: %v", err)
+	}
+
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read final: %v", err)
+	}
+	if string(raw) != "new\n" {
+		t.Fatalf("final content=%q want %q", string(raw), "new\n")
+	}
+	// The staging file must not survive a successful write.
+	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
+		t.Fatalf("tmp file should be absent after success, err=%v", err)
+	}
+}
+
+// TestRunRuntimePreflightWritesAtomically runs RunRuntimePreflight against a
+// "file:<path>" destination with a fake runtime collector, then checks that the
+// returned path matches, no "<path>.tmp" staging file is left behind, and the
+// written file decodes to a RuntimeHealth with status "OK".
+func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "runtime-health.json")
+	a := &App{
+		runtime: fakeRuntime{
+			collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
+				return schema.RuntimeHealth{
+					Status:      "OK",
+					ExportDir:   exportDir,
+					DriverReady: true,
+					CUDAReady:   true,
+				}, nil
+			},
+		},
+	}
+
+	got, err := a.RunRuntimePreflight("file:" + path)
+	if err != nil {
+		t.Fatalf("RunRuntimePreflight: %v", err)
+	}
+	if got != path {
+		t.Fatalf("path=%q want %q", got, path)
+	}
+	// The staging file must not survive a successful write.
+	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
+		t.Fatalf("tmp file should be absent after success, err=%v", err)
+	}
+
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read runtime file: %v", err)
+	}
+	var health schema.RuntimeHealth
+	if err := json.Unmarshal(raw, &health); err != nil {
+		t.Fatalf("json unmarshal: %v", err)
+	}
+	if health.Status != "OK" {
+		t.Fatalf("status=%q want OK", health.Status)
+	}
+}
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -0,0 +1,266 @@
+package app
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+)
+
+// ComponentStatusDB is a persistent store of hardware component health records,
+// kept in memory and written back to a JSON file at path after every Record call.
+// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
+// Each record's history only grows, and its status only moves to a higher
+// severity: once a component is marked Warning or Critical, subsequent OK
+// entries do not downgrade it.
+// NOTE(review): no reset/clear operation is defined alongside this type here —
+// confirm how an elevated status is meant to be cleared.
+type ComponentStatusDB struct {
+	path    string     // location of the JSON file backing the store
+	mu      sync.Mutex // guards records and serialises saves
+	records map[string]*ComponentStatusRecord
+}
+
+// ComponentStatusRecord holds the current and historical health of one hardware component.
+// Status is the highest severity recorded so far, LastCheckedAt is the time of the
+// most recent observation, LastChangedAt is when Status last changed, ErrorSummary
+// is the detail text of the observation that set the current Status, and History
+// lists every observation in the order it was recorded.
+type ComponentStatusRecord struct {
+	ComponentKey  string                 `json:"component_key"`
+	Status        string                 `json:"status"` // "OK", "Warning", "Critical", "Unknown"
+	LastCheckedAt time.Time              `json:"last_checked_at"`
+	LastChangedAt time.Time              `json:"last_changed_at"`
+	ErrorSummary  string                 `json:"error_summary,omitempty"`
+	History       []ComponentStatusEntry `json:"history"`
+}
+
+// ComponentStatusEntry is one observation written to a component's history.
+// At is the observation time and Status the reported status string.
+type ComponentStatusEntry struct {
+	At     time.Time `json:"at"`
+	Status string    `json:"status"`
+	Source string    `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
+	Detail string    `json:"detail,omitempty"`
+}
+
+// OpenComponentStatusDB returns a status DB backed by the JSON file at path.
+// The parent directory is created if missing. A missing or empty file yields an
+// empty DB; a file that does not decode as a record list is also treated as
+// empty rather than reported as an error. Only directory-creation and read
+// failures other than "not exist" are returned.
+func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return nil, err
+	}
+	raw, err := os.ReadFile(path)
+	if err != nil && !os.IsNotExist(err) {
+		return nil, err
+	}
+
+	db := &ComponentStatusDB{path: path, records: make(map[string]*ComponentStatusRecord)}
+	if len(raw) == 0 {
+		return db, nil
+	}
+	var loaded []ComponentStatusRecord
+	if json.Unmarshal(raw, &loaded) != nil {
+		// Undecodable content: start from an empty DB.
+		return db, nil
+	}
+	for i := range loaded {
+		// Index into the slice so each map entry points at its own element.
+		db.records[loaded[i].ComponentKey] = &loaded[i]
+	}
+	return db, nil
+}
+
+// Record appends one observation for the component identified by key and
+// persists the DB.
+// source is a short label like "sat:nvidia" or "watchdog:kmsg".
+// status is "OK", "Warning", "Critical", or "Unknown".
+// The record's status is replaced only by a strictly more severe one, so OK
+// never downgrades an existing Warning or Critical; a record that has no status
+// yet adopts whatever status is given. Calls on a nil DB or with a blank key
+// are ignored.
+func (db *ComponentStatusDB) Record(key, source, status, detail string) {
+	if db == nil || strings.TrimSpace(key) == "" {
+		return
+	}
+	db.mu.Lock()
+	defer db.mu.Unlock()
+
+	ts := time.Now().UTC()
+	rec, known := db.records[key]
+	if !known {
+		rec = &ComponentStatusRecord{ComponentKey: key}
+		db.records[key] = rec
+	}
+	rec.LastCheckedAt = ts
+	rec.History = append(rec.History, ComponentStatusEntry{
+		At:     ts,
+		Status: status,
+		Source: source,
+		Detail: detail,
+	})
+
+	switch {
+	case componentSeverity(status) > componentSeverity(rec.Status):
+		// Escalation: the new observation becomes the headline status.
+		rec.Status = status
+		rec.LastChangedAt = ts
+		rec.ErrorSummary = detail
+	case rec.Status == "":
+		// First observation whose status carries no severity rank.
+		rec.Status = status
+		rec.LastChangedAt = ts
+	}
+
+	// Persisting is best effort; the in-memory state stays authoritative.
+	_ = db.saveLocked()
+}
+
+// Get looks up the record stored under key and returns a copy of it.
+// The boolean is false when db is nil or no record exists for key.
+func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
+	if db == nil {
+		return ComponentStatusRecord{}, false
+	}
+	db.mu.Lock()
+	defer db.mu.Unlock()
+	if rec, found := db.records[key]; found {
+		return *rec, true
+	}
+	return ComponentStatusRecord{}, false
+}
+
+// All returns copies of every record currently held, in map iteration order
+// (i.e. unspecified). A nil DB yields nil.
+func (db *ComponentStatusDB) All() []ComponentStatusRecord {
+	if db == nil {
+		return nil
+	}
+	db.mu.Lock()
+	defer db.mu.Unlock()
+	snapshot := make([]ComponentStatusRecord, len(db.records))
+	next := 0
+	for _, rec := range db.records {
+		snapshot[next] = *rec
+		next++
+	}
+	return snapshot
+}
+
+// saveLocked serialises all records as indented JSON and writes them to db.path.
+// The caller must hold db.mu.
+// The write goes through atomicWriteFile (temp file + rename) so that an
+// interrupted save cannot leave a truncated file behind; OpenComponentStatusDB
+// treats undecodable content as an empty DB, which would drop all history.
+func (db *ComponentStatusDB) saveLocked() error {
+	records := make([]ComponentStatusRecord, 0, len(db.records))
+	for _, r := range db.records {
+		records = append(records, *r)
+	}
+	data, err := json.MarshalIndent(records, "", "  ")
+	if err != nil {
+		return err
+	}
+	return atomicWriteFile(db.path, data, 0644)
+}
+
+// componentSeverity ranks a status string so that a larger number means a more
+// severe state: "OK" is 1, "Warning" 2, "Critical" 3. Surrounding whitespace is
+// ignored; anything else (including "Unknown" and "") ranks 0.
+func componentSeverity(status string) int {
+	ascending := [...]string{"OK", "Warning", "Critical"}
+	name := strings.TrimSpace(status)
+	for i, candidate := range ascending {
+		if name == candidate {
+			return i + 1
+		}
+	}
+	return 0
+}
+
+// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
+// and writes component status records to db for the given SAT target.
+// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
+// The run directory is the archive path with its ".tar.gz" suffix removed.
+// Nothing is recorded when db is nil, the path is empty, summary.txt cannot be
+// read, the summary has no overall_status, or target is not one of the targets
+// handled in the switch below.
+func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
+	if db == nil || strings.TrimSpace(archivePath) == "" {
+		return
+	}
+	archivePath = extractArchivePath(archivePath)
+	if archivePath == "" {
+		return
+	}
+	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
+	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
+	if err != nil {
+		return
+	}
+	kv := parseSATKV(string(data))
+	overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
+	if overall == "" {
+		return
+	}
+
+	source := "sat:" + target
+	dbStatus := satStatusToDBStatus(overall)
+
+	// Map SAT target to component keys.
+	switch target {
+	case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
+		// NOTE(review): the key carries the SAT target name, not a PCIe address —
+		// confirm that consumers of "pcie:" keys map it onto actual devices.
+		db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
+	case "memory", "memory-stress", "sat-stress":
+		db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
+	case "cpu", "platform-stress":
+		db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
+	case "storage":
+		// Try to record per-device if available in summary.
+		// Every "<name>_<rest>_status" key other than overall_status is treated
+		// as a per-device result, with <name> being the text before the first
+		// underscore. Keys without an underscore before "_status" are skipped.
+		recordedAny := false
+		for key, val := range kv {
+			if !strings.HasSuffix(key, "_status") || key == "overall_status" {
+				continue
+			}
+			base := strings.TrimSuffix(key, "_status")
+			idx := strings.Index(base, "_")
+			if idx <= 0 {
+				continue
+			}
+			devName := base[:idx]
+			devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
+			db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
+			recordedAny = true
+		}
+		// No per-device lines: fall back to one record covering all storage.
+		if !recordedAny {
+			db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
+		}
+	}
+}
+
+// satStatusToDBStatus translates an upper-case SAT status into a DB status:
+// "OK" stays "OK", "FAILED" becomes "Warning", and everything else — including
+// "PARTIAL" and "UNSUPPORTED" — becomes "Unknown". The input is compared as-is.
+func satStatusToDBStatus(overall string) string {
+	if overall == "OK" {
+		return "OK"
+	}
+	if overall == "FAILED" {
+		return "Warning"
+	}
+	return "Unknown"
+}
+
+// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
+// "Archive written to /path/foo.tar.gz" or already a bare path.
+// It is the exported entry point for extractArchivePath and adds no logic of its own.
+func ExtractArchivePath(s string) string {
+	return extractArchivePath(s)
+}
+
+// ReadSATOverallStatus reads the overall_status value from the summary.txt
+// file located in the run directory alongside archivePath, and returns it
+// trimmed and upper-cased.
+// archivePath may be a bare .tar.gz path or "Archive written to /path/foo.tar.gz";
+// it is normalised with extractArchivePath first, the same way
+// ApplySATResultToDB treats its input.
+// Returns "" if the path is empty or the file cannot be read.
+func ReadSATOverallStatus(archivePath string) string {
+	archivePath = extractArchivePath(archivePath)
+	if archivePath == "" {
+		return ""
+	}
+	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
+	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
+	if err != nil {
+		return ""
+	}
+	kv := parseSATKV(string(data))
+	return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
+}
+
+// extractArchivePath trims s and, when the result ends in ".tar.gz", returns
+// only its last whitespace-separated field (so a message such as
+// "Archive written to /path/foo.tar.gz" yields "/path/foo.tar.gz").
+// Any other input is returned trimmed but otherwise unchanged.
+func extractArchivePath(s string) string {
+	trimmed := strings.TrimSpace(s)
+	if !strings.HasSuffix(trimmed, ".tar.gz") {
+		return trimmed
+	}
+	fields := strings.Fields(trimmed)
+	if len(fields) == 0 {
+		return trimmed
+	}
+	return fields[len(fields)-1]
+}
+
+func parseSATKV(raw string) map[string]string {
+	kv := make(map[string]string)
+	for _, line := range strings.Split(raw, "\n") {
+		k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
+		if ok {
+			kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
+		}
+	}
+	return kv
+}
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -9,7 +9,7 @@ import (
 	"bee/audit/internal/schema"
 )

-func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
+func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
 	if snap == nil || strings.TrimSpace(baseDir) == "" {
 		return
 	}
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
 	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
 		applyStorageSAT(snap.Storage, summary)
 	}
+	// Apply unified component status DB — overlaid last so it can only upgrade severity.
+	applyComponentStatusDB(snap, db)
 }

 type satSummary struct {
@@ -141,9 +143,11 @@ func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
 func satKeyStatus(rawStatus, label string) (string, string, bool) {
 	switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
 	case "OK":
-		return "OK", label + " passed", true
+		// No error description on success — error_description is for problems only.
+		return "OK", "", true
 	case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
-		return "Warning", label + " incomplete", true
+		// Tool couldn't run or test was incomplete — we can't assert hardware health.
+		return "Unknown", "", true
 	case "FAILED":
 		return "Critical", label + " failed", true
 	default:
@@ -180,6 +184,8 @@ func statusSeverity(status string) int {
 		return 2
 	case "OK":
 		return 1
+	case "Unknown":
+		return 1 // same as OK — does not override OK from another source
 	default:
 		return 0
 	}
@@ -202,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
 	}
 }

+func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
+	if snap == nil || db == nil {
+		return
+	}
+	for _, rec := range db.All() {
+		key := rec.ComponentKey
+		status := dbStatusToSATStatus(rec.Status)
+		if status == "" {
+			continue
+		}
+		detail := rec.ErrorSummary
+		ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
+
+		switch {
+		case strings.HasPrefix(key, "pcie:"):
+			bdf := strings.TrimPrefix(key, "pcie:")
+			bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
+			// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
+			if sanitizeBDFForLookup(bdf) == "" {
+				break
+			}
+			normalized := sanitizeBDFForLookup(bdf)
+			for i := range snap.PCIeDevices {
+				if snap.PCIeDevices[i].BDF == nil {
+					continue
+				}
+				if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
+					mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
+				}
+			}
+		case strings.HasPrefix(key, "storage:"):
+			devName := strings.TrimPrefix(key, "storage:")
+			if devName == "all" {
+				for i := range snap.Storage {
+					mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
+				}
+			} else {
+				for i := range snap.Storage {
+					linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
+					if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
+						mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
+					}
+				}
+			}
+		case strings.HasPrefix(key, "memory:"):
+			for i := range snap.Memory {
+				mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
+			}
+		case strings.HasPrefix(key, "cpu:"):
+			for i := range snap.CPUs {
+				mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
+			}
+		}
+	}
+}
+
+// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
+// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
+// Surrounding whitespace is ignored and the canonical, trimmed value is
+// returned; any other input yields "".
+func dbStatusToSATStatus(s string) string {
+	// Return the trimmed form, not the raw input: the value that was validated
+	// must be the value handed on, otherwise " OK " would pass through padded.
+	switch trimmed := strings.TrimSpace(s); trimmed {
+	case "OK", "Warning", "Critical", "Unknown":
+		return trimmed
+	default:
+		return ""
+	}
+}
+
+// sanitizeBDFForLookup canonicalises a PCIe BDF address for comparison: the
+// input is trimmed and lower-cased, and a short form with a single colon gets
+// the "0000:" domain prefixed ("c8:00.0" → "0000:c8:00.0"). Empty input, the
+// literal "gpu", and anything containing an inner space or tab yield "".
+// All other strings are returned in their trimmed, lower-cased form.
+func sanitizeBDFForLookup(bdf string) string {
+	addr := strings.ToLower(strings.TrimSpace(bdf))
+	switch {
+	case addr == "", addr == "gpu", strings.ContainsAny(addr, " \t"):
+		return ""
+	case strings.Count(addr, ":") == 1:
+		return "0000:" + addr
+	}
+	return addr
+}
+
 func ptrString(v *string) string {
 	if v == nil {
 		return ""
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
 	usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
 	snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}

-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)

 	if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
 		t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
 		}},
 	}

-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)

 	if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
 		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
 	"bee-network.service",
 	"bee-nvidia.service",
 	"bee-preflight.service",
+	"bee-selfheal.service",
+	"bee-selfheal.timer",
 	"bee-sshsetup.service",
 }

@@ -27,15 +29,118 @@ var supportBundleCommands = []struct {
 	cmd  []string
 }{
 	{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
+	{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
 	{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
 	{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
+	{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
 	{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
+	{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
+	{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
 	{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
-	{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
+	{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
+	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
+	{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
+for d in /sys/bus/pci/devices/*/; do
+  vendor=$(cat "$d/vendor" 2>/dev/null)
+  [ "$vendor" = "0x10de" ] || continue
+  dev=$(basename "$d")
+  echo "=== $dev ==="
+  for f in current_link_speed current_link_width max_link_speed max_link_width; do
+    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
+  done
+done
+`}},
+	{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
+if ! command -v ethtool >/dev/null 2>&1; then
+  echo "ethtool not found"
+  exit 0
+fi
+found=0
+for path in /sys/class/net/*; do
+  [ -e "$path" ] || continue
+  iface=$(basename "$path")
+  [ "$iface" = "lo" ] && continue
+  found=1
+  echo "=== $iface ==="
+  ethtool -i "$iface" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no interfaces found"
+fi
+`}},
+	{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
+if ! command -v ethtool >/dev/null 2>&1; then
+  echo "ethtool not found"
+  exit 0
+fi
+found=0
+for path in /sys/class/net/*; do
+  [ -e "$path" ] || continue
+  iface=$(basename "$path")
+  [ "$iface" = "lo" ] && continue
+  found=1
+  echo "=== $iface ==="
+  ethtool "$iface" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no interfaces found"
+fi
+`}},
+	{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
+if ! command -v ethtool >/dev/null 2>&1; then
+  echo "ethtool not found"
+  exit 0
+fi
+found=0
+for path in /sys/class/net/*; do
+  [ -e "$path" ] || continue
+  iface=$(basename "$path")
+  [ "$iface" = "lo" ] && continue
+  found=1
+  echo "=== $iface ==="
+  ethtool -m "$iface" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no interfaces found"
+fi
+`}},
+	{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
+if ! command -v mstflint >/dev/null 2>&1; then
+  echo "mstflint not found"
+  exit 0
+fi
+found=0
+for path in /sys/bus/pci/devices/*; do
+  [ -e "$path/vendor" ] || continue
+  vendor=$(cat "$path/vendor" 2>/dev/null)
+  [ "$vendor" = "0x15b3" ] || continue
+  bdf=$(basename "$path")
+  found=1
+  echo "=== $bdf ==="
+  mstflint -d "$bdf" q 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no Mellanox/NVIDIA networking devices found"
+fi
+`}},
 }

+// supportBundleOptionalFiles lists host log files that BuildSupportBundle tries
+// to copy into the bundle staging directory. name is the destination relative
+// to the staging root and src the absolute source path; copy errors for these
+// entries are ignored by the caller.
+var supportBundleOptionalFiles = []struct {
+	name string
+	src  string
+}{
+	{name: "system/kern.log", src: "/var/log/kern.log"},
+	{name: "system/syslog.txt", src: "/var/log/syslog"},
+}
+
+const supportBundleGlob = "bee-support-*.tar.gz"
+
 func BuildSupportBundle(exportDir string) (string, error) {
 	exportDir = strings.TrimSpace(exportDir)
 	if exportDir == "" {
@@ -75,6 +180,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
 			return "", err
 		}
 	}
+	for _, item := range supportBundleOptionalFiles {
+		_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
+	}
 	if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
 		return "", err
 	}
@@ -86,34 +194,64 @@ func BuildSupportBundle(exportDir string) (string, error) {
 	return archivePath, nil
 }

+// LatestSupportBundlePath returns the most recently modified support bundle
+// archive found in the OS temporary directory. It returns os.ErrNotExist when
+// no archive matches.
+func LatestSupportBundlePath() (string, error) {
+	return latestSupportBundlePath(os.TempDir())
+}
+
 func cleanupOldSupportBundles(dir string) error {
-	matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
+	matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
 	if err != nil {
 		return err
 	}
-	type entry struct {
-		path string
-		mod  time.Time
+	entries := supportBundleEntries(matches)
+	for path, mod := range entries {
+		if time.Since(mod) > 24*time.Hour {
+			_ = os.Remove(path)
+			delete(entries, path)
+		}
 	}
-	list := make([]entry, 0, len(matches))
+	ordered := orderSupportBundles(entries)
+	if len(ordered) > 3 {
+		for _, old := range ordered[3:] {
+			_ = os.Remove(old)
+		}
+	}
+	return nil
+}
+
+// latestSupportBundlePath globs dir for support bundle archives and returns
+// the first entry of the ordered result. os.ErrNotExist is returned when no
+// archive could be found or stat'ed.
+func latestSupportBundlePath(dir string) (string, error) {
+	pattern := filepath.Join(dir, supportBundleGlob)
+	found, err := filepath.Glob(pattern)
+	if err != nil {
+		return "", err
+	}
+	if bundles := orderSupportBundles(supportBundleEntries(found)); len(bundles) > 0 {
+		return bundles[0], nil
+	}
+	return "", os.ErrNotExist
+}
+
+func supportBundleEntries(matches []string) map[string]time.Time {
+	entries := make(map[string]time.Time, len(matches))
 	for _, match := range matches {
 		info, err := os.Stat(match)
 		if err != nil {
 			continue
 		}
-		if time.Since(info.ModTime()) > 24*time.Hour {
-			_ = os.Remove(match)
-			continue
-		}
-		list = append(list, entry{path: match, mod: info.ModTime()})
+		entries[match] = info.ModTime()
 	}
-	sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
-	if len(list) > 3 {
-		for _, old := range list[3:] {
-			_ = os.Remove(old.path)
-		}
+	return entries
+}
+
+func orderSupportBundles(entries map[string]time.Time) []string {
+	ordered := make([]string, 0, len(entries))
+	for path := range entries {
+		ordered = append(ordered, path)
 	}
-	return nil
+	sort.Slice(ordered, func(i, j int) bool {
+		return entries[ordered[i]].After(entries[ordered[j]])
+	})
+	return ordered
 }

 func writeJournalDump(dst string) error {
@@ -152,6 +290,24 @@ func writeCommandOutput(dst string, cmd []string) error {
 	return os.WriteFile(dst, raw, 0644)
 }

+// copyOptionalFile copies the file at src to dst, creating dst's parent
+// directory when needed. It returns the first error from opening, creating,
+// copying, or closing the destination.
+func copyOptionalFile(src, dst string) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
+		return err
+	}
+	out, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	if _, err := io.Copy(out, in); err != nil {
+		// The copy error is the one worth reporting; Close is cleanup only.
+		_ = out.Close()
+		return err
+	}
+	// Close can surface a deferred write failure, so its error is returned
+	// instead of being dropped by a defer.
+	return out.Close()
+}
+
 func writeManifest(dst, exportDir, stageRoot string) error {
 	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
 		return err
@@ -215,7 +371,7 @@ func copyDirContents(srcDir, dstDir string) error {
 }

 func copyExportDirForSupportBundle(srcDir, dstDir string) error {
-	return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
+	if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
 		cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
 		if cleanRel == "" {
 			return true
@@ -227,7 +383,25 @@ func copyExportDirForSupportBundle(srcDir, dstDir string) error {
 			return false
 		}
 		return true
-	})
+	}); err != nil {
+		return err
+	}
+	return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
+}
+
+// normalizeSupportBundleAuditJSON replaces the file at path with the output of
+// ApplySATOverlay applied to its current contents. A missing file is not an
+// error; other read errors and the final write error are returned.
+func normalizeSupportBundleAuditJSON(path string) error {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil
+		}
+		return err
+	}
+	normalized, err := ApplySATOverlay(data)
+	if err != nil {
+		// An overlay failure is swallowed and the copied file is left as is.
+		return nil
+	}
+	return os.WriteFile(path, normalized, 0644)
 }

 func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
--- a/audit/internal/collector/finalize.go
+++ b/audit/internal/collector/finalize.go
@@ -1,10 +1,18 @@
 package collector

-import "bee/audit/internal/schema"
+import (
+	"bee/audit/internal/schema"
+	"strings"
+)
+
+// NormalizeSnapshot is the exported entry point to finalizeSnapshot; snap and
+// collectedAt are passed through unchanged and snap is modified in place.
+func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
+	finalizeSnapshot(snap, collectedAt)
+}

 func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
 	snap.Memory = filterMemory(snap.Memory)
 	snap.Storage = filterStorage(snap.Storage)
+	snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
 	snap.PowerSupplies = filterPSUs(snap.PowerSupplies)

 	setComponentStatusMetadata(snap, collectedAt)
@@ -33,11 +41,25 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
 		if disk.SerialNumber == nil || *disk.SerialNumber == "" {
 			continue
 		}
+		if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
+			continue
+		}
 		out = append(out, disk)
 	}
 	return out
 }

+// filterPCIe returns the devices whose DeviceClass is unset or does not
+// contain "co-processor" (compared case-insensitively after trimming).
+func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
+	kept := make([]schema.HardwarePCIeDevice, 0, len(devs))
+	for i := range devs {
+		if class := devs[i].DeviceClass; class != nil {
+			normalized := strings.ToLower(strings.TrimSpace(*class))
+			if strings.Contains(normalized, "co-processor") {
+				continue
+			}
+		}
+		kept = append(kept, devs[i])
+	}
+	return kept
+}
+
 func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
 	out := make([]schema.HardwarePowerSupply, 0, len(psus))
 	for _, psu := range psus {
--- a/audit/internal/collector/finalize_test.go
+++ b/audit/internal/collector/finalize_test.go
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
 	present := true
 	status := statusOK
 	serial := "SN-1"
+	virtualModel := "Virtual HDisk1"
+	realModel := "PASCARI"
+	coProcessorClass := "Co-processor"
+	gpuClass := "VideoController"

 	snap := schema.HardwareSnapshot{
 		Memory: []schema.HardwareMemory{
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
 			{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 		},
 		Storage: []schema.HardwareStorage{
+			{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
+			{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 		},
+		PCIeDevices: []schema.HardwarePCIeDevice{
+			{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
+			{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
+		},
 		PowerSupplies: []schema.HardwarePowerSupply{
 			{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
@@ -31,9 +41,12 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
 	if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
 		t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
 	}
-	if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
+	if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
 		t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
 	}
+	if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
+		t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
+	}
 	if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
 		t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
 	}
--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -2,18 +2,21 @@ package collector

 import (
 	"bee/audit/internal/schema"
+	"context"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"time"
 )

 const mellanoxVendorID = 0x15b3
+const nicProbeTimeout = 2 * time.Second

 var (
 	mstflintQuery = func(bdf string) (string, error) {
-		out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
 		if err != nil {
 			return "", err
 		}
@@ -21,7 +24,7 @@ var (
 	}

 	ethtoolInfoQuery = func(iface string) (string, error) {
-		out, err := exec.Command("ethtool", "-i", iface).Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
 		if err != nil {
 			return "", err
 		}
@@ -29,6 +32,14 @@ var (
 	}

 	netIfacesByBDF = listNetIfacesByBDF
+	readNetCarrierFile = func(iface string) (string, error) {
+		path := filepath.Join("/sys/class/net", iface, "carrier")
+		raw, err := os.ReadFile(path)
+		if err != nil {
+			return "", err
+		}
+		return strings.TrimSpace(string(raw)), nil
+	}
 )

 // enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
 	}
 	return ifaces
 }
+
+// commandOutputWithTimeout runs name with args and returns its standard
+// output. The command is bound to a context that expires after timeout.
+func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
+	deadlineCtx, release := context.WithTimeout(context.Background(), timeout)
+	defer release()
+	cmd := exec.CommandContext(deadlineCtx, name, args...)
+	return cmd.Output()
+}
+
+// interfaceHasCarrier reports whether the carrier value read for iface is "1".
+// A read error is treated as no carrier.
+func interfaceHasCarrier(iface string) bool {
+	if carrier, err := readNetCarrierFile(iface); err == nil {
+		return strings.TrimSpace(carrier) == "1"
+	}
+	return false
+}
--- a/audit/internal/collector/nic_telemetry.go
+++ b/audit/internal/collector/nic_telemetry.go
@@ -12,7 +12,7 @@ import (

 var (
 	ethtoolModuleQuery = func(iface string) (string, error) {
-		out, err := raidToolQuery("ethtool", "-m", iface)
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
 		if err != nil {
 			return "", err
 		}
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
 			}
 		}

-		if out, err := ethtoolModuleQuery(iface); err == nil {
-			if injectSFPDOMTelemetry(&devs[i], out) {
-				enriched++
-				continue
+		if interfaceHasCarrier(iface) {
+			if out, err := ethtoolModuleQuery(iface); err == nil {
+				if injectSFPDOMTelemetry(&devs[i], out) {
+					enriched++
+					continue
+				}
 			}
 		}
 		if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
--- a/audit/internal/collector/nic_telemetry_test.go
+++ b/audit/internal/collector/nic_telemetry_test.go
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	origReadMAC := readNetAddressFile
 	origEth := ethtoolInfoQuery
 	origModule := ethtoolModuleQuery
+	origCarrier := readNetCarrierFile
 	t.Cleanup(func() {
 		queryPCILSPCIDetail = origDetail
 		readPCIVPDFile = origVPD
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		readNetAddressFile = origReadMAC
 		ethtoolInfoQuery = origEth
 		ethtoolModuleQuery = origModule
+		readNetCarrierFile = origCarrier
 	})

 	queryPCILSPCIDetail = func(bdf string) (string, error) {
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		}
 		return "aa:bb:cc:dd:ee:ff", nil
 	}
+	readNetCarrierFile = func(string) (string, error) { return "1", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
 	ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }

@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	}
 }

+// TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier verifies that
+// enrichPCIeWithNICTelemetry does not call ethtoolModuleQuery when the carrier
+// hook reports "0", and that the MAC address is still collected.
+func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
+	// Save the package-level hooks so they can be restored after the test.
+	origIfaces := netIfacesByBDF
+	origReadMAC := readNetAddressFile
+	origEth := ethtoolInfoQuery
+	origModule := ethtoolModuleQuery
+	origCarrier := readNetCarrierFile
+	t.Cleanup(func() {
+		netIfacesByBDF = origIfaces
+		readNetAddressFile = origReadMAC
+		ethtoolInfoQuery = origEth
+		ethtoolModuleQuery = origModule
+		readNetCarrierFile = origCarrier
+	})
+
+	netIfacesByBDF = func(string) []string { return []string{"eth0"} }
+	readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
+	readNetCarrierFile = func(string) (string, error) { return "0", nil }
+	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
+	// Any call to the module query fails the test immediately.
+	ethtoolModuleQuery = func(string) (string, error) {
+		t.Fatal("ethtool -m should not be called without carrier")
+		return "", nil
+	}
+
+	class := "EthernetController"
+	bdf := "0000:18:00.0"
+	devs := []schema.HardwarePCIeDevice{{
+		DeviceClass: &class,
+		BDF:         &bdf,
+	}}
+
+	out := enrichPCIeWithNICTelemetry(devs)
+	if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
+		t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
+	}
+}
+
 func TestDBMValue(t *testing.T) {
 	tests := []struct {
 		in   string
--- a/audit/internal/collector/nvidia.go
+++ b/audit/internal/collector/nvidia.go
@@ -13,14 +13,18 @@ import (
 const nvidiaVendorID = 0x10de

 type nvidiaGPUInfo struct {
-	BDF            string
-	Serial         string
-	VBIOS          string
-	TemperatureC   *float64
-	PowerW         *float64
-	ECCUncorrected *int64
-	ECCCorrected   *int64
-	HWSlowdown     *bool
+	BDF                string
+	Serial             string
+	VBIOS              string
+	TemperatureC       *float64
+	PowerW             *float64
+	ECCUncorrected     *int64
+	ECCCorrected       *int64
+	HWSlowdown         *bool
+	PCIeLinkGenCurrent *int
+	PCIeLinkGenMax     *int
+	PCIeLinkWidthCur   *int
+	PCIeLinkWidthMax   *int
 }

 // enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
 	out, err := exec.Command(
 		"nvidia-smi",
-		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
+		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
 		"--format=csv,noheader,nounits",
 	).Output()
 	if err != nil {
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		if len(rec) == 0 {
 			continue
 		}
-		if len(rec) < 9 {
-			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
+		if len(rec) < 13 {
+			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
 		}

 		bdf := normalizePCIeBDF(rec[1])
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		}

 		info := nvidiaGPUInfo{
-			BDF:            bdf,
-			Serial:         strings.TrimSpace(rec[2]),
-			VBIOS:          strings.TrimSpace(rec[3]),
-			TemperatureC:   parseMaybeFloat(rec[4]),
-			PowerW:         parseMaybeFloat(rec[5]),
-			ECCUncorrected: parseMaybeInt64(rec[6]),
-			ECCCorrected:   parseMaybeInt64(rec[7]),
-			HWSlowdown:     parseMaybeBool(rec[8]),
+			BDF:                bdf,
+			Serial:             strings.TrimSpace(rec[2]),
+			VBIOS:              strings.TrimSpace(rec[3]),
+			TemperatureC:       parseMaybeFloat(rec[4]),
+			PowerW:             parseMaybeFloat(rec[5]),
+			ECCUncorrected:     parseMaybeInt64(rec[6]),
+			ECCCorrected:       parseMaybeInt64(rec[7]),
+			HWSlowdown:         parseMaybeBool(rec[8]),
+			PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
+			PCIeLinkGenMax:     parseMaybeInt(rec[10]),
+			PCIeLinkWidthCur:   parseMaybeInt(rec[11]),
+			PCIeLinkWidthMax:   parseMaybeInt(rec[12]),
 		}
 		result[bdf] = info
 	}
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
 	return &n
 }

+// parseMaybeInt parses v as a decimal int after trimming whitespace. It
+// returns nil for an empty value, for the placeholders "n/a", "not supported"
+// and "[not supported]" (case-insensitive), and for anything strconv.Atoi
+// rejects.
+func parseMaybeInt(v string) *int {
+	trimmed := strings.TrimSpace(v)
+	if trimmed == "" {
+		return nil
+	}
+	for _, placeholder := range []string{"n/a", "not supported", "[not supported]"} {
+		if strings.EqualFold(trimmed, placeholder) {
+			return nil
+		}
+	}
+	parsed, err := strconv.Atoi(trimmed)
+	if err != nil {
+		return nil
+	}
+	return &parsed
+}
+
+// pcieLinkGenLabel formats a PCIe generation number as "Gen<N>".
+func pcieLinkGenLabel(gen int) string {
+	return "Gen" + strconv.Itoa(gen)
+}
+
 func parseMaybeBool(v string) *bool {
 	v = strings.TrimSpace(strings.ToLower(v))
 	switch v {
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
 	if info.HWSlowdown != nil {
 		dev.HWSlowdown = info.HWSlowdown
 	}
+	// Override PCIe link speed/width with nvidia-smi driver values.
+	// sysfs current_link_speed reflects the instantaneous physical link state and
+	// can show Gen1 when the GPU is idle due to ASPM power management. The driver
+	// knows the negotiated speed regardless of the current power state.
+	if info.PCIeLinkGenCurrent != nil {
+		s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
+		dev.LinkSpeed = &s
+	}
+	if info.PCIeLinkGenMax != nil {
+		s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
+		dev.MaxLinkSpeed = &s
+	}
+	if info.PCIeLinkWidthCur != nil {
+		dev.LinkWidth = info.PCIeLinkWidthCur
+	}
+	if info.PCIeLinkWidthMax != nil {
+		dev.MaxLinkWidth = info.PCIeLinkWidthMax
+	}
 }
--- a/audit/internal/collector/nvidia_test.go
+++ b/audit/internal/collector/nvidia_test.go
@@ -6,7 +6,7 @@ import (
 )

 func TestParseNVIDIASMIQuery(t *testing.T) {
-	raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
+	raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
 	byBDF, err := parseNVIDIASMIQuery(raw)
 	if err != nil {
 		t.Fatalf("parse failed: %v", err)
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
 	if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
 		t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
 	}
+	if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
+		t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
+	}
+	if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
+		t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
+	}
 }

 func TestNormalizePCIeBDF(t *testing.T) {
--- a/audit/internal/collector/pcie.go
+++ b/audit/internal/collector/pcie.go
@@ -59,6 +59,7 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
 		"host bridge",
 		"isa bridge",
 		"pci bridge",
+		"co-processor",
 		"performance counter",
 		"performance counters",
 		"ram memory",
--- a/audit/internal/collector/pcie_filter_test.go
+++ b/audit/internal/collector/pcie_filter_test.go
@@ -19,6 +19,7 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
 		{name: "audio", class: "Audio device", want: false},
 		{name: "host bridge", class: "Host bridge", want: false},
 		{name: "pci bridge", class: "PCI bridge", want: false},
+		{name: "co-processor", class: "Co-processor", want: false},
 		{name: "smbus", class: "SMBus", want: false},
 		{name: "perf", class: "Performance counters", want: false},
 		{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
@@ -76,6 +77,20 @@ func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
 	}
 }

+// TestParseLspci_filtersCoProcessors feeds parseLspci one Co-processor record
+// and one VGA controller record and expects only the VGA device to remain.
+func TestParseLspci_filtersCoProcessors(t *testing.T) {
+	input := "" +
+		"Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n" +
+		"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
+
+	devs := parseLspci(input)
+	if len(devs) != 1 {
+		t.Fatalf("expected 1 remaining device, got %d", len(devs))
+	}
+	if devs[0].Model == nil || *devs[0].Model != "H100" {
+		t.Fatalf("unexpected remaining device: %+v", devs[0])
+	}
+}
+
 func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
 	input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"

--- a/audit/internal/collector/storage.go
+++ b/audit/internal/collector/storage.go
@@ -77,11 +77,28 @@ func discoverStorageDevices() []lsblkDevice {
 		if dev.Type != "disk" {
 			continue
 		}
+		if isVirtualBMCDisk(dev) {
+			slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
+			continue
+		}
 		disks = append(disks, dev)
 	}
 	return disks
 }

+// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
+// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
+// Such devices typically report zero size and a generic fake serial, but the
+// only property checked here is the model name, which must start with
+// "Virtual HDisk" (see isVirtualHDiskModel).
+func isVirtualBMCDisk(dev lsblkDevice) bool {
+	return isVirtualHDiskModel(dev.Model)
+}
+
+// isVirtualHDiskModel reports whether model, trimmed and lower-cased, begins
+// with "virtual hdisk".
+func isVirtualHDiskModel(model string) bool {
+	normalized := strings.ToLower(strings.TrimSpace(model))
+	return strings.HasPrefix(normalized, "virtual hdisk")
+}
+
 func lsblkDevices() []lsblkDevice {
 	out, err := exec.Command("lsblk", "-J", "-d",
 		"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -0,0 +1,141 @@
+package platform
+
+import (
+	"fmt"
+	"strings"
+	"time"
+)
+
+// renderBenchmarkReport renders result as a plain-text report: a header,
+// optional findings and warnings, a per-GPU scorecard, an optional
+// interconnect section, a methodology note, and the list of raw files.
+func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
+	var b strings.Builder
+	fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
+	fmt.Fprintf(&b, "===========================\n\n")
+	// The layout carries a literal "UTC" suffix, so the timestamp is converted
+	// to UTC first; otherwise a local time would be mislabeled.
+	fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
+	fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
+	fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
+	fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
+	fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
+
+	if len(result.Findings) > 0 {
+		fmt.Fprintf(&b, "Executive Summary\n")
+		fmt.Fprintf(&b, "-----------------\n")
+		for _, finding := range result.Findings {
+			fmt.Fprintf(&b, "- %s\n", finding)
+		}
+		b.WriteString("\n")
+	}
+
+	if len(result.Warnings) > 0 {
+		fmt.Fprintf(&b, "Warnings\n")
+		fmt.Fprintf(&b, "--------\n")
+		for _, warning := range result.Warnings {
+			fmt.Fprintf(&b, "- %s\n", warning)
+		}
+		b.WriteString("\n")
+	}
+
+	fmt.Fprintf(&b, "Per GPU Scorecard\n")
+	fmt.Fprintf(&b, "-----------------\n")
+	for _, gpu := range result.GPUs {
+		fmt.Fprintf(&b, "GPU %d  %s\n", gpu.Index, gpu.Name)
+		fmt.Fprintf(&b, "  Status: %s\n", gpu.Status)
+		fmt.Fprintf(&b, "  Composite score: %.2f\n", gpu.Scores.CompositeScore)
+		fmt.Fprintf(&b, "  Compute score: %.2f\n", gpu.Scores.ComputeScore)
+		fmt.Fprintf(&b, "  Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
+		fmt.Fprintf(&b, "  Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
+		fmt.Fprintf(&b, "  Stability: %.1f\n", gpu.Scores.StabilityScore)
+		// The interconnect score is printed only when it is positive.
+		if gpu.Scores.InterconnectScore > 0 {
+			fmt.Fprintf(&b, "  Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
+		}
+		if len(gpu.DegradationReasons) > 0 {
+			fmt.Fprintf(&b, "  Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
+		}
+		fmt.Fprintf(&b, "  Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
+		fmt.Fprintf(&b, "  P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
+		if len(gpu.PrecisionResults) > 0 {
+			fmt.Fprintf(&b, "  Precision results:\n")
+			for _, precision := range gpu.PrecisionResults {
+				if precision.Supported {
+					fmt.Fprintf(&b, "    - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
+				} else {
+					fmt.Fprintf(&b, "    - %s: unsupported (%s)\n", precision.Name, precision.Notes)
+				}
+			}
+		}
+		fmt.Fprintf(&b, "  Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
+			gpu.Throttle.SWPowerCapUS,
+			gpu.Throttle.SWThermalSlowdownUS,
+			gpu.Throttle.SyncBoostUS,
+			gpu.Throttle.HWThermalSlowdownUS,
+			gpu.Throttle.HWPowerBrakeSlowdownUS,
+		)
+		if len(gpu.Notes) > 0 {
+			fmt.Fprintf(&b, "  Notes:\n")
+			for _, note := range gpu.Notes {
+				fmt.Fprintf(&b, "    - %s\n", note)
+			}
+		}
+		b.WriteString("\n")
+	}
+
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "Interconnect\n")
+		fmt.Fprintf(&b, "------------\n")
+		fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
+		if result.Interconnect.Supported {
+			fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
+			fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
+		}
+		for _, note := range result.Interconnect.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		b.WriteString("\n")
+	}
+
+	fmt.Fprintf(&b, "Methodology\n")
+	fmt.Fprintf(&b, "-----------\n")
+	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
+	fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
+	fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
+
+	fmt.Fprintf(&b, "Raw Files\n")
+	fmt.Fprintf(&b, "---------\n")
+	fmt.Fprintf(&b, "- result.json\n")
+	fmt.Fprintf(&b, "- report.txt\n")
+	fmt.Fprintf(&b, "- summary.txt\n")
+	fmt.Fprintf(&b, "- verbose.log\n")
+	fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
+	fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
+	fmt.Fprintf(&b, "- gpu-*-steady.log\n")
+	fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
+	fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
+	// The NCCL log is listed only when an interconnect result is present.
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
+	}
+	return b.String()
+}
+
+// renderBenchmarkSummary renders result as key=value lines: run metadata, one
+// status and composite-score pair per GPU, the best composite score, and the
+// interconnect status and maximum bus bandwidth when an interconnect result
+// is present.
+func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
+	var b strings.Builder
+	// The key is named run_at_utc, so the timestamp is converted to UTC before
+	// formatting instead of being emitted with its original zone offset.
+	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
+	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
+	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
+	fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
+	var best float64
+	for i, gpu := range result.GPUs {
+		fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
+		fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
+		// Seed best from the first GPU so negative scores are handled too.
+		if i == 0 || gpu.Scores.CompositeScore > best {
+			best = gpu.Scores.CompositeScore
+		}
+	}
+	fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
+		fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
+	}
+	return b.String()
+}
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -0,0 +1,132 @@
+package platform
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestResolveBenchmarkProfile checks that resolveBenchmarkProfile maps the
+// empty name to the standard profile and the "stability" and "overnight"
+// names to their expected phase durations.
+func TestResolveBenchmarkProfile(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name    string
+		profile string
+		want    benchmarkProfileSpec
+	}{
+		{
+			name:    "default",
+			profile: "",
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
+		},
+		{
+			name:    "stability",
+			profile: "stability",
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
+		},
+		{
+			name:    "overnight",
+			profile: "overnight",
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
+		},
+	}
+
+	for _, tc := range cases {
+		// Per-iteration copy so the subtest closure does not share tc.
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			got := resolveBenchmarkProfile(tc.profile)
+			if got != tc.want {
+				t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestParseBenchmarkBurnLog feeds a sample bee-gpu-burn log to
+// parseBenchmarkBurnLog and checks the backend, the compute capability, and
+// the two parsed precision profiles.
+func TestParseBenchmarkBurnLog(t *testing.T) {
+	t.Parallel()
+
+	raw := strings.Join([]string{
+		"loader=bee-gpu-burn",
+		"[gpu 0] device=NVIDIA H100",
+		"[gpu 0] compute_capability=9.0",
+		"[gpu 0] backend=cublasLt",
+		"[gpu 0] duration_s=10",
+		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
+		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
+		"[gpu 0] fp16_tensor_iterations=200",
+		"[gpu 0] fp8_e4m3_iterations=50",
+		"[gpu 0] status=OK",
+	}, "\n")
+
+	got := parseBenchmarkBurnLog(raw)
+	if got.Backend != "cublasLt" {
+		t.Fatalf("backend=%q want cublasLt", got.Backend)
+	}
+	if got.ComputeCapability != "9.0" {
+		t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
+	}
+	if len(got.Profiles) != 2 {
+		t.Fatalf("profiles=%d want 2", len(got.Profiles))
+	}
+	if got.Profiles[0].TeraOpsPerSec <= 0 {
+		t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
+	}
+	if got.Profiles[1].Category != "fp8" {
+		t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
+	}
+}
+
+// TestRenderBenchmarkReportIncludesFindingsAndScores renders a report for a
+// single-GPU result and checks that the findings section, the composite score,
+// and the precision line appear in the output.
+func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
+	t.Parallel()
+
+	result := NvidiaBenchmarkResult{
+		BenchmarkVersion:   benchmarkVersion,
+		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
+		OverallStatus:      "PARTIAL",
+		SelectedGPUIndices: []int{0},
+		Normalization: BenchmarkNormalization{
+			Status: "partial",
+		},
+		Findings: []string{"GPU 0 spent measurable time under SW power cap."},
+		GPUs: []BenchmarkGPUResult{
+			{
+				Index:  0,
+				Name:   "NVIDIA H100",
+				Status: "OK",
+				Steady: BenchmarkTelemetrySummary{
+					AvgPowerW:           680,
+					AvgTempC:            79,
+					AvgGraphicsClockMHz: 1725,
+					P95PowerW:           700,
+					P95TempC:            82,
+					P95GraphicsClockMHz: 1800,
+				},
+				Scores: BenchmarkScorecard{
+					ComputeScore:        1200,
+					PowerSustainScore:   96,
+					ThermalSustainScore: 88,
+					StabilityScore:      92,
+					CompositeScore:      1176,
+				},
+				PrecisionResults: []BenchmarkPrecisionResult{
+					{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
+				},
+				Throttle: BenchmarkThrottleCounters{
+					SWPowerCapUS: 1000000,
+				},
+				DegradationReasons: []string{"power_capped"},
+			},
+		},
+	}
+
+	report := renderBenchmarkReport(result)
+	// Each needle must appear verbatim in the rendered text.
+	for _, needle := range []string{
+		"Executive Summary",
+		"GPU 0 spent measurable time under SW power cap.",
+		"Composite score: 1176.00",
+		"fp16_tensor: 700.00 TOPS",
+	} {
+		if !strings.Contains(report, needle) {
+			t.Fatalf("report missing %q\n%s", needle, report)
+		}
+	}
+}
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -0,0 +1,132 @@
+package platform
+
+import "time"
+
+// Names of the NVIDIA benchmark profiles. These are the string values carried
+// in NvidiaBenchmarkResult.BenchmarkProfile (and, presumably, in
+// NvidiaBenchmarkOptions.Profile — confirm against the benchmark runner).
+const (
+	NvidiaBenchmarkProfileStandard  = "standard"
+	NvidiaBenchmarkProfileStability = "stability"
+	NvidiaBenchmarkProfileOvernight = "overnight"
+)
+
+// NvidiaBenchmarkOptions holds the caller-supplied settings for one NVIDIA
+// benchmark run: the profile name, a size in MB, the GPU indices to include
+// and to exclude, and whether to run the NCCL step.
+// NOTE(review): how SizeMB is used and how GPUIndices and ExcludeGPUIndices
+// combine is decided by the benchmark runner, which is not part of this file.
+type NvidiaBenchmarkOptions struct {
+	Profile           string
+	SizeMB            int
+	GPUIndices        []int
+	ExcludeGPUIndices []int
+	RunNCCL           bool
+}
+
+// NvidiaBenchmarkResult is the top-level, JSON-serialisable record of one
+// benchmark run. It carries run metadata (version, timestamp, hostname,
+// profile), the overall status, free-text findings and warnings, the
+// normalization outcome, one BenchmarkGPUResult per GPU and an optional
+// interconnect result. Interconnect is a pointer so that it is omitted from
+// the JSON output when nil.
+type NvidiaBenchmarkResult struct {
+	BenchmarkVersion   string                       `json:"benchmark_version"`
+	GeneratedAt        time.Time                    `json:"generated_at"`
+	Hostname           string                       `json:"hostname,omitempty"`
+	BenchmarkProfile   string                       `json:"benchmark_profile"`
+	OverallStatus      string                       `json:"overall_status"`
+	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
+	Findings           []string                     `json:"findings,omitempty"`
+	Warnings           []string                     `json:"warnings,omitempty"`
+	Normalization      BenchmarkNormalization       `json:"normalization"`
+	GPUs               []BenchmarkGPUResult         `json:"gpus"`
+	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+}
+
+// BenchmarkNormalization summarises the normalization step of a run: an
+// overall status string, optional notes, and optional per-GPU detail.
+// NOTE(review): the set of valid Status values is not defined in this file.
+type BenchmarkNormalization struct {
+	Status string                      `json:"status"`
+	Notes  []string                    `json:"notes,omitempty"`
+	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
+}
+
+// BenchmarkNormalizationGPU records the normalization detail for one GPU,
+// identified by Index: its persistence mode, the GPU and memory clock lock
+// values (MHz, per the field names) with a status string for each, and notes.
+// All fields except Index are omitted from JSON when empty or zero.
+type BenchmarkNormalizationGPU struct {
+	Index                 int      `json:"index"`
+	PersistenceMode       string   `json:"persistence_mode,omitempty"`
+	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
+	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
+	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
+	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
+	Notes                 []string `json:"notes,omitempty"`
+}
+
+// BenchmarkGPUResult is the per-GPU section of a benchmark result. It holds
+// the device identity (index, UUID, name, bus ID, VBIOS, compute capability),
+// the backend used, a status string, power and clock limits, telemetry
+// summaries for three phases (Baseline, Steady, Cooldown), throttle counters,
+// per-precision results, the scorecard, and free-text degradation reasons and
+// notes. Unit suffixes in the field names (W, MHz) give the intended units.
+type BenchmarkGPUResult struct {
+	Index                  int                        `json:"index"`
+	UUID                   string                     `json:"uuid,omitempty"`
+	Name                   string                     `json:"name,omitempty"`
+	BusID                  string                     `json:"bus_id,omitempty"`
+	VBIOS                  string                     `json:"vbios,omitempty"`
+	ComputeCapability      string                     `json:"compute_capability,omitempty"`
+	Backend                string                     `json:"backend,omitempty"`
+	Status                 string                     `json:"status"`
+	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
+	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
+	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
+	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
+	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
+	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
+	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
+	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
+	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
+	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
+	Scores                 BenchmarkScorecard         `json:"scores"`
+	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
+	Notes                  []string                   `json:"notes,omitempty"`
+}
+
+// BenchmarkTelemetrySummary aggregates the telemetry collected during one
+// phase of a run: its duration and sample count, average and P95 values for
+// temperature, power, graphics clock and memory clock, average usage
+// percentages, and variation/drift percentages for clock, power and
+// temperature. No field is omitted from JSON, so zero values are serialised.
+// NOTE(review): "P95" and "CV" presumably mean 95th percentile and
+// coefficient of variation — confirm against the code that fills this in.
+type BenchmarkTelemetrySummary struct {
+	DurationSec         float64 `json:"duration_sec"`
+	Samples             int     `json:"samples"`
+	AvgTempC            float64 `json:"avg_temp_c"`
+	P95TempC            float64 `json:"p95_temp_c"`
+	AvgPowerW           float64 `json:"avg_power_w"`
+	P95PowerW           float64 `json:"p95_power_w"`
+	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
+	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
+	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
+	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
+	AvgUsagePct         float64 `json:"avg_usage_pct"`
+	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
+	ClockCVPct          float64 `json:"clock_cv_pct"`
+	PowerCVPct          float64 `json:"power_cv_pct"`
+	TempCVPct           float64 `json:"temp_cv_pct"`
+	ClockDriftPct       float64 `json:"clock_drift_pct"`
+}
+
+// BenchmarkThrottleCounters holds one unsigned counter per throttle reason:
+// software power cap, software thermal slowdown, sync boost, hardware thermal
+// slowdown and hardware power-brake slowdown.
+// NOTE(review): the "US" suffix suggests microseconds spent in each state —
+// confirm against the code that populates the counters.
+type BenchmarkThrottleCounters struct {
+	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
+	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
+	SyncBoostUS            uint64 `json:"sync_boost_us"`
+	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
+	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
+}
+
+// BenchmarkPrecisionResult is the outcome of one precision test on a GPU: its
+// name and category (for example "fp16_tensor" and "fp8"), whether the
+// precision is supported, the lane count, the M/N/K sizes and iteration count
+// of the workload, the measured throughput in tera-operations per second, and
+// a free-text note. Name, Category and Supported are always serialised; the
+// remaining fields are omitted from JSON when zero or empty.
+type BenchmarkPrecisionResult struct {
+	Name          string  `json:"name"`
+	Category      string  `json:"category"`
+	Supported     bool    `json:"supported"`
+	Lanes         int     `json:"lanes,omitempty"`
+	M             uint64  `json:"m,omitempty"`
+	N             uint64  `json:"n,omitempty"`
+	K             uint64  `json:"k,omitempty"`
+	Iterations    uint64  `json:"iterations,omitempty"`
+	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
+	Notes         string  `json:"notes,omitempty"`
+}
+
+// BenchmarkScorecard groups the scores computed for one GPU: compute, power
+// sustain, thermal sustain, stability and interconnect scores, plus the
+// composite score. All fields are always serialised.
+// NOTE(review): score ranges and the composite formula are defined by the
+// scoring code, which is not part of this file.
+type BenchmarkScorecard struct {
+	ComputeScore        float64 `json:"compute_score"`
+	PowerSustainScore   float64 `json:"power_sustain_score"`
+	ThermalSustainScore float64 `json:"thermal_sustain_score"`
+	StabilityScore      float64 `json:"stability_score"`
+	InterconnectScore   float64 `json:"interconnect_score"`
+	CompositeScore      float64 `json:"composite_score"`
+}
+
+// BenchmarkInterconnectResult describes the optional interconnect test: a
+// status string, whether the test was attempted and supported, the GPU
+// indices that took part, the average and maximum algorithm and bus
+// bandwidth (GB/s, per the field names), and notes. Status, Attempted and
+// Supported are always serialised; the rest is omitted when empty or zero.
+type BenchmarkInterconnectResult struct {
+	Status             string   `json:"status"`
+	Attempted          bool     `json:"attempted"`
+	Supported          bool     `json:"supported"`
+	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
+	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
+	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
+	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
+	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
+	Notes              []string `json:"notes,omitempty"`
+}
--- a/audit/internal/platform/error_patterns.go
+++ b/audit/internal/platform/error_patterns.go
@@ -0,0 +1,139 @@
+package platform
+
+import "regexp"
+
+// ErrorPattern describes a kernel log pattern that indicates a hardware error.
+// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
+// BDFGroup and DevGroup are indices into the submatches of Re; at most one
+// capture group is used by each pattern currently defined in this file.
+type ErrorPattern struct {
+	// Name is a short machine-readable label for logging and deduplication.
+	Name string
+	// Re is the compiled regular expression matched against a single kmsg line.
+	// Patterns are not mutually exclusive: one line may match several entries.
+	Re *regexp.Regexp
+	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
+	Category string
+	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
+	// NOTE(review): every entry in HardwareErrorPatterns currently uses "warning".
+	Severity string
+	// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
+	// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
+	BDFGroup int
+	// DevGroup is the capture group index (1-based) that contains a device name
+	// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
+	DevGroup int
+}
+
+// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
+// To add a new pattern: append a new ErrorPattern struct to this slice.
+//
+// All expressions are case-insensitive ((?i)). Patterns are independent of one
+// another, so a single kmsg line may match more than one entry. The
+// "corrected"/"correctable" patterns are anchored with \b so that they do not
+// also fire on "Uncorrected"/"Uncorrectable" lines, which contain those words
+// as a suffix.
+var HardwareErrorPatterns = []ErrorPattern{
+	// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
+	{
+		Name:     "nvidia-rminitadapter",
+		Re:       mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "nvidia-msi-fail",
+		Re:       mustPat(`(?i)NVRM:.*Failed to enable MSI`),
+		Category: "gpu",
+		Severity: "warning",
+	},
+	{
+		Name:     "nvidia-aer",
+		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "nvidia-xid",
+		Re:       mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+
+	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
+	{
+		Name:     "pcie-aer",
+		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "pcie-uncorrectable",
+		Re:       mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "pcie-link-down",
+		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+
+	// ── Storage ─────────────────────────────────────────────────────────────────
+	{
+		Name:     "blk-io-error",
+		Re:       mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
+		Category: "storage",
+		Severity: "warning",
+		DevGroup: 1,
+	},
+	{
+		Name:     "nvme-timeout",
+		Re:       mustPat(`(?i)nvme\s+(\w+):.*timeout`),
+		Category: "storage",
+		Severity: "warning",
+		DevGroup: 1,
+	},
+	{
+		Name:     "scsi-failed",
+		Re:       mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
+		Category: "storage",
+		Severity: "warning",
+	},
+	{
+		Name:     "nvme-reset",
+		Re:       mustPat(`(?i)nvme\s+(\w+):.*reset`),
+		Category: "storage",
+		Severity: "warning",
+		DevGroup: 1,
+	},
+
+	// ── Machine Check Exceptions ────────────────────────────────────────────────
+	{
+		Name:     "mce-hardware-error",
+		Re:       mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
+		Category: "mce",
+		Severity: "warning",
+	},
+	{
+		Name: "mce-corrected",
+		// \b keeps "Uncorrected" lines from being reported as corrected events.
+		Re:       mustPat(`(?i)mce:.*\bcorrected`),
+		Category: "mce",
+		Severity: "warning",
+	},
+
+	// ── Memory ─────────────────────────────────────────────────────────────────
+	{
+		Name:     "edac-ue",
+		Re:       mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
+		Category: "memory",
+		Severity: "warning",
+	},
+	{
+		Name: "edac-ce",
+		// \b keeps "Uncorrectable" lines (already covered by edac-ue) from also
+		// matching here.
+		Re:       mustPat(`(?i)EDAC.*\bcorrectable`),
+		Category: "memory",
+		Severity: "warning",
+	},
+}
+
+// mustPat compiles s into a regular expression and panics if s is not a valid
+// pattern. It is used to build the package-level pattern table.
+func mustPat(s string) *regexp.Regexp {
+	compiled := regexp.MustCompile(s)
+	return compiled
+}
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -20,12 +20,13 @@ type GPUMetricRow struct {
 	MemUsagePct float64 `json:"mem_usage_pct"`
 	PowerW      float64 `json:"power_w"`
 	ClockMHz    float64 `json:"clock_mhz"`
+	MemClockMHz float64 `json:"mem_clock_mhz"`
 }

 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
 func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 	args := []string{
-		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
+		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
 		"--format=csv,noheader,nounits",
 	}
 	if len(gpuIndices) > 0 {
@@ -46,7 +47,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			continue
 		}
 		parts := strings.Split(line, ", ")
-		if len(parts) < 6 {
+		if len(parts) < 7 {
 			continue
 		}
 		idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
@@ -57,6 +58,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			MemUsagePct: parseGPUFloat(parts[3]),
 			PowerW:      parseGPUFloat(parts[4]),
 			ClockMHz:    parseGPUFloat(parts[5]),
+			MemClockMHz: parseGPUFloat(parts[6]),
 		})
 	}
 	return rows, nil
@@ -76,13 +78,73 @@ func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 	return sampleGPUMetrics(gpuIndices)
 }

+// sampleAMDGPUMetrics runs rocm-smi in CSV mode and converts each data row
+// into a GPUMetricRow holding temperature, GPU usage, memory usage and power.
+// Columns are located by matching lower-cased header names against keyword
+// lists; a column that cannot be found, or a cell reading "n/a", yields 0.
+// GPUIndex is the position of the row in the output, not a parsed device ID.
+// An error is returned when rocm-smi fails or no data rows are present.
+func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
+	raw, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
+	if err != nil {
+		return nil, err
+	}
+	records := strings.Split(strings.TrimSpace(string(raw)), "\n")
+	if len(records) < 2 {
+		return nil, fmt.Errorf("rocm-smi: insufficient output")
+	}
+
+	// The first header (in file order) containing any keyword wins.
+	header := strings.Split(records[0], ",")
+	findColumn := func(keywords ...string) int {
+		for col, title := range header {
+			title = strings.ToLower(strings.TrimSpace(title))
+			for _, kw := range keywords {
+				if strings.Contains(title, kw) {
+					return col
+				}
+			}
+		}
+		return -1
+	}
+	tempCol := findColumn("sensor edge", "temperature (c)", "temp")
+	useCol := findColumn("gpu use (%)")
+	memCol := findColumn("vram%", "memory allocated")
+	powerCol := findColumn("average graphics package power", "power (w)")
+
+	// cell reads one numeric field, treating missing columns and "n/a" as 0.
+	cell := func(fields []string, col int) float64 {
+		if col < 0 || col >= len(fields) {
+			return 0
+		}
+		text := strings.TrimSpace(fields[col])
+		if strings.EqualFold(text, "n/a") {
+			return 0
+		}
+		return parseGPUFloat(text)
+	}
+
+	var rows []GPUMetricRow
+	for _, record := range records[1:] {
+		record = strings.TrimSpace(record)
+		if record == "" {
+			continue
+		}
+		fields := strings.Split(record, ",")
+		rows = append(rows, GPUMetricRow{
+			GPUIndex:    len(rows),
+			TempC:       cell(fields, tempCol),
+			UsagePct:    cell(fields, useCol),
+			MemUsagePct: cell(fields, memCol),
+			PowerW:      cell(fields, powerCol),
+		})
+	}
+	if len(rows) == 0 {
+		return nil, fmt.Errorf("rocm-smi: no GPU rows parsed")
+	}
+	return rows, nil
+}
+
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
+	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
 	for _, r := range rows {
-		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
-			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
+		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
+			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
@@ -137,7 +199,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	const PW = plotX2 - plotX1
 	const PH = plotY2 - plotY1
 	// Outer axes
-	const tempAxisX = 60  // temp axis line
+	const tempAxisX = 60   // temp axis line
 	const clockAxisX = 900 // clock axis line

 	colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
--- a/audit/internal/platform/install.go
+++ b/audit/internal/platform/install.go
@@ -11,10 +11,10 @@ import (

 // InstallDisk describes a candidate disk for installation.
 type InstallDisk struct {
-	Device      string   // e.g. /dev/sda
-	Model       string
-	Size        string   // human-readable, e.g. "500G"
-	SizeBytes   int64    // raw byte count from lsblk
+	Device       string // e.g. /dev/sda
+	Model        string
+	Size         string   // human-readable, e.g. "500G"
+	SizeBytes    int64    // raw byte count from lsblk
 	MountedParts []string // partition mount points currently active
 }

@@ -117,6 +117,61 @@ func findLiveBootDevice() string {
 	return "/dev/" + strings.TrimSpace(string(out2))
 }

+// mountSource returns the SOURCE column reported by findmnt for the given
+// mount target, with surrounding whitespace removed. It returns "" when
+// findmnt fails.
+func mountSource(target string) string {
+	raw, err := exec.Command("findmnt", "-n", "-o", "SOURCE", target).Output()
+	if err == nil {
+		return strings.TrimSpace(string(raw))
+	}
+	return ""
+}
+
+// mountFSType returns the FSTYPE column reported by findmnt for the given
+// mount target, with surrounding whitespace removed. It returns "" when
+// findmnt fails.
+func mountFSType(target string) string {
+	raw, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", target).Output()
+	if err == nil {
+		return strings.TrimSpace(string(raw))
+	}
+	return ""
+}
+
+// blockDeviceType returns the TYPE column reported by lsblk for device. It
+// returns "" for a blank device name or when lsblk fails.
+func blockDeviceType(device string) string {
+	if strings.TrimSpace(device) != "" {
+		raw, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
+		if err == nil {
+			return strings.TrimSpace(string(raw))
+		}
+	}
+	return ""
+}
+
+// blockDeviceTransport returns the TRAN column reported by lsblk for device.
+// It returns "" for a blank device name or when lsblk fails.
+func blockDeviceTransport(device string) string {
+	if strings.TrimSpace(device) != "" {
+		raw, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
+		if err == nil {
+			return strings.TrimSpace(string(raw))
+		}
+	}
+	return ""
+}
+
+// inferLiveBootKind classifies the live boot medium as "ram", "cdrom", "usb",
+// "disk" or "unknown". Inputs are trimmed and compared case-insensitively
+// where applicable. The checks run in priority order: a tmpfs filesystem wins
+// over everything, then a "rom" device type, then a "usb" transport, and
+// finally the /dev/sr and /dev/ prefixes of the mount source.
+func inferLiveBootKind(fsType, source, deviceType, transport string) string {
+	fsType = strings.TrimSpace(fsType)
+	source = strings.TrimSpace(source)
+	deviceType = strings.TrimSpace(deviceType)
+	transport = strings.TrimSpace(transport)
+
+	if strings.EqualFold(fsType, "tmpfs") {
+		return "ram"
+	}
+	if strings.EqualFold(deviceType, "rom") {
+		return "cdrom"
+	}
+	if strings.EqualFold(transport, "usb") {
+		return "usb"
+	}
+	if strings.HasPrefix(source, "/dev/sr") {
+		return "cdrom"
+	}
+	if strings.HasPrefix(source, "/dev/") {
+		return "disk"
+	}
+	return "unknown"
+}
+
 // MinInstallBytes returns the minimum recommended disk size for installation:
 // squashfs size × 1.5 to allow for extracted filesystem and bootloader.
 // Returns 0 if the squashfs is not available (non-live environment).
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -0,0 +1,220 @@
+package platform
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+)
+
+// IsLiveMediaInRAM reports whether /run/live/medium is mounted as tmpfs. When
+// the filesystem type of that mount cannot be determined it falls back to
+// toramActive().
+func (s *System) IsLiveMediaInRAM() bool {
+	if fsType := mountFSType("/run/live/medium"); fsType != "" {
+		return strings.EqualFold(fsType, "tmpfs")
+	}
+	return toramActive()
+}
+
+func (s *System) LiveBootSource() LiveBootSource {
+	fsType := mountFSType("/run/live/medium")
+	source := mountSource("/run/live/medium")
+	device := findLiveBootDevice()
+	status := LiveBootSource{
+		InRAM:  strings.EqualFold(fsType, "tmpfs"),
+		Source: source,
+		Device: device,
+	}
+	if fsType == "" && source == "" && device == "" {
+		if toramActive() {
+			status.InRAM = true
+			status.Kind = "ram"
+			status.Source = "tmpfs"
+			return status
+		}
+		status.Kind = "unknown"
+		return status
+	}
+	status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
+	if status.Kind == "" {
+		status.Kind = "unknown"
+	}
+	if status.InRAM && strings.TrimSpace(status.Source) == "" {
+		status.Source = "tmpfs"
+	}
+	return status
+}
+
+// RunInstallToRAM copies the live medium into /dev/shm/bee-live so that the
+// boot media can be removed. It is a no-op when the medium is already in RAM.
+//
+// Steps: locate the squashfs images under /run/live/medium/live, check that
+// free memory covers their size plus 256 MiB of headroom, copy each image and
+// re-point its loop device at the copy, copy the remaining medium files, and
+// bind-mount the copy over /run/live/medium.
+//
+// Progress and warnings are reported through logFunc, which may be nil.
+// Failures to re-associate a loop device or to bind-mount the copy are logged
+// as warnings and do not fail the call, but in that case the final message
+// tells the user NOT to remove the media, because parts of the system are
+// still served from it.
+//
+// A non-nil error is returned when no squashfs image is found, when there is
+// not enough RAM, when a copy fails, or when ctx is cancelled. Underlying
+// errors are wrapped with %w so callers can use errors.Is / errors.As (for
+// example to detect context.Canceled).
+func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
+	log := func(msg string) {
+		if logFunc != nil {
+			logFunc(msg)
+		}
+	}
+
+	if s.IsLiveMediaInRAM() {
+		log("Already running from RAM — installation media can be safely disconnected.")
+		return nil
+	}
+
+	squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
+	if err != nil || len(squashfsFiles) == 0 {
+		return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
+	}
+
+	free := freeMemBytes()
+	var needed int64
+	for _, sf := range squashfsFiles {
+		fi, err2 := os.Stat(sf)
+		if err2 != nil {
+			return fmt.Errorf("stat %s: %w", sf, err2)
+		}
+		needed += fi.Size()
+	}
+	const headroom = 256 * 1024 * 1024
+	// free == 0 means the amount of free memory is unknown; skip the check.
+	if free > 0 && needed+headroom > free {
+		return fmt.Errorf("insufficient RAM: need %s, available %s",
+			humanBytes(needed+headroom), humanBytes(free))
+	}
+
+	dstDir := "/dev/shm/bee-live"
+	if err := os.MkdirAll(dstDir, 0755); err != nil {
+		return fmt.Errorf("create tmpfs dir: %w", err)
+	}
+
+	// mediaStillNeeded is set when a step that detaches the running system
+	// from the boot media failed; the closing message depends on it.
+	mediaStillNeeded := false
+
+	for _, sf := range squashfsFiles {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		base := filepath.Base(sf)
+		dst := filepath.Join(dstDir, base)
+		log(fmt.Sprintf("Copying %s to RAM...", base))
+		if err := copyFileLarge(ctx, sf, dst, log); err != nil {
+			return fmt.Errorf("copy %s: %w", base, err)
+		}
+		log(fmt.Sprintf("Copied %s.", base))
+
+		loopDev, err := findLoopForFile(sf)
+		if err != nil {
+			log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, err))
+			continue
+		}
+		if err := reassociateLoopDevice(loopDev, dst); err != nil {
+			mediaStillNeeded = true
+			log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, err))
+		} else {
+			log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
+		}
+	}
+
+	log("Copying remaining medium files...")
+	if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
+		log(fmt.Sprintf("Warning: partial copy: %v", err))
+	}
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+	if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
+		mediaStillNeeded = true
+		log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
+	}
+
+	if mediaStillNeeded {
+		log("Finished with warnings. Do NOT disconnect the installation media: parts of the system are still served from it.")
+		return nil
+	}
+	log("Done. Installation media can be safely disconnected.")
+	return nil
+}
+
+// copyFileLarge copies src to dst in 4 MiB chunks, checking ctx before every
+// chunk so that a long copy can be cancelled. dst is created or truncated.
+//
+// When logFunc is non-nil and the source size is known, a progress line is
+// emitted each time the completed percentage changes (at most about 100 lines
+// per file rather than one per chunk).
+//
+// On any failure — including cancellation and an error while closing dst —
+// the partially written dst is removed, so a later run never mistakes a
+// truncated file for a finished copy. The returned error is the first error
+// encountered.
+func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) (err error) {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+	fi, err := in.Stat()
+	if err != nil {
+		return err
+	}
+	out, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// Surface a Close failure unless an earlier error already explains
+		// the problem.
+		if cerr := out.Close(); err == nil {
+			err = cerr
+		}
+		if err != nil {
+			_ = os.Remove(dst) // best effort: never leave a truncated copy behind
+		}
+	}()
+	total := fi.Size()
+	var copied int64
+	lastPct := -1
+	buf := make([]byte, 4*1024*1024)
+	for {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		n, rerr := in.Read(buf)
+		if n > 0 {
+			if _, werr := out.Write(buf[:n]); werr != nil {
+				return werr
+			}
+			copied += int64(n)
+			if logFunc != nil && total > 0 {
+				if pct := int(float64(copied) / float64(total) * 100); pct != lastPct {
+					lastPct = pct
+					logFunc(fmt.Sprintf("  %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
+				}
+			}
+		}
+		if rerr == io.EOF {
+			break
+		}
+		if rerr != nil {
+			return rerr
+		}
+	}
+	return out.Sync()
+}
+
+// cpDir mirrors the tree rooted at src into dst. Directories are created with
+// the source mode, files ending in ".squashfs" are skipped, and files that
+// already exist at the destination are left alone. Entries that filepath.Walk
+// could not read are skipped silently. The walk stops with the context error
+// as soon as ctx is cancelled. logFunc is accepted but not used here.
+func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
+	visit := func(path string, info os.FileInfo, walkErr error) error {
+		if cancelErr := ctx.Err(); cancelErr != nil {
+			return cancelErr
+		}
+		if walkErr != nil {
+			return nil
+		}
+		rel, _ := filepath.Rel(src, path)
+		target := filepath.Join(dst, rel)
+		switch {
+		case info.IsDir():
+			return os.MkdirAll(target, info.Mode())
+		case strings.HasSuffix(path, ".squashfs"):
+			return nil
+		}
+		// Copy only when nothing can be stat'ed at the target path yet.
+		if _, statErr := os.Stat(target); statErr != nil {
+			return copyFileLarge(ctx, path, target, nil)
+		}
+		return nil
+	}
+	return filepath.Walk(src, visit)
+}
+
+// findLoopForFile returns the name of the loop device whose backing file is
+// exactly backingFile, as listed by "losetup --list --json". It returns an
+// error when losetup fails, its output cannot be decoded, or no loop device
+// has that backing file.
+func findLoopForFile(backingFile string) (string, error) {
+	raw, err := exec.Command("losetup", "--list", "--json").Output()
+	if err != nil {
+		return "", err
+	}
+
+	type loopEntry struct {
+		Name     string `json:"name"`
+		BackFile string `json:"back-file"`
+	}
+	var listing struct {
+		Loopdevices []loopEntry `json:"loopdevices"`
+	}
+	if err := json.Unmarshal(raw, &listing); err != nil {
+		return "", err
+	}
+
+	for i := range listing.Loopdevices {
+		if entry := listing.Loopdevices[i]; entry.BackFile == backingFile {
+			return entry.Name, nil
+		}
+	}
+	return "", fmt.Errorf("no loop device found for %s", backingFile)
+}
+
+// reassociateLoopDevice points loopDev at newFile. It first tries
+// "losetup --replace"; if that command fails for any reason it falls back to
+// loopChangeFD and returns that call's result.
+func reassociateLoopDevice(loopDev, newFile string) error {
+	replace := exec.Command("losetup", "--replace", loopDev, newFile)
+	if replace.Run() != nil {
+		return loopChangeFD(loopDev, newFile)
+	}
+	return nil
+}
--- a/audit/internal/platform/install_to_ram_linux.go
+++ b/audit/internal/platform/install_to_ram_linux.go
@@ -0,0 +1,28 @@
+//go:build linux
+
+package platform
+
+import (
+	"os"
+	"syscall"
+)
+
+// ioctlLoopChangeFD is the LOOP_CHANGE_FD request number from <linux/loop.h>.
+// The value is 0x4C06; 0x4C08 is LOOP_SET_DIRECT_IO, which would merely toggle
+// direct I/O on the loop device instead of swapping its backing file.
+const ioctlLoopChangeFD = 0x4C06
+
+// loopChangeFD switches the backing store of loopDev to newFile using the
+// LOOP_CHANGE_FD ioctl. Per loop(4) the kernel only permits this when the loop
+// device is read-only and the new backing file has the same size and type as
+// the old one. It returns the open error for either file, or the errno
+// reported by the ioctl.
+func loopChangeFD(loopDev, newFile string) error {
+	lf, err := os.OpenFile(loopDev, os.O_RDWR, 0)
+	if err != nil {
+		return err
+	}
+	defer lf.Close()
+	nf, err := os.OpenFile(newFile, os.O_RDONLY, 0)
+	if err != nil {
+		return err
+	}
+	defer nf.Close()
+	// The third ioctl argument is the file descriptor of the new backing file.
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, lf.Fd(), ioctlLoopChangeFD, nf.Fd())
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
--- a/audit/internal/platform/install_to_ram_other.go
+++ b/audit/internal/platform/install_to_ram_other.go
@@ -0,0 +1,9 @@
+//go:build !linux
+
+package platform
+
+import "errors"
+
+// loopChangeFD is the stub used on non-Linux platforms, where the
+// LOOP_CHANGE_FD ioctl does not exist. It ignores its arguments and always
+// returns an error.
+func loopChangeFD(loopDev, newFile string) error {
+	unsupported := errors.New("LOOP_CHANGE_FD not available on this platform")
+	return unsupported
+}
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -0,0 +1,28 @@
+package platform
+
+import "testing"
+
+func TestInferLiveBootKind(t *testing.T) {
+	tests := []struct {
+		name       string
+		fsType     string
+		source     string
+		deviceType string
+		transport  string
+		want       string
+	}{
+		{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
+		{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
+		{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
+		{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
+		{name: "unknown", source: "overlay", want: "unknown"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
+			if got != tc.want {
+				t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
+			}
+		})
+	}
+}
--- a/audit/internal/platform/kill_workers.go
+++ b/audit/internal/platform/kill_workers.go
@@ -0,0 +1,64 @@
+package platform
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"syscall"
+)
+
+// workerPatterns are substrings that identify bee test worker processes to be
+// killed by KillTestWorkers. They are compared against the first argument
+// (the executable) read from /proc/<pid>/cmdline, not against the whole
+// command line.
+var workerPatterns = []string{
+	"bee-gpu-burn",
+	"stress-ng",
+	"stressapptest",
+	"memtester",
+}
+
+// KilledProcess describes a process that was sent SIGKILL by KillTestWorkers.
+// PID is the process ID taken from its /proc entry and Name is the base name
+// of its executable (the first cmdline argument without any directory part).
+type KilledProcess struct {
+	PID  int    `json:"pid"`
+	Name string `json:"name"`
+}
+
+// KillTestWorkers scans /proc for running test worker processes and sends
+// SIGKILL to each one found. It returns the processes for which the signal
+// was delivered successfully; nil is returned when /proc cannot be read.
+//
+// A process is a worker when the base name of its executable (the first
+// cmdline argument with any directory prefix removed) contains one of
+// workerPatterns. Directory components are deliberately not matched, so an
+// unrelated binary that merely lives under a directory such as
+// "/opt/stress-ng/" is left alone. The calling process is never signalled.
+//
+// Errors for individual processes (e.g. already exited, not permitted) are
+// silently ignored and those processes are not included in the result.
+func KillTestWorkers() []KilledProcess {
+	entries, err := os.ReadDir("/proc")
+	if err != nil {
+		return nil
+	}
+
+	self := os.Getpid()
+	var killed []KilledProcess
+	for _, e := range entries {
+		if !e.IsDir() {
+			continue
+		}
+		pid, err := strconv.Atoi(e.Name())
+		if err != nil || pid == self {
+			continue
+		}
+		cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
+		if err != nil {
+			continue
+		}
+		// /proc/*/cmdline uses NUL bytes as argument separators.
+		args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
+		exe := strings.TrimSpace(args[0])
+		base := exe
+		if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
+			base = exe[idx+1:]
+		}
+		for _, pat := range workerPatterns {
+			if !strings.Contains(base, pat) {
+				continue
+			}
+			// Only report processes the signal actually reached.
+			if err := syscall.Kill(pid, syscall.SIGKILL); err == nil {
+				killed = append(killed, KilledProcess{PID: pid, Name: base})
+			}
+			break
+		}
+	}
+	return killed
+}
--- a/audit/internal/platform/live_metrics.go
+++ b/audit/internal/platform/live_metrics.go
@@ -2,7 +2,10 @@ package platform

 import (
 	"bufio"
+	"encoding/json"
 	"os"
+	"os/exec"
+	"sort"
 	"strconv"
 	"strings"
 	"time"
@@ -23,6 +26,7 @@ type LiveMetricSample struct {
 // TempReading is a named temperature sensor value.
 type TempReading struct {
 	Name    string  `json:"name"`
+	Group   string  `json:"group,omitempty"`
 	Celsius float64 `json:"celsius"`
 }

@@ -32,18 +36,22 @@ type TempReading struct {
 func SampleLiveMetrics() LiveMetricSample {
 	s := LiveMetricSample{Timestamp: time.Now().UTC()}

-	// GPU metrics — skipped silently if nvidia-smi unavailable
-	gpus, _ := SampleGPUMetrics(nil)
-	s.GPUs = gpus
+	// GPU metrics — try NVIDIA first, fall back to AMD
+	if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
+		s.GPUs = gpus
+	} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
+		s.GPUs = amdGPUs
+	}

 	// Fan speeds — skipped silently if ipmitool unavailable
 	fans, _ := sampleFanSpeeds()
 	s.Fans = fans

-	// CPU/system temperature — returns 0 if unavailable
-	cpuTemp := sampleCPUMaxTemp()
-	if cpuTemp > 0 {
-		s.Temps = append(s.Temps, TempReading{Name: "CPU", Celsius: cpuTemp})
+	s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
+	if !hasTempGroup(s.Temps, "cpu") {
+		if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
+			s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
+		}
 	}

 	// System power — returns 0 if unavailable
@@ -60,18 +68,20 @@ func SampleLiveMetrics() LiveMetricSample {

 // sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
 // the overall CPU utilisation percentage.
-var cpuStatPrev [2]uint64 // [total, idle]
-
 func sampleCPULoadPct() float64 {
-	total, idle := readCPUStat()
-	if total == 0 {
+	total0, idle0 := readCPUStat()
+	if total0 == 0 {
 		return 0
 	}
-	prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
-	cpuStatPrev = [2]uint64{total, idle}
-	if prevTotal == 0 {
+	time.Sleep(200 * time.Millisecond)
+	total1, idle1 := readCPUStat()
+	if total1 == 0 {
 		return 0
 	}
+	return cpuLoadPctBetween(total0, idle0, total1, idle1)
+}
+
+func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
 	dt := float64(total - prevTotal)
 	di := float64(idle - prevIdle)
 	if dt <= 0 {
@@ -137,3 +147,182 @@ func sampleMemLoadPct() float64 {
 	used := total - avail
 	return float64(used) / float64(total) * 100
 }
+
+// hasTempGroup reports whether any reading in temps belongs to the given
+// group. The comparison is an exact, case-sensitive string match.
+func hasTempGroup(temps []TempReading, group string) bool {
+	for i := range temps {
+		if temps[i].Group != group {
+			continue
+		}
+		return true
+	}
+	return false
+}
+
+// sampleLiveTemperatureReadings returns the current temperature readings,
+// preferring "sensors -j" and falling back to ipmitool only when the former
+// yields no readings at all.
+func sampleLiveTemperatureReadings() []TempReading {
+	fromSensors := sampleLiveTempsViaSensorsJSON()
+	if len(fromSensors) == 0 {
+		return sampleLiveTempsViaIPMI()
+	}
+	return fromSensors
+}
+
+// sampleLiveTempsViaSensorsJSON collects temperature readings from the JSON
+// output of "sensors -j". Chips and their features are visited in sorted
+// order so the result is deterministic. A feature contributes a reading when
+// it is a JSON object with a temp*_input value in the range (0, 150]; GPU
+// sensors are skipped, ambient sensors get the chip name added to their
+// label, and duplicate group/label pairs are dropped. It returns nil when the
+// command fails, prints nothing, or its output is not valid JSON.
+func sampleLiveTempsViaSensorsJSON() []TempReading {
+	raw, err := exec.Command("sensors", "-j").Output()
+	if err != nil || len(raw) == 0 {
+		return nil
+	}
+
+	var report map[string]map[string]any
+	if err := json.Unmarshal(raw, &report); err != nil {
+		return nil
+	}
+
+	chipNames := make([]string, 0, len(report))
+	for chipName := range report {
+		chipNames = append(chipNames, chipName)
+	}
+	sort.Strings(chipNames)
+
+	readings := make([]TempReading, 0, len(chipNames))
+	emitted := map[string]struct{}{}
+	for _, chipName := range chipNames {
+		entries := report[chipName]
+		entryNames := make([]string, 0, len(entries))
+		for entryName := range entries {
+			entryNames = append(entryNames, entryName)
+		}
+		sort.Strings(entryNames)
+
+		for _, entryName := range entryNames {
+			// "Adapter" is chip metadata, not a sensor feature.
+			if strings.EqualFold(entryName, "Adapter") {
+				continue
+			}
+			fields, isObject := entries[entryName].(map[string]any)
+			if !isObject {
+				continue
+			}
+			celsius, found := firstTempInputValue(fields)
+			if !found || celsius <= 0 || celsius > 150 {
+				continue
+			}
+			group := classifyLiveTempGroup(chipName, entryName)
+			label := strings.TrimSpace(entryName)
+			if group == "gpu" || label == "" {
+				continue
+			}
+			if group == "ambient" {
+				label = compactAmbientTempName(chipName, label)
+			}
+			dedupeKey := group + "\x00" + label
+			if _, dup := emitted[dedupeKey]; dup {
+				continue
+			}
+			emitted[dedupeKey] = struct{}{}
+			readings = append(readings, TempReading{Name: label, Group: group, Celsius: celsius})
+		}
+	}
+	return readings
+}
+
+// sampleLiveTempsViaIPMI collects temperature readings from
+// "ipmitool sdr type Temperature". Readings outside (0, 150] are discarded,
+// GPU sensors are skipped, and duplicate group/label pairs are dropped. It
+// returns nil when ipmitool fails or prints nothing.
+//
+// Line parsing is delegated to parseIPMITemperatureLine because the reading
+// is not in a fixed column: "sdr type" prints
+// "name | id | status | entity | 38 degrees C", while other ipmitool listings
+// use "name | 38 degrees C | ok" or "name | 38.000 | degrees C | ok".
+func sampleLiveTempsViaIPMI() []TempReading {
+	out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
+	if err != nil || len(out) == 0 {
+		return nil
+	}
+	var temps []TempReading
+	seen := map[string]struct{}{}
+	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
+		name, value, ok := parseIPMITemperatureLine(line)
+		if !ok || value <= 0 || value > 150 {
+			continue
+		}
+		group := classifyLiveTempGroup("", name)
+		if group == "gpu" {
+			continue
+		}
+		label := name
+		if group == "ambient" {
+			label = compactAmbientTempName("", label)
+		}
+		key := group + "\x00" + label
+		if _, ok := seen[key]; ok {
+			continue
+		}
+		seen[key] = struct{}{}
+		temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
+	}
+	return temps
+}
+
+// parseIPMITemperatureLine extracts the sensor name and the numeric reading
+// from one pipe-separated ipmitool line. The first column is the name. The
+// reading is taken from the first later column that mentions "degrees":
+// either the number in front of that word, or — when the column holds only
+// the unit — the column immediately before it. ok is false for lines with
+// fewer than three columns, an empty name, no "degrees" column, or a value
+// that is not a number (for example "na").
+func parseIPMITemperatureLine(line string) (name string, celsius float64, ok bool) {
+	parts := strings.Split(line, "|")
+	if len(parts) < 3 {
+		return "", 0, false
+	}
+	name = strings.TrimSpace(parts[0])
+	if name == "" {
+		return "", 0, false
+	}
+	for i := 1; i < len(parts); i++ {
+		field := strings.ToLower(strings.TrimSpace(parts[i]))
+		pos := strings.Index(field, "degrees")
+		if pos < 0 {
+			continue
+		}
+		raw := strings.TrimSpace(field[:pos])
+		if raw == "" {
+			// Unit-only column: the value sits in the preceding column.
+			raw = strings.TrimSpace(parts[i-1])
+		}
+		value, err := strconv.ParseFloat(raw, 64)
+		if err != nil {
+			return "", 0, false
+		}
+		return name, value, true
+	}
+	return "", 0, false
+}
+
+// firstTempInputValue returns the value of the first temperature input found
+// in a feature object. Candidate keys are those whose lower-cased name
+// contains "temp" and ends in "_input"; they are tried in sorted order. A
+// candidate yields a result when its value is a float64, or a string that
+// parses as a float; any other value is skipped. The second result is false
+// when no candidate produces a number.
+func firstTempInputValue(feature map[string]any) (float64, bool) {
+	var candidates []string
+	for key := range feature {
+		lower := strings.ToLower(key)
+		if strings.Contains(lower, "temp") && strings.HasSuffix(lower, "_input") {
+			candidates = append(candidates, key)
+		}
+	}
+	sort.Strings(candidates)
+
+	for _, key := range candidates {
+		if num, isNum := feature[key].(float64); isNum {
+			return num, true
+		}
+		if text, isText := feature[key].(string); isText {
+			if parsed, err := strconv.ParseFloat(text, 64); err == nil {
+				return parsed, true
+			}
+		}
+	}
+	return 0, false
+}
+
+// classifyLiveTempGroup assigns a sensor to the "gpu", "cpu" or "ambient"
+// group by searching the lower-cased "chip name" text for known keywords.
+// GPU keywords are checked first, then CPU keywords; anything else is
+// "ambient".
+func classifyLiveTempGroup(chip, name string) string {
+	haystack := strings.ToLower(strings.TrimSpace(chip + " " + name))
+	containsAny := func(keywords ...string) bool {
+		for _, kw := range keywords {
+			if strings.Contains(haystack, kw) {
+				return true
+			}
+		}
+		return false
+	}
+
+	if containsAny("gpu", "amdgpu", "nvidia", "adeon") {
+		return "gpu"
+	}
+	if containsAny(
+		"coretemp",
+		"k10temp",
+		"zenpower",
+		"package id",
+		"x86_pkg_temp",
+		"tctl",
+		"tdie",
+		"tccd",
+		"cpu",
+		"peci",
+	) {
+		return "cpu"
+	}
+	return "ambient"
+}
+
+// compactAmbientTempName builds the display label for an ambient sensor. Both
+// inputs are trimmed. The name is returned unchanged when the chip is empty,
+// equals the name (ignoring case), or already appears inside the name
+// (ignoring case); otherwise the result is "chip / name".
+func compactAmbientTempName(chip, name string) string {
+	chip = strings.TrimSpace(chip)
+	name = strings.TrimSpace(name)
+	switch {
+	case chip == "", strings.EqualFold(chip, name):
+		return name
+	case strings.Contains(strings.ToLower(name), strings.ToLower(chip)):
+		return name
+	}
+	return chip + " / " + name
+}
--- a/audit/internal/platform/live_metrics_test.go
+++ b/audit/internal/platform/live_metrics_test.go
@@ -0,0 +1,94 @@
+package platform
+
+import "testing"
+
+// TestFirstTempInputValue checks that the *_input reading is picked over the
+// sibling *_max entry of the same feature.
+func TestFirstTempInputValue(t *testing.T) {
+	const want = 61.5
+	got, ok := firstTempInputValue(map[string]any{
+		"temp1_input": want,
+		"temp1_max":   80.0,
+	})
+	if !ok {
+		t.Fatal("expected value")
+	}
+	if got != want {
+		t.Fatalf("got %v want 61.5", got)
+	}
+}
+
+// TestClassifyLiveTempGroup covers one CPU chip, one GPU chip and two chips
+// that must fall through to the ambient group.
+func TestClassifyLiveTempGroup(t *testing.T) {
+	type sensorCase struct {
+		chip string
+		name string
+		want string
+	}
+	cases := []sensorCase{
+		{chip: "coretemp-isa-0000", name: "Package id 0", want: "cpu"},
+		{chip: "amdgpu-pci-4300", name: "edge", want: "gpu"},
+		{chip: "nvme-pci-0100", name: "Composite", want: "ambient"},
+		{chip: "acpitz-acpi-0", name: "temp1", want: "ambient"},
+	}
+	for i := range cases {
+		c := cases[i]
+		got := classifyLiveTempGroup(c.chip, c.name)
+		if got != c.want {
+			t.Fatalf("classifyLiveTempGroup(%q,%q)=%q want %q", c.chip, c.name, got, c.want)
+		}
+	}
+}
+
+// TestCompactAmbientTempName checks the "chip / name" join and the
+// empty-chip passthrough.
+func TestCompactAmbientTempName(t *testing.T) {
+	cases := []struct{ chip, name, want string }{
+		{"nvme-pci-0100", "Composite", "nvme-pci-0100 / Composite"},
+		{"", "Inlet Temp", "Inlet Temp"},
+	}
+	for _, c := range cases {
+		if got := compactAmbientTempName(c.chip, c.name); got != c.want {
+			t.Fatalf("got %q", got)
+		}
+	}
+}
+
+// TestCPULoadPctBetween feeds pairs of (total, idle) counters to
+// cpuLoadPctBetween and checks the resulting busy percentage, including the
+// no-progress case and the case where the idle delta exceeds the total delta.
+func TestCPULoadPctBetween(t *testing.T) {
+	type counters struct {
+		name      string
+		prevTotal uint64
+		prevIdle  uint64
+		total     uint64
+		idle      uint64
+		want      float64
+	}
+	cases := []counters{
+		{name: "busy half", prevTotal: 100, prevIdle: 40, total: 200, idle: 90, want: 50},
+		{name: "fully busy", prevTotal: 100, prevIdle: 40, total: 200, idle: 40, want: 100},
+		{name: "no progress", prevTotal: 100, prevIdle: 40, total: 100, idle: 40, want: 0},
+		{name: "idle delta larger than total clamps to zero", prevTotal: 100, prevIdle: 40, total: 200, idle: 150, want: 0},
+	}
+
+	for i := range cases {
+		c := cases[i]
+		got := cpuLoadPctBetween(c.prevTotal, c.prevIdle, c.total, c.idle)
+		if got != c.want {
+			t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", c.name, got, c.want)
+		}
+	}
+}
--- a/audit/internal/platform/network.go
+++ b/audit/internal/platform/network.go
@@ -2,6 +2,7 @@ package platform

 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"os"
 	"os/exec"
@@ -18,21 +19,17 @@ func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
 	out := make([]InterfaceInfo, 0, len(names))
 	for _, name := range names {
 		state := "unknown"
-		if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
-			fields := strings.Fields(string(raw))
-			if len(fields) >= 9 {
-				state = fields[8]
+		if up, err := interfaceAdminState(name); err == nil {
+			if up {
+				state = "up"
+			} else {
+				state = "down"
 			}
 		}

-		var ipv4 []string
-		if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
-			for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
-				fields := strings.Fields(line)
-				if len(fields) >= 4 {
-					ipv4 = append(ipv4, fields[3])
-				}
-			}
+		ipv4, err := interfaceIPv4Addrs(name)
+		if err != nil {
+			ipv4 = nil
 		}

 		out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
@@ -55,6 +52,119 @@ func (s *System) DefaultRoute() string {
 	return ""
 }

+// CaptureNetworkSnapshot records the current network configuration: for
+// every interface its admin state and IPv4 addresses, plus the default
+// routes reported by "ip route show default" and the contents of
+// /etc/resolv.conf.
+//
+// Failing to list interfaces or to query any single interface aborts the
+// capture and returns an empty snapshot with the error. Failures reading the
+// default routes or resolv.conf are tolerated; the corresponding fields are
+// simply left empty.
+func (s *System) CaptureNetworkSnapshot() (NetworkSnapshot, error) {
+	ifaceNames, err := listInterfaceNames()
+	if err != nil {
+		return NetworkSnapshot{}, err
+	}
+
+	ifaces := make([]NetworkInterfaceSnapshot, 0, len(ifaceNames))
+	for _, ifaceName := range ifaceNames {
+		adminUp, err := interfaceAdminState(ifaceName)
+		if err != nil {
+			return NetworkSnapshot{}, err
+		}
+		addrs, err := interfaceIPv4Addrs(ifaceName)
+		if err != nil {
+			return NetworkSnapshot{}, err
+		}
+		ifaces = append(ifaces, NetworkInterfaceSnapshot{
+			Name: ifaceName,
+			Up:   adminUp,
+			IPv4: addrs,
+		})
+	}
+	result := NetworkSnapshot{Interfaces: ifaces}
+
+	routesRaw, routeErr := exec.Command("ip", "route", "show", "default").Output()
+	if routeErr == nil {
+		for _, entry := range strings.Split(strings.TrimSpace(string(routesRaw)), "\n") {
+			if entry = strings.TrimSpace(entry); entry != "" {
+				result.DefaultRoutes = append(result.DefaultRoutes, entry)
+			}
+		}
+	}
+
+	if contents, readErr := os.ReadFile("/etc/resolv.conf"); readErr == nil {
+		result.ResolvConf = string(contents)
+	}
+
+	return result, nil
+}
+
+// RestoreNetworkSnapshot re-applies a previously captured NetworkSnapshot.
+//
+// For every interface in the snapshot it brings the link up, flushes its
+// IPv4 addresses, re-adds the recorded IPv4 CIDRs and finally sets the
+// recorded admin state. It then removes the current default routes, re-adds
+// the recorded ones and rewrites /etc/resolv.conf with the recorded contents.
+//
+// Restoration is best-effort: a failing step is recorded and the remaining
+// steps still run. All failures are returned as one error whose message joins
+// them with "; "; a nil result means every step succeeded.
+func (s *System) RestoreNetworkSnapshot(snapshot NetworkSnapshot) error {
+	// Upper bound on "ip route del default" attempts so a misbehaving ip
+	// binary can never keep this loop spinning.
+	const maxDefaultRouteDeletes = 32
+
+	var errs []string
+
+	for _, iface := range snapshot.Interfaces {
+		// Addresses can only be manipulated reliably on a link that is up.
+		if err := exec.Command("ip", "link", "set", "dev", iface.Name, "up").Run(); err != nil {
+			errs = append(errs, fmt.Sprintf("%s: bring up before restore: %v", iface.Name, err))
+			continue
+		}
+		// The snapshot only records IPv4 addresses, so only the IPv4 family is
+		// flushed; a family-less flush would also drop IPv6 addresses that
+		// nothing below puts back.
+		if err := exec.Command("ip", "-4", "addr", "flush", "dev", iface.Name).Run(); err != nil {
+			errs = append(errs, fmt.Sprintf("%s: flush addresses: %v", iface.Name, err))
+		}
+		for _, cidr := range iface.IPv4 {
+			if raw, err := exec.Command("ip", "addr", "add", cidr, "dev", iface.Name).CombinedOutput(); err != nil {
+				detail := strings.TrimSpace(string(raw))
+				if detail != "" {
+					errs = append(errs, fmt.Sprintf("%s: restore address %s: %v: %s", iface.Name, cidr, err, detail))
+				} else {
+					errs = append(errs, fmt.Sprintf("%s: restore address %s: %v", iface.Name, cidr, err))
+				}
+			}
+		}
+		state := "down"
+		if iface.Up {
+			state = "up"
+		}
+		if err := exec.Command("ip", "link", "set", "dev", iface.Name, state).Run(); err != nil {
+			errs = append(errs, fmt.Sprintf("%s: restore state %s: %v", iface.Name, state, err))
+		}
+	}
+
+	// Several default routes may exist (for example one per configured NIC)
+	// and each "ip route del default" removes a single one, so repeat until ip
+	// exits non-zero, which is what it does once no default route is left.
+	for i := 0; i < maxDefaultRouteDeletes; i++ {
+		err := exec.Command("ip", "route", "del", "default").Run()
+		if err == nil {
+			continue
+		}
+		// A non-zero exit just means "nothing to delete"; only failures to
+		// run ip at all are reported.
+		var exitErr *exec.ExitError
+		if !errors.As(err, &exitErr) {
+			errs = append(errs, fmt.Sprintf("clear default route: %v", err))
+		}
+		break
+	}
+	for _, route := range snapshot.DefaultRoutes {
+		fields := strings.Fields(route)
+		if len(fields) == 0 {
+			continue
+		}
+		// Strip state flags that ip-route(8) does not accept as add arguments.
+		filtered := fields[:0]
+		for _, f := range fields {
+			switch f {
+			case "linkdown", "dead", "onlink", "pervasive":
+				// skip
+			default:
+				filtered = append(filtered, f)
+			}
+		}
+		args := append([]string{"route", "add"}, filtered...)
+		if raw, err := exec.Command("ip", args...).CombinedOutput(); err != nil {
+			detail := strings.TrimSpace(string(raw))
+			if detail != "" {
+				errs = append(errs, fmt.Sprintf("restore route %q: %v: %s", route, err, detail))
+			} else {
+				errs = append(errs, fmt.Sprintf("restore route %q: %v", route, err))
+			}
+		}
+	}
+
+	// NOTE(review): an empty ResolvConf (for example when capture could not
+	// read the file) truncates /etc/resolv.conf here — confirm that is intended.
+	if err := os.WriteFile("/etc/resolv.conf", []byte(snapshot.ResolvConf), 0644); err != nil {
+		errs = append(errs, fmt.Sprintf("restore resolv.conf: %v", err))
+	}
+
+	if len(errs) > 0 {
+		return errors.New(strings.Join(errs, "; "))
+	}
+	return nil
+}
+
 func (s *System) DHCPOne(iface string) (string, error) {
 	var out bytes.Buffer
 	if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
@@ -131,6 +241,65 @@ func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) {
 	return out.String(), nil
 }

+// SetInterfaceState sets the admin state of a network interface by running
+// "ip link set dev <iface> up" or "... down" and returns that command's error.
+func (s *System) SetInterfaceState(iface string, up bool) error {
+	target := map[bool]string{true: "up", false: "down"}[up]
+	return exec.Command("ip", "link", "set", "dev", iface, target).Run()
+}
+
+// GetInterfaceState reports whether the interface carries the UP flag in
+// "ip link show" output; it is a thin wrapper over interfaceAdminState.
+func (s *System) GetInterfaceState(iface string) (bool, error) {
+	up, err := interfaceAdminState(iface)
+	return up, err
+}
+
+// interfaceAdminState runs "ip -o link show dev <iface>" and parses the flag
+// list of its output. It returns the command error unchanged when ip fails.
+func interfaceAdminState(iface string) (bool, error) {
+	output, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
+	if err == nil {
+		return parseInterfaceAdminState(string(output))
+	}
+	return false, err
+}
+
+// parseInterfaceAdminState extracts the comma-separated flag list between
+// the first '<' and the following '>' of an "ip link" line and reports
+// whether it contains the UP flag. The operational "state ..." field is not
+// consulted. An error is returned when either delimiter is missing.
+func parseInterfaceAdminState(raw string) (bool, error) {
+	_, afterOpen, found := strings.Cut(raw, "<")
+	if !found {
+		return false, fmt.Errorf("ip link output missing flags")
+	}
+	flagList, _, found := strings.Cut(afterOpen, ">")
+	if !found {
+		return false, fmt.Errorf("ip link output missing flag terminator")
+	}
+	for _, name := range strings.Split(flagList, ",") {
+		if strings.TrimSpace(name) == "UP" {
+			return true, nil
+		}
+	}
+	return false, nil
+}
+
+// interfaceIPv4Addrs lists the IPv4 addresses of an interface by taking the
+// fourth whitespace-separated field of every line printed by
+// "ip -o -4 addr show dev <iface>".
+//
+// A non-zero exit of ip is treated as "no addresses" (nil, nil); any other
+// failure to run the command is returned as an error.
+func interfaceIPv4Addrs(iface string) ([]string, error) {
+	output, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", iface).Output()
+	if err != nil {
+		if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) {
+			return nil, nil
+		}
+		return nil, err
+	}
+
+	var addrs []string
+	for _, entry := range strings.Split(strings.TrimSpace(string(output)), "\n") {
+		if cols := strings.Fields(entry); len(cols) >= 4 {
+			addrs = append(addrs, cols[3])
+		}
+	}
+	return addrs, nil
+}
+
 func listInterfaceNames() ([]string, error) {
 	raw, err := exec.Command("ip", "-o", "link", "show").Output()
 	if err != nil {
--- a/audit/internal/platform/network_test.go
+++ b/audit/internal/platform/network_test.go
@@ -0,0 +1,46 @@
+package platform
+
+import "testing"
+
+// TestParseInterfaceAdminState verifies that the admin UP flag is read from
+// the <...> flag list rather than from the operational "state" field, and
+// that output without a flag list is rejected.
+func TestParseInterfaceAdminState(t *testing.T) {
+	type parseCase struct {
+		name    string
+		raw     string
+		want    bool
+		wantErr bool
+	}
+	cases := []parseCase{
+		{
+			name: "admin up with no carrier",
+			raw:  "2: enp1s0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN mode DEFAULT group default qlen 1000\n",
+			want: true,
+		},
+		{
+			name: "admin down",
+			raw:  "2: enp1s0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000\n",
+			want: false,
+		},
+		{
+			name:    "malformed output",
+			raw:     "2: enp1s0: mtu 1500 state DOWN\n",
+			wantErr: true,
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := parseInterfaceAdminState(tc.raw)
+			switch {
+			case tc.wantErr && err == nil:
+				t.Fatal("expected error")
+			case tc.wantErr:
+				return
+			case err != nil:
+				t.Fatalf("unexpected error: %v", err)
+			case got != tc.want:
+				t.Fatalf("got %v want %v", got, tc.want)
+			}
+		})
+	}
+}
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -0,0 +1,203 @@
+package platform
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// RunNvidiaStressPack runs the NVIDIA stress acceptance pack: two nvidia-smi
+// inventory jobs, the loader-specific stress job, and a final nvidia-smi
+// query of per-GPU temperature, power, utilization and memory.
+//
+// opts is normalized first. The call fails early, before anything is
+// executed, when the stress job cannot be built. The jobs are handed to
+// runAcceptancePackCtx, whose result is returned unchanged.
+func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
+	normalizeNvidiaStressOptions(&opts)
+
+	stressJob, err := buildNvidiaStressJob(opts)
+	if err != nil {
+		return "", err
+	}
+
+	jobs := []satJob{
+		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
+		stressJob,
+		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	}
+	prefix := nvidiaStressArchivePrefix(opts.Loader)
+	return runAcceptancePackCtx(ctx, baseDir, prefix, jobs, logFunc)
+}
+
+// nvidiaStressArchivePrefix maps a loader name (trimmed, case-insensitive)
+// to the prefix passed to runAcceptancePackCtx. Anything that is not the
+// John or NCCL loader uses the "gpu-nvidia-burn" prefix.
+func nvidiaStressArchivePrefix(loader string) string {
+	normalized := strings.TrimSpace(strings.ToLower(loader))
+	if normalized == NvidiaStressLoaderJohn {
+		return "gpu-nvidia-john"
+	}
+	if normalized == NvidiaStressLoaderNCCL {
+		return "gpu-nvidia-nccl"
+	}
+	return "gpu-nvidia-burn"
+}
+
+// buildNvidiaStressJob builds the satJob that runs the selected stress
+// loader on the resolved set of GPUs.
+//
+// The GPU selection is resolved first, so selection errors take precedence
+// over an unknown loader. Every loader receives "--seconds <DurationSec>";
+// the builtin loader additionally receives "--size-mb <SizeMB>". When the
+// selection is non-empty it is appended as "--devices <comma list>".
+func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
+	gpus, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
+	if err != nil {
+		return satJob{}, err
+	}
+
+	var (
+		logName string
+		argv    []string
+	)
+	seconds := strconv.Itoa(opts.DurationSec)
+	switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
+	case "", NvidiaStressLoaderBuiltin:
+		logName = "03-bee-gpu-burn.log"
+		argv = []string{"bee-gpu-burn", "--seconds", seconds, "--size-mb", strconv.Itoa(opts.SizeMB)}
+	case NvidiaStressLoaderJohn:
+		logName = "03-john-gpu-stress.log"
+		argv = []string{"bee-john-gpu-stress", "--seconds", seconds}
+	case NvidiaStressLoaderNCCL:
+		logName = "03-bee-nccl-gpu-stress.log"
+		argv = []string{"bee-nccl-gpu-stress", "--seconds", seconds}
+	default:
+		return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
+	}
+
+	if len(gpus) > 0 {
+		argv = append(argv, "--devices", joinIndexList(gpus))
+	}
+	return satJob{
+		name:       logName,
+		cmd:        argv,
+		collectGPU: true,
+		gpuIndices: gpus,
+	}, nil
+}
+
+// normalizeNvidiaStressOptions fills in defaults and canonicalizes opts in
+// place: a non-positive duration becomes 300 seconds, the loader is mapped
+// to one of the known loader constants (unknown or empty values fall back
+// to the builtin loader), and both GPU index lists are deduplicated and
+// sorted.
+func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
+	if opts.DurationSec <= 0 {
+		opts.DurationSec = 300
+	}
+	// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
+	canonical := NvidiaStressLoaderBuiltin
+	switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
+	case "", NvidiaStressLoaderBuiltin:
+		// keep the builtin default
+	case NvidiaStressLoaderJohn:
+		canonical = NvidiaStressLoaderJohn
+	case NvidiaStressLoaderNCCL:
+		canonical = NvidiaStressLoaderNCCL
+	}
+	opts.Loader = canonical
+
+	opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
+	opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
+}
+
+// resolveNvidiaGPUSelection returns the sorted GPU indices to stress.
+//
+// It starts from the indices reported by listNvidiaGPUIndices, keeps only
+// those named in include (when include is non-empty) and then drops those
+// named in exclude. An error is returned when the GPU listing fails, when
+// no GPUs are present, or when the filters leave nothing selected.
+func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
+	available, err := listNvidiaGPUIndices()
+	if err != nil {
+		return nil, err
+	}
+	if len(available) == 0 {
+		return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
+	}
+
+	toSet := func(values []int) map[int]struct{} {
+		set := make(map[int]struct{}, len(values))
+		for _, v := range values {
+			set[v] = struct{}{}
+		}
+		return set
+	}
+	wanted, skipped := toSet(include), toSet(exclude)
+
+	chosen := make([]int, 0, len(available))
+	for _, idx := range available {
+		if len(include) > 0 {
+			if _, keep := wanted[idx]; !keep {
+				continue
+			}
+		}
+		if _, drop := skipped[idx]; drop {
+			continue
+		}
+		chosen = append(chosen, idx)
+	}
+	if len(chosen) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
+	}
+	sort.Ints(chosen)
+	return chosen, nil
+}
+
+// listNvidiaGPUIndices asks nvidia-smi for the index of every GPU and
+// returns them deduplicated and sorted. Blank lines and lines that are not
+// integers are ignored. A failure to run nvidia-smi is wrapped and returned.
+func listNvidiaGPUIndices() ([]int, error) {
+	raw, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia-smi: %w", err)
+	}
+
+	lines := strings.Split(strings.TrimSpace(string(raw)), "\n")
+	found := make([]int, 0, len(lines))
+	for _, entry := range lines {
+		field := strings.TrimSpace(entry)
+		if field == "" {
+			continue
+		}
+		if n, convErr := strconv.Atoi(field); convErr == nil {
+			found = append(found, n)
+		}
+	}
+	return dedupeSortedIndices(found), nil
+}
+
+// dedupeSortedIndices returns the non-negative values of the input in
+// ascending order with duplicates removed. An empty or nil input yields
+// nil; the input slice itself is never modified.
+func dedupeSortedIndices(values []int) []int {
+	if len(values) == 0 {
+		return nil
+	}
+
+	sorted := make([]int, 0, len(values))
+	for _, v := range values {
+		if v >= 0 {
+			sorted = append(sorted, v)
+		}
+	}
+	sort.Ints(sorted)
+
+	// Compact in place: equal values are adjacent after sorting.
+	unique := sorted[:0]
+	for _, v := range sorted {
+		if n := len(unique); n > 0 && unique[n-1] == v {
+			continue
+		}
+		unique = append(unique, v)
+	}
+	return unique
+}
+
+// joinIndexList renders the values as a comma-separated decimal list, for
+// example [0 2 5] becomes "0,2,5". An empty input yields "".
+func joinIndexList(values []int) string {
+	var sb strings.Builder
+	for i, v := range values {
+		if i > 0 {
+			sb.WriteByte(',')
+		}
+		sb.WriteString(strconv.Itoa(v))
+	}
+	return sb.String()
+}
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -0,0 +1,545 @@
+package platform
+
+import (
+	"archive/tar"
+	"bytes"
+	"compress/gzip"
+	"context"
+	"encoding/csv"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync"
+	"syscall"
+	"time"
+)
+
+// PlatformStressCycle defines one load+idle cycle.
+// Both durations are converted to context timeouts by RunPlatformStress.
+type PlatformStressCycle struct {
+	LoadSec int // seconds of simultaneous CPU+GPU stress
+	IdleSec int // seconds of idle monitoring after load cut
+}
+
+// PlatformStressOptions controls the thermal cycling test.
+// Cycles are executed in order; RunPlatformStress rejects an empty list.
+type PlatformStressOptions struct {
+	Cycles     []PlatformStressCycle
+	Components []string // if empty: run all; values: "cpu", "gpu"
+}
+
+// platformStressRow is one second of telemetry.
+// Rows are produced by sampleToPlatformRow and written out by
+// writePlatformCSV, one CSV record per row.
+type platformStressRow struct {
+	ElapsedSec   float64 // seconds since the start of the whole test
+	Cycle        int     // 1-based cycle number
+	Phase        string  // "load" | "idle"
+	CPULoadPct   float64 // CPU load copied from the live sample
+	MaxCPUTempC  float64 // highest reading in the "cpu" temperature group
+	MaxGPUTempC  float64 // highest of the "gpu" temperature group and per-GPU temps
+	SysPowerW    float64 // power reading copied from the live sample
+	FanMinRPM    float64 // slowest fan in the sample; 0 when no fans were reported
+	FanMaxRPM    float64 // fastest fan in the sample; 0 when no fans were reported
+	GPUThrottled bool    // NOTE(review): sampleToPlatformRow never sets this — confirm where it is populated
+}
+
+// RunPlatformStress runs repeated load+idle thermal cycling.
+// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
+// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
+//
+// Telemetry is sampled once per second in both phases, streamed to logFunc,
+// and written to metrics.csv; a per-cycle analysis is written to summary.txt.
+// Both files are packed into baseDir/platform-stress-<UTC stamp>.tar.gz, the
+// working directory is removed, and the archive path is returned.
+//
+// An error is returned when opts has no cycles, when a directory cannot be
+// created, or when the archive cannot be written. Cancelling ctx stops the
+// cycle loop early; whatever was collected is still summarized and archived.
+// logFunc may be nil.
+func (s *System) RunPlatformStress(
+	ctx context.Context,
+	baseDir string,
+	opts PlatformStressOptions,
+	logFunc func(string),
+) (string, error) {
+	if logFunc == nil {
+		logFunc = func(string) {}
+	}
+	if len(opts.Cycles) == 0 {
+		return "", fmt.Errorf("no cycles defined")
+	}
+	if err := os.MkdirAll(baseDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
+	}
+
+	// The same UTC stamp names both the working directory and the archive.
+	stamp := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir run dir: %w", err)
+	}
+
+	// An empty component list means "stress everything".
+	hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
+	hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
+
+	vendor := s.DetectGPUVendor()
+	logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
+
+	var rows []platformStressRow
+	start := time.Now()
+
+	var analyses []cycleAnalysis
+
+	for i, cycle := range opts.Cycles {
+		if ctx.Err() != nil {
+			break
+		}
+		cycleNum := i + 1
+		logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
+
+		// ── LOAD PHASE ───────────────────────────────────────────────────────
+		// loadCtx bounds both the stress processes and the sampling loop.
+		loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
+		var wg sync.WaitGroup
+
+		// CPU stress
+		if hasCPU {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				cpuCmd, err := buildCPUStressCmd(loadCtx)
+				if err != nil {
+					logFunc("CPU stress: " + err.Error())
+					return
+				}
+				_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
+			}()
+		}
+
+		// GPU stress
+		if hasGPU {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				gpuCmd := buildGPUStressCmd(loadCtx, vendor)
+				if gpuCmd == nil {
+					// No GPU stress tool for this vendor; the cycle continues CPU-only.
+					return
+				}
+				_ = gpuCmd.Wait()
+			}()
+		}
+
+		// Sample on the calling goroutine until loadCtx expires, then make sure
+		// the stress processes are gone before the idle phase starts.
+		loadRows := collectPhase(loadCtx, cycleNum, "load", start)
+		for _, r := range loadRows {
+			logFunc(formatPlatformRow(r))
+		}
+		rows = append(rows, loadRows...)
+		loadCancel()
+		wg.Wait()
+
+		if len(loadRows) > 0 {
+			logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
+		}
+
+		// ── IDLE PHASE ───────────────────────────────────────────────────────
+		idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
+		idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
+		for _, r := range idleRows {
+			logFunc(formatPlatformRow(r))
+		}
+		rows = append(rows, idleRows...)
+		idleCancel()
+
+		// Per-cycle analysis
+		an := analyzePlatformCycle(loadRows, idleRows)
+		analyses = append(analyses, an)
+		logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
+			cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
+	}
+
+	// Write CSV
+	// NOTE(review): a write failure here is ignored, so the archive can
+	// silently lack metrics.csv — confirm that best-effort is intended.
+	csvData := writePlatformCSV(rows)
+	_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
+
+	// Write summary
+	summary := writePlatformSummary(opts, analyses)
+	logFunc("--- Summary ---")
+	for _, line := range strings.Split(summary, "\n") {
+		if line != "" {
+			logFunc(line)
+		}
+	}
+	_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
+
+	// Pack tar.gz
+	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
+	if err := packPlatformDir(runDir, archivePath); err != nil {
+		return "", fmt.Errorf("pack archive: %w", err)
+	}
+	// The working directory is only removed once the archive exists.
+	_ = os.RemoveAll(runDir)
+	return archivePath, nil
+}
+
+// collectPhase gathers one telemetry row per one-second tick until ctx is
+// done and returns the rows collected so far. The result is nil when ctx
+// ends before the first tick.
+func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
+	tick := time.NewTicker(time.Second)
+	defer tick.Stop()
+
+	var collected []platformStressRow
+	for {
+		select {
+		case <-tick.C:
+			collected = append(collected, sampleToPlatformRow(SampleLiveMetrics(), cycle, phase, testStart))
+		case <-ctx.Done():
+			return collected
+		}
+	}
+}
+
+// sampleToPlatformRow condenses one live metric sample into a telemetry row.
+// It stamps the row with the time elapsed since testStart, copies CPU load
+// and power, takes the maximum over the "cpu" and "gpu" temperature groups
+// (the GPU maximum also considers the per-GPU temperatures), and records the
+// slowest and fastest fan. Fan fields stay zero when the sample has no fans.
+func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
+	row := platformStressRow{
+		ElapsedSec: time.Since(testStart).Seconds(),
+		Cycle:      cycle,
+		Phase:      phase,
+		CPULoadPct: s.CPULoadPct,
+		SysPowerW:  s.PowerW,
+	}
+
+	// raise lifts *peak to v when v is strictly greater.
+	raise := func(peak *float64, v float64) {
+		if v > *peak {
+			*peak = v
+		}
+	}
+	for _, temp := range s.Temps {
+		if temp.Group == "cpu" {
+			raise(&row.MaxCPUTempC, temp.Celsius)
+		} else if temp.Group == "gpu" {
+			raise(&row.MaxGPUTempC, temp.Celsius)
+		}
+	}
+	for _, gpu := range s.GPUs {
+		raise(&row.MaxGPUTempC, gpu.TempC)
+	}
+
+	// The first fan seeds both bounds; later fans widen them.
+	for i, fan := range s.Fans {
+		if i == 0 || fan.RPM < row.FanMinRPM {
+			row.FanMinRPM = fan.RPM
+		}
+		if i == 0 || fan.RPM > row.FanMaxRPM {
+			row.FanMaxRPM = fan.RPM
+		}
+	}
+	return row
+}
+
+// formatPlatformRow renders a telemetry row as a single log line. The fan
+// range is only included when a minimum fan speed was recorded, and the
+// " THROTTLE" suffix only when the row is flagged as throttled.
+func formatPlatformRow(r platformStressRow) string {
+	var fanPart, throttlePart string
+	if r.FanMinRPM > 0 {
+		fanPart = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
+	}
+	if r.GPUThrottled {
+		throttlePart = " THROTTLE"
+	}
+	return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
+		r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fanPart, throttlePart)
+}
+
+// analyzePlatformCycle summarizes one cycle from its load and idle rows.
+//
+// From the load rows it takes the peak CPU temperature, GPU temperature and
+// power, and whether any row was throttled. The fan speed "at cut" is the
+// mean of (min+max)/2 over the last five load rows that have a fan reading.
+// From the idle rows recorded within 15 seconds of the last load row it
+// takes the lowest positive (min+max)/2 fan speed, and reports the drop
+// relative to the at-cut speed as a percentage.
+func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
+	var result cycleAnalysis
+
+	// Peaks and throttle flag across the whole load phase.
+	for i := range loadRows {
+		row := loadRows[i]
+		if row.MaxCPUTempC > result.maxCPUTemp {
+			result.maxCPUTemp = row.MaxCPUTempC
+		}
+		if row.MaxGPUTempC > result.maxGPUTemp {
+			result.maxGPUTemp = row.MaxGPUTempC
+		}
+		if row.SysPowerW > result.maxPower {
+			result.maxPower = row.SysPowerW
+		}
+		result.throttled = result.throttled || row.GPUThrottled
+	}
+
+	// Fan RPM at cut = avg of last 5 load rows
+	tailStart := len(loadRows) - 5
+	if tailStart < 0 {
+		tailStart = 0
+	}
+	var (
+		total   float64
+		samples int
+	)
+	for _, row := range loadRows[tailStart:] {
+		if row.FanMinRPM > 0 {
+			total += (row.FanMinRPM + row.FanMaxRPM) / 2
+			samples++
+		}
+	}
+	if samples > 0 {
+		result.fanAtCutAvg = total / float64(samples)
+	}
+
+	// Fan RPM min in first 15s of idle
+	result.fanMin15s = result.fanAtCutAvg
+	cutAt := 0.0
+	if last := len(loadRows) - 1; last >= 0 {
+		cutAt = loadRows[last].ElapsedSec
+	}
+	deadline := cutAt + 15
+	for _, row := range idleRows {
+		if row.ElapsedSec > deadline {
+			break
+		}
+		mid := (row.FanMinRPM + row.FanMaxRPM) / 2
+		if !(mid > 0) {
+			continue
+		}
+		if result.fanMin15s == 0 || mid < result.fanMin15s {
+			result.fanMin15s = mid
+		}
+	}
+
+	if result.fanAtCutAvg > 0 {
+		result.fanDropPct = (result.fanAtCutAvg - result.fanMin15s) / result.fanAtCutAvg * 100
+	}
+	return result
+}
+
+// cycleAnalysis is the per-cycle result computed by analyzePlatformCycle
+// and rendered by writePlatformSummary.
+type cycleAnalysis struct {
+	maxCPUTemp  float64 // peak CPU temperature during load, °C
+	maxGPUTemp  float64 // peak GPU temperature during load, °C
+	maxPower    float64 // peak system power during load, W
+	throttled   bool    // true when any load row was flagged as throttled
+	fanAtCutAvg float64 // mean fan RPM over the last load rows; 0 when no fan data
+	fanMin15s   float64 // lowest fan RPM seen in the first 15s of idle
+	fanDropPct  float64 // drop from fanAtCutAvg to fanMin15s, in percent
+}
+
+// writePlatformSummary renders the human-readable report: one section per
+// analysed cycle followed by an overall verdict. The verdict is FAIL when
+// any cycle was throttled, otherwise WARN when any cycle showed a fan drop
+// above 20% within 15 seconds, otherwise PASS. analyses[i] is reported
+// against opts.Cycles[i].
+func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
+	var (
+		out           strings.Builder
+		throttleCount int
+		fanWarnCount  int
+	)
+	divider := strings.Repeat("=", 48)
+	planned := len(opts.Cycles)
+
+	fmt.Fprintf(&out, "Platform Thermal Cycling — %d cycle(s)\n", planned)
+	fmt.Fprintf(&out, "%s\n\n", divider)
+
+	for idx := range analyses {
+		result := analyses[idx]
+		plan := opts.Cycles[idx]
+		fmt.Fprintf(&out, "Cycle %d/%d (load=%ds, idle=%ds)\n", idx+1, planned, plan.LoadSec, plan.IdleSec)
+		fmt.Fprintf(&out, "  Max CPU temp: %.1f°C\n", result.maxCPUTemp)
+		fmt.Fprintf(&out, "  Max GPU temp: %.1f°C\n", result.maxGPUTemp)
+		fmt.Fprintf(&out, "  Max sys power: %.0f W\n", result.maxPower)
+
+		if !result.throttled {
+			fmt.Fprintf(&out, "  Throttle: none\n")
+		} else {
+			fmt.Fprintf(&out, "  Throttle: DETECTED\n")
+			throttleCount++
+		}
+
+		// Fan lines are only meaningful when a fan speed was measured at cut.
+		if result.fanAtCutAvg > 0 {
+			fmt.Fprintf(&out, "  Fan at load cut: %.0f RPM avg\n", result.fanAtCutAvg)
+			fmt.Fprintf(&out, "  Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", result.fanMin15s, result.fanDropPct)
+			if result.fanDropPct <= 20 {
+				fmt.Fprintf(&out, "  Fan response: OK\n")
+			} else {
+				fmt.Fprintf(&out, "  Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
+				fanWarnCount++
+			}
+		}
+		out.WriteString("\n")
+	}
+
+	fmt.Fprintf(&out, "%s\n", divider)
+	switch {
+	case throttleCount > 0:
+		fmt.Fprintf(&out, "Overall: FAIL — throttle detected in %d/%d cycles\n", throttleCount, len(analyses))
+	case fanWarnCount > 0:
+		fmt.Fprintf(&out, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", fanWarnCount, len(analyses))
+	default:
+		fmt.Fprintf(&out, "Overall: PASS\n")
+	}
+	return out.String()
+}
+
+// writePlatformCSV encodes the telemetry rows as CSV with a header line.
+// Elapsed time, CPU load, temperatures and power are written with one
+// decimal, fan speeds with none, and the throttle flag as "0" or "1".
+func writePlatformCSV(rows []platformStressRow) []byte {
+	var out bytes.Buffer
+	cw := csv.NewWriter(&out)
+
+	header := []string{
+		"elapsed_sec", "cycle", "phase",
+		"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
+		"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
+	}
+	_ = cw.Write(header)
+
+	fixed := func(v float64, decimals int) string {
+		return strconv.FormatFloat(v, 'f', decimals, 64)
+	}
+	for i := range rows {
+		row := rows[i]
+		flag := "0"
+		if row.GPUThrottled {
+			flag = "1"
+		}
+		record := []string{
+			fixed(row.ElapsedSec, 1),
+			strconv.Itoa(row.Cycle),
+			row.Phase,
+			fixed(row.CPULoadPct, 1),
+			fixed(row.MaxCPUTempC, 1),
+			fixed(row.MaxGPUTempC, 1),
+			fixed(row.SysPowerW, 1),
+			fixed(row.FanMinRPM, 0),
+			fixed(row.FanMaxRPM, 0),
+			flag,
+		}
+		_ = cw.Write(record)
+	}
+	cw.Flush()
+	return out.Bytes()
+}
+
+// buildCPUStressCmd locates stressapptest, starts it at nice level 15 and
+// returns the running command. The process is bound to ctx, so it ends when
+// ctx is cancelled. Thread count and memory size flags are only passed when
+// the respective helper returns a positive value. An error is returned when
+// the binary is not found or cannot be started.
+func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
+	bin, err := satLookPath("stressapptest")
+	if err != nil {
+		return nil, fmt.Errorf("stressapptest not found: %w", err)
+	}
+
+	// Use a very long duration; the context timeout will kill it at the right time.
+	argv := []string{"-s", "86400", "-W", "--cc_test"}
+	if n := platformStressCPUThreads(); n > 0 {
+		argv = append(argv, "-m", strconv.Itoa(n))
+	}
+	if n := platformStressMemoryMB(); n > 0 {
+		argv = append(argv, "-M", strconv.Itoa(n))
+	}
+
+	stress := exec.CommandContext(ctx, bin, argv...)
+	stress.Stdout, stress.Stderr = nil, nil
+	if startErr := startLowPriorityCmd(stress, 15); startErr != nil {
+		return nil, fmt.Errorf("stressapptest start: %w", startErr)
+	}
+	return stress, nil
+}
+
+// buildGPUStressCmd dispatches to the vendor-specific GPU stress builder.
+// The vendor is matched case-insensitively against "amd" and "nvidia"; any
+// other vendor yields nil, which callers treat as "no GPU load".
+func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
+	normalized := strings.ToLower(vendor)
+	if normalized == "amd" {
+		return buildAMDGPUStressCmd(ctx)
+	}
+	if normalized == "nvidia" {
+		return buildNvidiaGPUStressCmd(ctx)
+	}
+	return nil
+}
+
+// buildAMDGPUStressCmd writes an rvs "gst" configuration to a fixed file
+// under /tmp and starts rvs with it at nice level 10, bound to ctx.
+//
+// It returns the running command, or nil when GPU load cannot be generated:
+// rvs could not be resolved, the configuration file could not be written, or
+// the process failed to start. Callers treat nil as "no GPU stress
+// available", so none of these cases aborts the cycle.
+func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
+	rvsArgs, err := resolveRVSCommand()
+	if err != nil || len(rvsArgs) == 0 {
+		return nil
+	}
+	rvsPath := rvsArgs[0]
+	// The duration is deliberately huge; ctx ends the run at the right time.
+	cfg := `actions:
+- name: gst_platform
+  device: all
+  module: gst
+  parallel: true
+  duration: 86400000
+  copy_matrix: false
+  target_stress: 90
+  matrix_size_a: 8640
+  matrix_size_b: 8640
+  matrix_size_c: 8640
+`
+	cfgFile := "/tmp/bee-platform-gst.conf"
+	// Without a freshly written config rvs would run with a missing or stale
+	// file, so a write failure means "no GPU load" rather than a blind start.
+	if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil {
+		return nil
+	}
+	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
+	cmd.Stdout = nil
+	cmd.Stderr = nil
+	// Never hand back a command that was not started.
+	if err := startLowPriorityCmd(cmd, 10); err != nil {
+		return nil
+	}
+	return cmd
+}
+
+// buildNvidiaGPUStressCmd starts the NVIDIA GPU burner at nice level 10,
+// bound to ctx. It prefers bee-gpu-burn and falls back to bee-gpu-stress;
+// nil is returned when neither binary is found. A start failure is ignored
+// and the command is returned regardless.
+func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
+	bin, lookErr := satLookPath("bee-gpu-burn")
+	if lookErr != nil {
+		if bin, lookErr = satLookPath("bee-gpu-stress"); lookErr != nil {
+			return nil
+		}
+	}
+
+	burner := exec.CommandContext(ctx, bin, "--seconds", "86400")
+	burner.Stdout, burner.Stderr = nil, nil
+	_ = startLowPriorityCmd(burner, 10)
+	return burner
+}
+
+// startLowPriorityCmd starts cmd and then applies the given nice value to
+// the new process. Only a start failure is reported; a failure to change
+// the priority is ignored.
+func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
+	err := cmd.Start()
+	if err == nil && cmd.Process != nil {
+		_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
+	}
+	return err
+}
+
+// platformStressCPUThreads returns the stress thread count. A positive
+// BEE_PLATFORM_STRESS_THREADS wins; otherwise the count is derived from
+// runtime.NumCPU: 1 for up to two CPUs, one less than the CPU count for up
+// to eight, and two less beyond that.
+func platformStressCPUThreads() int {
+	if override := envInt("BEE_PLATFORM_STRESS_THREADS", 0); override > 0 {
+		return override
+	}
+	n := runtime.NumCPU()
+	if n <= 2 {
+		return 1
+	}
+	if n <= 8 {
+		return n - 1
+	}
+	return n - 2
+}
+
+// platformStressMemoryMB returns the stress memory size in MiB. A positive
+// BEE_PLATFORM_STRESS_MB wins; otherwise 60% of the value reported by
+// freeMemBytes is used, but never less than 1024. Zero is returned when
+// freeMemBytes reports nothing.
+func platformStressMemoryMB() int {
+	if override := envInt("BEE_PLATFORM_STRESS_MB", 0); override > 0 {
+		return override
+	}
+	available := freeMemBytes()
+	if available <= 0 {
+		return 0
+	}
+	const floorMB = 1024
+	budget := int((available * 60) / 100 / (1024 * 1024))
+	if budget >= floorMB {
+		return budget
+	}
+	return floorMB
+}
+
+// containsComponent reports whether name occurs in components. The
+// comparison is exact, including case.
+func containsComponent(components []string, name string) bool {
+	for i := 0; i < len(components); i++ {
+		if components[i] == name {
+			return true
+		}
+	}
+	return false
+}
+
+// packPlatformDir writes the regular files directly inside dir into a
+// gzip-compressed tar archive at dest. Entries are stored as
+// "<base of dir>/<file name>" with mode 0644; subdirectories are skipped, and
+// files that cannot be read are left out (best effort).
+//
+// The tar, gzip and file writers are closed explicitly and their errors are
+// reported, because that is where buffered data is flushed: ignoring them
+// could report success for a truncated archive. When anything fails, the
+// partially written dest is removed.
+func packPlatformDir(dir, dest string) (err error) {
+	// List the directory first so a bad source never creates an archive.
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return err
+	}
+
+	f, err := os.Create(dest)
+	if err != nil {
+		return err
+	}
+	gz := gzip.NewWriter(f)
+	tw := tar.NewWriter(gz)
+	defer func() {
+		// Close innermost first so each layer flushes into the next one.
+		for _, closeFn := range []func() error{tw.Close, gz.Close, f.Close} {
+			if cerr := closeFn(); cerr != nil && err == nil {
+				err = cerr
+			}
+		}
+		if err != nil {
+			_ = os.Remove(dest)
+		}
+	}()
+
+	base := filepath.Base(dir)
+	for _, e := range entries {
+		if e.IsDir() {
+			continue
+		}
+		data, readErr := os.ReadFile(filepath.Join(dir, e.Name()))
+		if readErr != nil {
+			// Best effort: an unreadable file is omitted, not fatal.
+			continue
+		}
+		hdr := &tar.Header{
+			// Tar entry names always use forward slashes.
+			Name:    filepath.ToSlash(filepath.Join(base, e.Name())),
+			Size:    int64(len(data)),
+			Mode:    0644,
+			ModTime: time.Now(),
+		}
+		if err := tw.WriteHeader(hdr); err != nil {
+			return err
+		}
+		if _, err := tw.Write(data); err != nil {
+			return err
+		}
+	}
+	return nil
+}
--- a/audit/internal/platform/platform_stress_test.go
+++ b/audit/internal/platform/platform_stress_test.go
@@ -0,0 +1,34 @@
+package platform
+
+import (
+	"runtime"
+	"testing"
+)
+
+// TestPlatformStressCPUThreadsOverride checks that a positive
+// BEE_PLATFORM_STRESS_THREADS value is returned verbatim.
+func TestPlatformStressCPUThreadsOverride(t *testing.T) {
+	t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
+	got := platformStressCPUThreads()
+	if got != 7 {
+		t.Fatalf("platformStressCPUThreads=%d want 7", got)
+	}
+}
+
+// TestPlatformStressCPUThreadsDefaultLeavesHeadroom checks the derived
+// default: at least one thread, never more than NumCPU, and strictly fewer
+// than NumCPU on machines with more than two CPUs.
+func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
+	t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
+	threads := platformStressCPUThreads()
+	switch {
+	case threads < 1:
+		t.Fatalf("platformStressCPUThreads=%d want >= 1", threads)
+	case threads > runtime.NumCPU():
+		t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", threads, runtime.NumCPU())
+	case runtime.NumCPU() > 2 && threads >= runtime.NumCPU():
+		t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", threads, runtime.NumCPU())
+	}
+}
+
+// TestPlatformStressMemoryMBOverride checks that a positive
+// BEE_PLATFORM_STRESS_MB value is returned verbatim.
+func TestPlatformStressMemoryMBOverride(t *testing.T) {
+	t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
+	got := platformStressMemoryMB()
+	if got != 8192 {
+		t.Fatalf("platformStressMemoryMB=%d want 8192", got)
+	}
+}
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -136,7 +136,10 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
 		tools = append(tools, s.CheckTools([]string{
 			"nvidia-smi",
 			"nvidia-bug-report.sh",
-			"bee-gpu-stress",
+			"bee-gpu-burn",
+			"bee-john-gpu-stress",
+			"bee-nccl-gpu-stress",
+			"all_reduce_perf",
 		})...)
 	case "amd":
 		tool := ToolStatus{Name: "rocm-smi"}
@@ -176,8 +179,8 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
 			health.DriverReady = true
 		}

-		if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
-			out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
+		if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
+			out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
 			if err == nil {
 				health.CUDAReady = true
 			} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -2,6 +2,8 @@ package platform

 import (
 	"archive/tar"
+	"bufio"
+	"bytes"
 	"compress/gzip"
 	"context"
 	"errors"
@@ -10,9 +12,11 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
+	"syscall"
 	"time"
 )

@@ -30,8 +34,46 @@ var (
 		"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
 		"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
 	}
+	rvsExecutableGlobs = []string{
+		"/opt/rocm/bin/rvs",
+		"/opt/rocm-*/bin/rvs",
+	}
 )

+// streamExecOutput starts cmd with stdout and stderr merged into a single pipe,
+// forwards every output line to logFunc (if non-nil) as it arrives, and waits
+// for the command to exit.
+//
+// It returns the captured lines, each terminated by "\n", together with the
+// error from cmd.Wait. If the command cannot be started it returns nil and the
+// Start error. Output from a line longer than bufio.MaxScanTokenSize onward is
+// read and discarded rather than captured.
+func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
+	pr, pw := io.Pipe()
+	cmd.Stdout = pw
+	cmd.Stderr = pw
+
+	var buf bytes.Buffer
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		scanner := bufio.NewScanner(pr)
+		for scanner.Scan() {
+			line := scanner.Text()
+			buf.WriteString(line + "\n")
+			if logFunc != nil {
+				logFunc(line)
+			}
+		}
+		// Scan gives up on a line longer than bufio.MaxScanTokenSize. io.Pipe is
+		// unbuffered, so the reader must keep draining until pw is closed;
+		// otherwise the command's output copy blocks and cmd.Wait never returns.
+		_, _ = io.Copy(io.Discard, pr)
+	}()
+
+	if err := cmd.Start(); err != nil {
+		_ = pw.Close()
+		wg.Wait()
+		return nil, err
+	}
+	waitErr := cmd.Wait()
+	// Closing the writer delivers EOF to the reader goroutine so it can finish.
+	_ = pw.Close()
+	wg.Wait()
+	return buf.Bytes(), waitErr
+}
+
 // NvidiaGPU holds basic GPU info from nvidia-smi.
 type NvidiaGPU struct {
 	Index    int
@@ -53,6 +95,12 @@ func (s *System) DetectGPUVendor() string {
 	if _, err := os.Stat("/dev/kfd"); err == nil {
 		return "amd"
 	}
+	if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
+		text := strings.ToLower(string(raw))
+		if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
+			return "amd"
+		}
+	}
 	return ""
 }

@@ -80,13 +128,103 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
 }

 // RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
-func (s *System) RunAMDAcceptancePack(baseDir string) (string, error) {
-	return runAcceptancePack(baseDir, "gpu-amd", []satJob{
+func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{
 		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
 		{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
 		{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
 		{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
-	})
+	}, logFunc)
+}
+
+// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
+//
+// It requires ensureAMDRuntimeReady to succeed, writes the RVS action config to
+// /tmp/bee-amd-mem.conf and then runs the "gpu-amd-mem" pack: rocm-smi, rvs with
+// that config, and a closing rocm-smi temperature/power/memory-use snapshot.
+// ctx, baseDir and logFunc are passed through to runAcceptancePackCtx, whose
+// result is returned unchanged. A failure to write the config is returned as an
+// error before any job runs.
+func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if err := ensureAMDRuntimeReady(); err != nil {
+		return "", err
+	}
+	cfgFile := "/tmp/bee-amd-mem.conf"
+	cfg := `actions:
+- name: mem_integrity
+  device: all
+  module: mem
+  parallel: true
+  duration: 60000
+  copy_matrix: false
+  target_stress: 90
+  matrix_size: 8640
+`
+	// The rvs job reads this file; report a failed write instead of letting rvs
+	// run against a missing or stale config.
+	if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil {
+		return "", fmt.Errorf("write rvs config %s: %w", cfgFile, err)
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
+		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
+		{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
+		{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
+	}, logFunc)
+}
+
+// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
+//
+// It requires ensureAMDRuntimeReady to succeed, writes the RVS babel action
+// config to /tmp/bee-amd-babel.conf and then runs the "gpu-amd-bandwidth" pack:
+// rocm-smi, rocm-bandwidth-test, rvs with that config, and a closing rocm-smi
+// temperature/power/memory-use snapshot. ctx, baseDir and logFunc are passed
+// through to runAcceptancePackCtx, whose result is returned unchanged. A
+// failure to write the config is returned as an error before any job runs.
+func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if err := ensureAMDRuntimeReady(); err != nil {
+		return "", err
+	}
+	cfgFile := "/tmp/bee-amd-babel.conf"
+	cfg := `actions:
+- name: babel_mem_bw
+  device: all
+  module: babel
+  parallel: true
+  copy_matrix: true
+  target_stress: 90
+  matrix_size: 134217728
+`
+	// The rvs job reads this file; report a failed write instead of letting rvs
+	// run against a missing or stale config.
+	if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil {
+		return "", fmt.Errorf("write rvs config %s: %w", cfgFile, err)
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
+		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
+		{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
+		{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
+		{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
+	}, logFunc)
+}
+
+// RunAMDStressPack runs an AMD GPU burn-in pack.
+// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
+//
+// A non-positive durationSec is replaced by envInt("BEE_AMD_STRESS_SECONDS", 300).
+// The function requires ensureAMDRuntimeReady to succeed, writes the generated
+// GST config to /tmp/bee-amd-gst.conf and runs the "gpu-amd-stress" pack built by
+// amdStressJobs. A failure to write the config is returned as an error before
+// any job runs.
+func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	seconds := durationSec
+	if seconds <= 0 {
+		seconds = envInt("BEE_AMD_STRESS_SECONDS", 300)
+	}
+	if err := ensureAMDRuntimeReady(); err != nil {
+		return "", err
+	}
+	// The config is regenerated on every run because it embeds the requested
+	// duration; note that amdStressRVSConfig emits copy_matrix: false.
+	rvsCfg := amdStressRVSConfig(seconds)
+	cfgFile := "/tmp/bee-amd-gst.conf"
+	if err := os.WriteFile(cfgFile, []byte(rvsCfg), 0644); err != nil {
+		return "", fmt.Errorf("write rvs config %s: %w", cfgFile, err)
+	}
+
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
+}
+
+// amdStressRVSConfig renders the RVS action file for the GST stress run.
+// seconds*1000 is written into the "duration" field; every other field of the
+// template is fixed.
+func amdStressRVSConfig(seconds int) string {
+	const tmpl = `actions:
+- name: gst_stress
+  device: all
+  module: gst
+  parallel: true
+  duration: %d
+  copy_matrix: false
+  target_stress: 90
+  matrix_size_a: 8640
+  matrix_size_b: 8640
+  matrix_size_c: 8640
+`
+	duration := seconds * 1000
+	return fmt.Sprintf(tmpl, duration)
+}
+
+// amdStressJobs lists the jobs of the AMD stress pack: a rocm-smi snapshot,
+// rocm-bandwidth-test, the rvs run driven by cfgFile, and a closing rocm-smi
+// temperature/power snapshot. seconds only appears in the rvs job's log name.
+func amdStressJobs(seconds int, cfgFile string) []satJob {
+	return []satJob{
+		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
+		{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
+		{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
+		{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
+	}
 }

 // ListNvidiaGPUs returns GPUs visible to nvidia-smi.
@@ -123,7 +261,7 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {

 // RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
-func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
 	// detect GPU count
 	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
 	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
@@ -136,44 +274,101 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, erro
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
 		}},
-	})
+	}, logFunc)
 }

-func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
-	return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
+func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
 }

 // RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
 // diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
 // gpuIndices: specific GPU indices to test (empty = all GPUs).
 // ctx cancellation kills the running job.
-func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) {
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices))
+func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
+	resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
 }

-func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
+// resolveDCGMGPUIndices picks the GPU indices for a DCGM run. A non-empty
+// selection is passed through dedupeSortedIndices; an empty selection falls back
+// to whatever listNvidiaGPUIndices reports, and it is an error if that lookup
+// fails or returns no indices.
+func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
+	if len(gpuIndices) == 0 {
+		detected, err := listNvidiaGPUIndices()
+		switch {
+		case err != nil:
+			return nil, err
+		case len(detected) == 0:
+			return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
+		}
+		return detected, nil
+	}
+	return dedupeSortedIndices(gpuIndices), nil
+}
+
+func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
 	sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
 	passes := envInt("BEE_MEMTESTER_PASSES", 1)
-	return runAcceptancePack(baseDir, "memory", []satJob{
+	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
 		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
-	})
+	}, logFunc)
 }

-func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
+// RunMemoryStressPack runs the "memory-stress" pack: free -h, one stress-ng --vm
+// run, and free -h again. A non-positive durationSec is replaced by
+// envInt("BEE_VM_STRESS_SECONDS", 300). ctx, baseDir and logFunc are passed
+// through to runAcceptancePackCtx, whose result is returned unchanged.
+func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	if durationSec <= 0 {
+		durationSec = envInt("BEE_VM_STRESS_SECONDS", 300)
+	}
+	// Use 80% of RAM unless BEE_VM_STRESS_SIZE_MB holds a positive size.
+	vmBytes := "80%"
+	if sizeMB := envInt("BEE_VM_STRESS_SIZE_MB", 0); sizeMB > 0 {
+		vmBytes = strconv.Itoa(sizeMB) + "M"
+	}
+	stressCmd := []string{
+		"stress-ng", "--vm", "1",
+		"--vm-bytes", vmBytes,
+		"--vm-method", "all",
+		"--timeout", strconv.Itoa(durationSec),
+		"--metrics-brief",
+	}
+	jobs := []satJob{
+		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
+		{name: "02-stress-ng-vm.log", cmd: stressCmd},
+		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "memory-stress", jobs, logFunc)
+}
+
+// RunSATStressPack runs the "sat-stress" pack: free -h, one stressapptest run,
+// and free -h again. A non-positive durationSec is replaced by
+// envInt("BEE_SAT_STRESS_SECONDS", 300); "-M <mb>" is appended only when
+// BEE_SAT_STRESS_MB holds a positive value. ctx, baseDir and logFunc are passed
+// through to runAcceptancePackCtx, whose result is returned unchanged.
+func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	if durationSec <= 0 {
+		durationSec = envInt("BEE_SAT_STRESS_SECONDS", 300)
+	}
+	satCmd := []string{"stressapptest", "-s", strconv.Itoa(durationSec), "-W", "--cc_test"}
+	if memMB := envInt("BEE_SAT_STRESS_MB", 0); memMB > 0 {
+		satCmd = append(satCmd, "-M", strconv.Itoa(memMB))
+	}
+	jobs := []satJob{
+		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
+		{name: "02-stressapptest.log", cmd: satCmd},
+		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "sat-stress", jobs, logFunc)
+}
+
+func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
 	if durationSec <= 0 {
 		durationSec = 60
 	}
-	return runAcceptancePack(baseDir, "cpu", []satJob{
+	return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
 		{name: "01-lscpu.log", cmd: []string{"lscpu"}},
 		{name: "02-sensors-before.log", cmd: []string{"sensors"}},
 		{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
 		{name: "04-sensors-after.log", cmd: []string{"sensors"}},
-	})
+	}, logFunc)
 }

-func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
+func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
 	if baseDir == "" {
 		baseDir = "/var/log/bee-sat"
 	}
@@ -201,11 +396,17 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
 	}

 	for index, devPath := range devices {
+		if ctx.Err() != nil {
+			break
+		}
 		prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
 		commands := storageSATCommands(devPath)
 		for cmdIndex, job := range commands {
+			if ctx.Err() != nil {
+				break
+			}
 			name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
-			out, err := runSATCommand(verboseLog, job.name, job.cmd)
+			out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
 			if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
 				return "", writeErr
 			}
@@ -243,58 +444,15 @@ type satStats struct {
 }

 func nvidiaSATJobs() []satJob {
-	seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
-	sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
 	return []satJob{
 		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
 		{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
 		{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
-		{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
+		{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
 	}
 }

-func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
-	if baseDir == "" {
-		baseDir = "/var/log/bee-sat"
-	}
-	ts := time.Now().UTC().Format("20060102-150405")
-	runDir := filepath.Join(baseDir, prefix+"-"+ts)
-	if err := os.MkdirAll(runDir, 0755); err != nil {
-		return "", err
-	}
-	verboseLog := filepath.Join(runDir, "verbose.log")
-
-	var summary strings.Builder
-	stats := satStats{}
-	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
-	for _, job := range jobs {
-		cmd := make([]string, 0, len(job.cmd))
-		for _, arg := range job.cmd {
-			cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
-		}
-		out, err := runSATCommand(verboseLog, job.name, cmd)
-		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
-			return "", writeErr
-		}
-		status, rc := classifySATResult(job.name, out, err)
-		stats.Add(status)
-		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
-		fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
-		fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
-	}
-	writeSATStats(&summary, stats)
-	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
-		return "", err
-	}
-
-	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", err
-	}
-	return archive, nil
-}
-
 func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
 	if diagLevel < 1 || diagLevel > 4 {
 		diagLevel = 3
@@ -315,7 +473,10 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
 	}
 }

-func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) {
+func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
 	if baseDir == "" {
 		baseDir = "/var/log/bee-sat"
 	}
@@ -342,9 +503,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		var err error

 		if job.collectGPU {
-			out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir)
+			out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
 		} else {
-			out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env)
+			out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
 		}

 		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
@@ -368,13 +529,16 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 	return archive, nil
 }

-func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
+func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
 	start := time.Now().UTC()
 	resolvedCmd, err := resolveSATCommand(cmd)
 	appendSATVerboseLog(verboseLog,
 		fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
 		"cmd: "+strings.Join(resolvedCmd, " "),
 	)
+	if logFunc != nil {
+		logFunc(fmt.Sprintf("=== %s ===", name))
+	}
 	if err != nil {
 		appendSATVerboseLog(verboseLog,
 			fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
@@ -386,10 +550,17 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
 	}

 	c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
+	c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	c.Cancel = func() error {
+		if c.Process != nil {
+			_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
 	if len(env) > 0 {
 		c.Env = append(os.Environ(), env...)
 	}
-	out, err := c.CombinedOutput()
+	out, err := streamExecOutput(c, logFunc)

 	rc := 0
 	if err != nil {
@@ -464,6 +635,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
 	}

 	text := strings.ToLower(string(out))
+	// No output at all means the tool failed to start (mlock limit, binary missing,
+	// etc.) — we cannot say anything about hardware health → UNSUPPORTED.
+	if len(strings.TrimSpace(text)) == 0 {
+		return "UNSUPPORTED", rc
+	}
 	if strings.Contains(text, "unsupported") ||
 		strings.Contains(text, "not supported") ||
 		strings.Contains(text, "invalid opcode") ||
@@ -472,19 +648,25 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
 		strings.Contains(text, "not available") ||
 		strings.Contains(text, "cuda_error_system_not_ready") ||
 		strings.Contains(text, "no such device") ||
+		// nvidia-smi on a machine with no NVIDIA GPU
+		strings.Contains(text, "couldn't communicate with the nvidia driver") ||
+		strings.Contains(text, "no nvidia gpu") ||
 		(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
 		return "UNSUPPORTED", rc
 	}
 	return "FAILED", rc
 }

-func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
+func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) {
 	start := time.Now().UTC()
 	resolvedCmd, err := resolveSATCommand(cmd)
 	appendSATVerboseLog(verboseLog,
 		fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
 		"cmd: "+strings.Join(resolvedCmd, " "),
 	)
+	if logFunc != nil {
+		logFunc(fmt.Sprintf("=== %s ===", name))
+	}
 	if err != nil {
 		appendSATVerboseLog(verboseLog,
 			fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
@@ -495,7 +677,7 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
 		return []byte(err.Error() + "\n"), err
 	}

-	out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
+	out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc)

 	rc := 0
 	if err != nil {
@@ -522,10 +704,27 @@ func resolveSATCommand(cmd []string) ([]string, error) {
 	if len(cmd) == 0 {
 		return nil, errors.New("empty SAT command")
 	}
-	if cmd[0] != "rocm-smi" {
-		return cmd, nil
+	switch cmd[0] {
+	case "rocm-smi":
+		return resolveROCmSMICommand(cmd[1:]...)
+	case "rvs":
+		return resolveRVSCommand(cmd[1:]...)
 	}
-	return resolveROCmSMICommand(cmd[1:]...)
+	path, err := satLookPath(cmd[0])
+	if err != nil {
+		return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
+	}
+	return append([]string{path}, cmd[1:]...), nil
+}
+
+// resolveRVSCommand returns the rvs invocation with the executable replaced by
+// a resolved path: the satLookPath("rvs") result when that succeeds, otherwise
+// the first entry returned by expandExistingPaths(rvsExecutableGlobs). It
+// returns an error when neither yields a path.
+func resolveRVSCommand(args ...string) ([]string, error) {
+	if path, err := satLookPath("rvs"); err == nil {
+		return append([]string{path}, args...), nil
+	}
+	// Only the first candidate is ever used, so take it directly instead of
+	// ranging over the slice and returning on the first iteration.
+	if candidates := expandExistingPaths(rvsExecutableGlobs); len(candidates) > 0 {
+		return append([]string{candidates[0]}, args...), nil
+	}
+	return nil, errors.New("rvs not found in PATH or under /opt/rocm")
 }

 func resolveROCmSMICommand(args ...string) ([]string, error) {
@@ -549,6 +748,20 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
 	return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
 }

+// ensureAMDRuntimeReady checks whether the AMD GPU runtime looks usable.
+// It returns nil when /dev/kfd exists, or when /sys/module/amdgpu/initstate can
+// be read and equals "live" (case-insensitive). Otherwise it returns an error
+// saying whether the driver state was readable but not live, or not readable.
+func ensureAMDRuntimeReady() error {
+	_, statErr := os.Stat("/dev/kfd")
+	if statErr == nil {
+		return nil
+	}
+	raw, readErr := os.ReadFile("/sys/module/amdgpu/initstate")
+	if readErr != nil {
+		return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded")
+	}
+	state := strings.TrimSpace(string(raw))
+	if !strings.EqualFold(state, "live") {
+		return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", state)
+	}
+	return nil
+}
+
 func rocmSMIExecutableCandidates() []string {
 	return expandExistingPaths(rocmSMIExecutableGlobs)
 }
@@ -597,7 +810,7 @@ func parseStorageDevices(raw string) []string {

 // runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
 // On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
-func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
+func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) {
 	stopCh := make(chan struct{})
 	doneCh := make(chan struct{})
 	var metricRows []GPUMetricRow
@@ -625,7 +838,7 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
 		}
 	}()

-	out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env)
+	out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)

 	close(stopCh)
 	<-doneCh
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -2,10 +2,12 @@ package platform

 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -49,6 +51,18 @@ type FanStressRow struct {
 	SysPowerW    float64 // DCMI system power reading
 }

+// cachedPowerReading is a system power value together with the time it was recorded.
+type cachedPowerReading struct {
+	Value     float64   // power reading as returned by parseDCMIPowerReading (presumably watts — confirm)
+	UpdatedAt time.Time // when Value was stored
+}
+
+// systemPowerCache keeps the cache state that sampleSystemPower carries between
+// calls; sampleSystemPower reads and writes it while holding systemPowerCacheMu.
+var (
+	systemPowerCacheMu sync.Mutex
+	systemPowerCache   cachedPowerReading
+)
+
+// systemPowerHoldTTL is the maximum age at which effectiveSystemPowerReading
+// still returns a cached value in place of a non-positive current reading.
+const systemPowerHoldTTL = 15 * time.Second
+
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -128,26 +142,21 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
 		stats.OK++
 	}

-	// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
+	// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
 	loadPhase := func(phaseName, stepName string, durSec int) {
 		if ctx.Err() != nil {
 			return
 		}
 		setPhase(phaseName)
-		var env []string
-		if len(opts.GPUIndices) > 0 {
-			ids := make([]string, len(opts.GPUIndices))
-			for i, idx := range opts.GPUIndices {
-				ids[i] = strconv.Itoa(idx)
-			}
-			env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
-		}
 		cmd := []string{
-			"bee-gpu-stress",
+			"bee-gpu-burn",
 			"--seconds", strconv.Itoa(durSec),
 			"--size-mb", strconv.Itoa(opts.SizeMB),
 		}
-		out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
+		if len(opts.GPUIndices) > 0 {
+			cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
+		}
+		out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
 		_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
 		if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
 			fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
@@ -304,41 +313,148 @@ func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
 // sampleFanSpeeds reads fan RPM values from ipmitool sdr.
 func sampleFanSpeeds() ([]FanReading, error) {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
+	if err == nil {
+		if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
+			return fans, nil
+		}
+	}
+	fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
+	if len(fans) > 0 {
+		return fans, nil
+	}
 	if err != nil {
 		return nil, err
 	}
-	return parseFanSpeeds(string(out)), nil
+	return nil, sensorsErr
 }

 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
-// Line format: "FAN1             | 2400.000   | RPM        | ok"
+// Handles two formats:
+//
+//	Old: "FAN1 | 2400.000 | RPM | ok"           (value in col[1], unit in col[2])
+//	New: "FAN1 | 41h | ok | 29.1 | 4340 RPM"   (value+unit combined in last col)
 func parseFanSpeeds(raw string) []FanReading {
 	var fans []FanReading
 	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
 		parts := strings.Split(line, "|")
-		if len(parts) < 3 {
+		if len(parts) < 2 {
 			continue
 		}
-		unit := strings.TrimSpace(parts[2])
-		if !strings.EqualFold(unit, "RPM") {
+		name := strings.TrimSpace(parts[0])
+		// Find the first field that contains "RPM" (either as a standalone unit or inline)
+		rpmVal := 0.0
+		found := false
+		for _, p := range parts[1:] {
+			p = strings.TrimSpace(p)
+			if !strings.Contains(strings.ToUpper(p), "RPM") {
+				continue
+			}
+			if strings.EqualFold(p, "RPM") {
+				continue // unit-only column in old format; value is in previous field
+			}
+			val, err := parseFanRPMValue(p)
+			if err == nil {
+				rpmVal = val
+				found = true
+				break
+			}
+		}
+		// Old format: unit "RPM" is in col[2], value is in col[1]
+		if !found && len(parts) >= 3 && strings.EqualFold(strings.TrimSpace(parts[2]), "RPM") {
+			valStr := strings.TrimSpace(parts[1])
+			if !strings.EqualFold(valStr, "na") && !strings.EqualFold(valStr, "disabled") && valStr != "" {
+				if val, err := parseFanRPMValue(valStr); err == nil {
+					rpmVal = val
+					found = true
+				}
+			}
+		}
+		if !found {
 			continue
 		}
-		valStr := strings.TrimSpace(parts[1])
-		if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
-			continue
-		}
-		val, err := strconv.ParseFloat(valStr, 64)
-		if err != nil {
-			continue
-		}
-		fans = append(fans, FanReading{
-			Name: strings.TrimSpace(parts[0]),
-			RPM:  val,
-		})
+		fans = append(fans, FanReading{Name: name, RPM: rpmVal})
 	}
 	return fans
 }

+// parseFanRPMValue parses the first whitespace-separated token of raw as a
+// float64, after removing every comma (so "4,340 RPM" yields 4340). An input
+// with no tokens yields strconv.ErrSyntax; otherwise the error is whatever
+// strconv.ParseFloat reports for the first token.
+func parseFanRPMValue(raw string) (float64, error) {
+	withoutCommas := strings.ReplaceAll(raw, ",", "")
+	tokens := strings.Fields(withoutCommas)
+	if len(tokens) > 0 {
+		return strconv.ParseFloat(tokens[0], 64)
+	}
+	return 0, strconv.ErrSyntax
+}
+
+// sampleFanSpeedsViaSensorsJSON reads fan speeds from the JSON output of
+// "sensors -j". Chips and their features are visited in sorted name order; a
+// feature contributes a reading when firstFanInputValue finds a positive value
+// in it. The reading is labelled with the feature name, prefixed by
+// "<chip> / " unless the name already contains the chip name
+// (case-insensitive), and a label is reported only once.
+//
+// It returns nil and the command error when the command fails (nil, nil when
+// it succeeds with empty output), and nil plus the decode error when the
+// output is not the expected JSON shape.
+func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
+	raw, err := exec.Command("sensors", "-j").Output()
+	if err != nil || len(raw) == 0 {
+		return nil, err
+	}
+	var chips map[string]map[string]any
+	if err := json.Unmarshal(raw, &chips); err != nil {
+		return nil, err
+	}
+
+	chipNames := make([]string, 0, len(chips))
+	for chipName := range chips {
+		chipNames = append(chipNames, chipName)
+	}
+	sort.Strings(chipNames)
+
+	var readings []FanReading
+	taken := map[string]struct{}{}
+	for _, chipName := range chipNames {
+		chipFeatures := chips[chipName]
+		featureNames := make([]string, 0, len(chipFeatures))
+		for featureName := range chipFeatures {
+			featureNames = append(featureNames, featureName)
+		}
+		sort.Strings(featureNames)
+
+		lowerChip := strings.ToLower(chipName)
+		for _, featureName := range featureNames {
+			// Non-object entries carry no sub-readings to inspect.
+			fields, isObject := chipFeatures[featureName].(map[string]any)
+			if !isObject {
+				continue
+			}
+			rpm, found := firstFanInputValue(fields)
+			if !found || rpm <= 0 {
+				continue
+			}
+			label := strings.TrimSpace(featureName)
+			if chipName != "" && !strings.Contains(strings.ToLower(label), lowerChip) {
+				label = chipName + " / " + label
+			}
+			if _, duplicate := taken[label]; duplicate {
+				continue
+			}
+			taken[label] = struct{}{}
+			readings = append(readings, FanReading{Name: label, RPM: rpm})
+		}
+	}
+	return readings, nil
+}
+
+// firstFanInputValue looks through feature for keys whose lower-cased form
+// contains "fan" and ends in "_input", in sorted key order, and returns the
+// first usable value: a float64 as is, or a string that parses as a float.
+// Values of any other type, and strings that do not parse, are skipped. The
+// boolean is false when no key yields a value.
+func firstFanInputValue(feature map[string]any) (float64, bool) {
+	var candidates []string
+	for key := range feature {
+		name := strings.ToLower(key)
+		if strings.Contains(name, "fan") && strings.HasSuffix(name, "_input") {
+			candidates = append(candidates, key)
+		}
+	}
+	// Map iteration order is random; sorting keeps the choice deterministic.
+	sort.Strings(candidates)
+	for _, key := range candidates {
+		switch v := feature[key].(type) {
+		case float64:
+			return v, true
+		case string:
+			if parsed, err := strconv.ParseFloat(v, 64); err == nil {
+				return parsed, true
+			}
+		}
+	}
+	return 0, false
+}
+
 // sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
 func sampleCPUMaxTemp() float64 {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
@@ -404,11 +520,17 @@ func sampleCPUTempViaSensors() float64 {

 // sampleSystemPower reads system power draw via DCMI.
 func sampleSystemPower() float64 {
+	now := time.Now()
+	current := 0.0
 	out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
-	if err != nil {
-		return 0
+	if err == nil {
+		current = parseDCMIPowerReading(string(out))
 	}
-	return parseDCMIPowerReading(string(out))
+	systemPowerCacheMu.Lock()
+	defer systemPowerCacheMu.Unlock()
+	value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
+	systemPowerCache = updated
+	return value
 }

 // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -431,6 +553,17 @@ func parseDCMIPowerReading(raw string) float64 {
 	return 0
 }

+// effectiveSystemPowerReading decides which power value to report and what the
+// cache should hold afterwards. A positive current reading is returned and
+// stored with timestamp now. Otherwise the cached value is returned when it is
+// positive, has a non-zero timestamp and is at most systemPowerHoldTTL old; in
+// every other case the result is 0. The cache is returned unchanged whenever
+// current is not positive.
+func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
+	if current > 0 {
+		return current, cachedPowerReading{Value: current, UpdatedAt: now}
+	}
+	usable := cache.Value > 0 &&
+		!cache.UpdatedAt.IsZero() &&
+		now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL
+	if !usable {
+		return 0, cache
+	}
+	return cache.Value, cache
+}
+
 // analyzeThrottling returns true if any GPU reported an active throttle reason
 // during either load phase.
 func analyzeThrottling(rows []FanStressRow) bool {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -0,0 +1,67 @@
+package platform
+
+import (
+	"testing"
+	"time"
+)
+
+// TestParseFanSpeeds feeds parseFanSpeeds one line with the value and unit in
+// separate columns, one with "1800 RPM" in a single column, and one "na" line,
+// and expects exactly the first two fans back.
+func TestParseFanSpeeds(t *testing.T) {
+	const raw = "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
+	fans := parseFanSpeeds(raw)
+	if count := len(fans); count != 2 {
+		t.Fatalf("fans=%d want 2 (%v)", count, fans)
+	}
+	first, second := fans[0], fans[1]
+	if !(first.Name == "FAN1" && first.RPM == 2400) {
+		t.Fatalf("fan0=%+v", first)
+	}
+	if !(second.Name == "FAN2" && second.RPM == 1800) {
+		t.Fatalf("fan1=%+v", second)
+	}
+}
+
+// TestFirstFanInputValue checks that a single float "fan1_input" entry is
+// returned as the fan value.
+func TestFirstFanInputValue(t *testing.T) {
+	got, ok := firstFanInputValue(map[string]any{"fan1_input": 9200.0})
+	if ok && got == 9200 {
+		return
+	}
+	t.Fatalf("got=%v ok=%v", got, ok)
+}
+
+// TestParseDCMIPowerReading checks that the "Instantaneous power reading" line
+// is the one parseDCMIPowerReading reports, not the minimum that follows it.
+func TestParseDCMIPowerReading(t *testing.T) {
+	const sample = `
+Instantaneous power reading:                   512 Watts
+Minimum during sampling period:               498 Watts
+`
+	watts := parseDCMIPowerReading(sample)
+	if watts == 512 {
+		return
+	}
+	t.Fatalf("parseDCMIPowerReading()=%v want 512", watts)
+}
+
+// TestEffectiveSystemPowerReading walks the three outcomes of
+// effectiveSystemPowerReading: fallback to a recent cache entry, a positive
+// reading replacing the cache, and an expired cache entry being ignored.
+func TestEffectiveSystemPowerReading(t *testing.T) {
+	now := time.Now()
+	recent := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
+
+	// No current reading: the 5-second-old cache entry stands in.
+	value, next := effectiveSystemPowerReading(recent, 0, now)
+	if value != 480 {
+		t.Fatalf("got=%v want cached 480", value)
+	}
+	if next.Value != 480 {
+		t.Fatalf("updated=%+v", next)
+	}
+
+	// A positive reading is returned and becomes the new cache value.
+	value, next = effectiveSystemPowerReading(recent, 530, now)
+	if value != 530 {
+		t.Fatalf("got=%v want 530", value)
+	}
+	if next.Value != 530 {
+		t.Fatalf("updated=%+v", next)
+	}
+
+	// One second past the hold TTL the cache no longer counts.
+	stale := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
+	value, _ = effectiveSystemPowerReading(stale, 0, now)
+	if value != 0 {
+		t.Fatalf("expired cache returned %v want 0", value)
+	}
+}
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -5,6 +5,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"testing"
 )

@@ -30,21 +31,59 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
 	if len(jobs) != 5 {
 		t.Fatalf("jobs=%d want 5", len(jobs))
 	}
-	if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
-		t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
+	if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
+		t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
 	}
 	if got := jobs[3].cmd[1]; got != "--output-file" {
 		t.Fatalf("bug report flag=%q want --output-file", got)
 	}
 }

-func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
-	t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
-	t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
+// TestAMDStressConfigUsesSingleGSTAction inspects the text produced by
+// amdStressRVSConfig(123). It must contain "module: gst", must not contain
+// "module: mem", must set "copy_matrix: false", must contain "duration: 123000"
+// exactly once, and must set all three matrix sizes to 8640.
+func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
+	t.Parallel()
+
+	cfg := amdStressRVSConfig(123)
+	if !strings.Contains(cfg, "module: gst") {
+		t.Fatalf("config missing gst module:\n%s", cfg)
+	}
+	if strings.Contains(cfg, "module: mem") {
+		t.Fatalf("config should not include mem module:\n%s", cfg)
+	}
+	if !strings.Contains(cfg, "copy_matrix: false") {
+		t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
+	}
+	if strings.Count(cfg, "duration: 123000") != 1 {
+		t.Fatalf("config should apply duration once:\n%s", cfg)
+	}
+	for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
+		if !strings.Contains(cfg, field) {
+			t.Fatalf("config missing %s:\n%s", field, cfg)
+		}
+	}
+}
+
+// TestAMDStressJobsIncludeBandwidthAndGST checks the job list returned by
+// amdStressJobs: four jobs in total, rocm-bandwidth-test at index 1, and rvs at
+// index 2 with the supplied config path as its third command element.
+func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
+	t.Parallel()
+
+	jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
+	if len(jobs) != 4 {
+		t.Fatalf("jobs=%d want 4", len(jobs))
+	}
+	if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
+		t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
+	}
+	if got := jobs[2].cmd[0]; got != "rvs" {
+		t.Fatalf("jobs[2]=%q want rvs", got)
+	}
+	if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
+		t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
+	}
+}
+
+func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
 	jobs := nvidiaSATJobs()
 	got := jobs[4].cmd
-	want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
+	want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
 	if len(got) != len(want) {
 		t.Fatalf("cmd len=%d want %d", len(got), len(want))
 	}
@@ -55,6 +94,126 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
 	}
 }

+// TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices stubs nvidia-smi to report
+// GPUs 0, 1 and 2, excludes GPU 1, and expects the John loader command to be
+// "bee-john-gpu-stress --seconds 600 --devices 0,2" with gpuIndices 0,2.
+//
+// The test replaces the package-level satExecCommand hook, so it deliberately does
+// not call t.Parallel(): a parallel test swapping the same variable would be a
+// data race and the cleanups could restore each other's stubs.
+func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	job, err := buildNvidiaStressJob(NvidiaStressOptions{
+		DurationSec:       600,
+		Loader:            NvidiaStressLoaderJohn,
+		ExcludeGPUIndices: []int{1},
+	})
+	if err != nil {
+		t.Fatalf("buildNvidiaStressJob error: %v", err)
+	}
+	wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
+	if len(job.cmd) != len(wantCmd) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
+	}
+	for i := range wantCmd {
+		if job.cmd[i] != wantCmd[i] {
+			t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
+		}
+	}
+	if got := joinIndexList(job.gpuIndices); got != "0,2" {
+		t.Fatalf("gpuIndices=%q want 0,2", got)
+	}
+}
+
+// TestBuildNvidiaStressJobUsesNCCLLoader stubs nvidia-smi to report GPUs 0, 1 and
+// 2, requests GPUs {2, 0} explicitly, and expects the NCCL loader command to be
+// "bee-nccl-gpu-stress --seconds 120 --devices 0,2" (indices in ascending order).
+//
+// The test replaces the package-level satExecCommand hook, so it deliberately does
+// not call t.Parallel(): a parallel test swapping the same variable would be a
+// data race and the cleanups could restore each other's stubs.
+func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	job, err := buildNvidiaStressJob(NvidiaStressOptions{
+		DurationSec: 120,
+		Loader:      NvidiaStressLoaderNCCL,
+		GPUIndices:  []int{2, 0},
+	})
+	if err != nil {
+		t.Fatalf("buildNvidiaStressJob error: %v", err)
+	}
+	wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
+	if len(job.cmd) != len(wantCmd) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
+	}
+	for i := range wantCmd {
+		if job.cmd[i] != wantCmd[i] {
+			t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
+		}
+	}
+	if got := joinIndexList(job.gpuIndices); got != "0,2" {
+		t.Fatalf("gpuIndices=%q want 0,2", got)
+	}
+}
+
+// TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset stubs nvidia-smi to print the
+// indices 2, 0, 1 and expects resolveDCGMGPUIndices(nil) to return them as 0,1,2.
+//
+// The test replaces the package-level satExecCommand hook, so it deliberately does
+// not call t.Parallel(): a parallel test swapping the same variable would be a
+// data race and the cleanups could restore each other's stubs.
+func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	got, err := resolveDCGMGPUIndices(nil)
+	if err != nil {
+		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+	}
+	if want := "0,1,2"; joinIndexList(got) != want {
+		t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
+	}
+}
+
+// TestResolveDCGMGPUIndicesKeepsExplicitSelection passes the explicit selection
+// {3, 1, 3} and expects it back as "1,3": duplicates removed, ascending order.
+func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
+	t.Parallel()
+
+	got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
+	if err != nil {
+		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+	}
+	if want := "1,3"; joinIndexList(got) != want {
+		t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
+	}
+}
+
+// TestNvidiaStressArchivePrefixByLoader is a table test for
+// nvidiaStressArchivePrefix: each loader constant maps to its own archive prefix,
+// and the empty loader falls back to the builtin "gpu-nvidia-burn" prefix.
+func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		loader string
+		want   string
+	}{
+		{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
+		{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
+		{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
+		{loader: "", want: "gpu-nvidia-burn"},
+	}
+	for _, tt := range tests {
+		if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
+			t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
+		}
+	}
+}
+
 func TestEnvIntFallback(t *testing.T) {
 	os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
 	if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
@@ -80,8 +239,8 @@ func TestClassifySATResult(t *testing.T) {
 	}{
 		{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
 		{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
-		{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
-		{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
+		{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
+		{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 	}

 	for _, tt := range tests {
@@ -130,6 +289,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
 	}
 }

+// TestResolveSATCommandUsesLookPathForGenericTools stubs satLookPath so that only
+// "stress-ng" resolves, then expects resolveSATCommand to keep all three command
+// elements and replace the first with the resolved path /usr/bin/stress-ng.
+// The stub is restored in a cleanup; the test is not parallel.
+func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
+	oldLookPath := satLookPath
+	satLookPath = func(file string) (string, error) {
+		if file == "stress-ng" {
+			return "/usr/bin/stress-ng", nil
+		}
+		return "", exec.ErrNotFound
+	}
+	t.Cleanup(func() { satLookPath = oldLookPath })
+
+	cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
+	if err != nil {
+		t.Fatalf("resolveSATCommand error: %v", err)
+	}
+	if len(cmd) != 3 {
+		t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
+	}
+	if cmd[0] != "/usr/bin/stress-ng" {
+		t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
+	}
+}
+
+// TestResolveSATCommandFailsForMissingGenericTool stubs satLookPath to fail for
+// every name and expects resolveSATCommand to return an error whose text contains
+// "stress-ng not found in PATH".
+func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
+	oldLookPath := satLookPath
+	satLookPath = func(file string) (string, error) {
+		return "", exec.ErrNotFound
+	}
+	t.Cleanup(func() { satLookPath = oldLookPath })
+
+	_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
+	if err == nil {
+		t.Fatal("expected error")
+	}
+	if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
+		t.Fatalf("error=%q", err)
+	}
+}
+
 func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
 	tmp := t.TempDir()
 	execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -17,6 +17,10 @@ func (s *System) ListBeeServices() ([]string, error) {
 		}
 		for _, match := range matches {
 			name := strings.TrimSuffix(filepath.Base(match), ".service")
+			// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
+			if strings.HasSuffix(name, "@") {
+				continue
+			}
 			if !seen[name] {
 				seen[name] = true
 				out = append(out, name)
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -2,12 +2,31 @@ package platform

 type System struct{}

+// LiveBootSource is serialized to JSON with the keys in_ram, kind, source and
+// device; source and device are omitted when empty.
+// NOTE(review): judging by the names it describes where the live system booted
+// from and whether it runs from RAM — confirm against the code that fills it.
+type LiveBootSource struct {
+	InRAM  bool   `json:"in_ram"`
+	Kind   string `json:"kind"`
+	Source string `json:"source,omitempty"`
+	Device string `json:"device,omitempty"`
+}
+
 type InterfaceInfo struct {
 	Name  string
 	State string
 	IPv4  []string
 }

+// NetworkInterfaceSnapshot holds one interface's name, an up/down flag and its
+// list of IPv4 addresses. It is the element type of NetworkSnapshot.Interfaces.
+type NetworkInterfaceSnapshot struct {
+	Name string
+	Up   bool
+	IPv4 []string
+}
+
+// NetworkSnapshot groups per-interface snapshots with a list of default routes
+// and a ResolvConf string.
+// NOTE(review): ResolvConf presumably carries the contents of resolv.conf and the
+// whole value is presumably captured for a later restore — confirm with the caller.
+type NetworkSnapshot struct {
+	Interfaces    []NetworkInterfaceSnapshot
+	DefaultRoutes []string
+	ResolvConf    string
+}
+
 type ServiceAction string

 const (
@@ -39,6 +58,20 @@ type ToolStatus struct {
 	OK   bool
 }

+// Valid values for NvidiaStressOptions.Loader; each names one NVIDIA stress
+// loader implementation.
+const (
+	NvidiaStressLoaderBuiltin = "builtin"
+	NvidiaStressLoaderJohn    = "john"
+	NvidiaStressLoaderNCCL    = "nccl"
+)
+
+// NvidiaStressOptions configures an NVIDIA GPU stress run.
+type NvidiaStressOptions struct {
+	// DurationSec is the requested run time in seconds.
+	DurationSec int
+	// SizeMB is a size in MiB. NOTE(review): presumably the per-GPU working-set
+	// size for the builtin loader — its consumer is not visible here.
+	SizeMB int
+	// Loader selects the stress implementation (one of the NvidiaStressLoader* values).
+	Loader string
+	// GPUIndices is an explicit list of GPU indices to run on.
+	GPUIndices []int
+	// ExcludeGPUIndices lists GPU indices to leave out of the run.
+	ExcludeGPUIndices []int
+}
+
 func New() *System {
 	return &System{}
 }
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -0,0 +1,92 @@
+package webui
+
+import (
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+)
+
+// TestXrandrCommandAddsDefaultX11Env clears DISPLAY and XAUTHORITY in the process
+// environment and expects the command built by xrandrCommand to carry the
+// defaults DISPLAY=:0 and XAUTHORITY=/home/bee/.Xauthority in its Env.
+func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
+	t.Setenv("DISPLAY", "")
+	t.Setenv("XAUTHORITY", "")
+
+	cmd := xrandrCommand("--query")
+
+	var hasDisplay bool
+	var hasXAuthority bool
+	for _, kv := range cmd.Env {
+		if kv == "DISPLAY=:0" {
+			hasDisplay = true
+		}
+		if kv == "XAUTHORITY=/home/bee/.Xauthority" {
+			hasXAuthority = true
+		}
+	}
+	if !hasDisplay {
+		t.Fatalf("DISPLAY not injected: %v", cmd.Env)
+	}
+	if !hasXAuthority {
+		t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
+	}
+}
+
+// TestHandleAPISATRunDecodesBodyWithoutContentLength posts a JSON body with
+// ContentLength set to -1 (unknown length) to the "cpu" SAT run handler and
+// expects HTTP 200 plus exactly one queued task whose BurnProfile is "smoke",
+// i.e. the body must still be decoded when no Content-Length is available.
+func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
+	// Swap out the shared queue contents for the duration of the test and put
+	// them back afterwards.
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
+	req.ContentLength = -1
+	rec := httptest.NewRecorder()
+
+	h.handleAPISATRun("cpu").ServeHTTP(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 1 {
+		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
+		t.Fatalf("burn profile=%q want smoke", got)
+	}
+}
+
+
+// TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples pushes two fan
+// samples: the first with FAN_A and FAN_B, the second with FAN_B only. It expects
+// both fans to stay tracked by name in first-seen order, both rings to hold two
+// points, FAN_A to repeat its last value (4200) for the missing sample, and
+// FAN_B to record the new value (5200).
+func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
+	h := &handler{}
+	h.pushFanRings([]platform.FanReading{
+		{Name: "FAN_A", RPM: 4200},
+		{Name: "FAN_B", RPM: 5100},
+	})
+	h.pushFanRings([]platform.FanReading{
+		{Name: "FAN_B", RPM: 5200},
+	})
+
+	if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
+		t.Fatalf("fanNames=%v", h.fanNames)
+	}
+	aVals, _ := h.ringFans[0].snapshot()
+	bVals, _ := h.ringFans[1].snapshot()
+	if len(aVals) != 2 || len(bVals) != 2 {
+		t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
+	}
+	if aVals[1] != 4200 {
+		t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
+	}
+	if bVals[1] != 5200 {
+		t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
+	}
+}
--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -1,18 +1,21 @@
 package webui

 import (
+	"os"
+	"strings"
 	"sync"
 	"time"
 )

 // jobState holds the output lines and completion status of an async job.
 type jobState struct {
-	lines  []string
-	done   bool
-	err    string
-	mu     sync.Mutex
-	subs   []chan string
-	cancel func() // optional cancel function; nil if job is not cancellable
+	lines   []string
+	done    bool
+	err     string
+	mu      sync.Mutex
+	subs    []chan string
+	cancel  func() // optional cancel function; nil if job is not cancellable
+	logPath string
 }

 // abort cancels the job if it has a cancel function and is not yet done.
@@ -30,6 +33,9 @@ func (j *jobState) append(line string) {
 	j.mu.Lock()
 	defer j.mu.Unlock()
 	j.lines = append(j.lines, line)
+	if j.logPath != "" {
+		appendJobLog(j.logPath, line)
+	}
 	for _, ch := range j.subs {
 		select {
 		case ch <- line:
@@ -78,12 +84,12 @@ func (m *jobManager) create(id string) *jobState {
 	j := &jobState{}
 	m.jobs[id] = j
 	// Schedule cleanup after 30 minutes
-	go func() {
+	goRecoverOnce("job cleanup", func() {
 		time.Sleep(30 * time.Minute)
 		m.mu.Lock()
 		delete(m.jobs, id)
 		m.mu.Unlock()
-	}()
+	})
 	return j
 }

@@ -100,3 +106,32 @@ func (m *jobManager) get(id string) (*jobState, bool) {
 	j, ok := m.jobs[id]
 	return j, ok
 }
+
+// newTaskJobState returns a jobState bound to logPath and pre-filled with the
+// lines already present in that file, so output written earlier is visible again.
+// An empty path, an unreadable file or an empty file all yield a state with no
+// lines. CRLF line endings are normalized to LF and the empty element produced by
+// a trailing newline is dropped.
+func newTaskJobState(logPath string) *jobState {
+	state := &jobState{logPath: logPath}
+	if logPath == "" {
+		return state
+	}
+	raw, err := os.ReadFile(logPath)
+	if err != nil || len(raw) == 0 {
+		return state
+	}
+	text := strings.ReplaceAll(string(raw), "\r\n", "\n")
+	restored := strings.Split(text, "\n")
+	// Split never returns an empty slice here, so the last element always exists.
+	if last := len(restored) - 1; restored[last] == "" {
+		restored = restored[:last]
+	}
+	state.lines = append(state.lines, restored...)
+	return state
+}
+
+// appendJobLog appends line plus a trailing newline to the file at path, creating
+// the file (mode 0644) when it does not exist. It is best-effort: an empty path is
+// a no-op and open/write errors are silently ignored.
+func appendJobLog(path, line string) {
+	if path == "" {
+		return
+	}
+	const flags = os.O_CREATE | os.O_APPEND | os.O_WRONLY
+	if f, err := os.OpenFile(path, flags, 0644); err == nil {
+		_, _ = f.WriteString(line + "\n")
+		_ = f.Close()
+	}
+}
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -0,0 +1,241 @@
+package webui
+
+import (
+	"bufio"
+	"io"
+	"log/slog"
+	"os"
+	"strings"
+	"sync"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+)
+
+// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
+// It supports multiple concurrent SAT tasks: a shared event window is open
+// while any SAT task is running, and flushed when all tasks complete.
+type kmsgWatcher struct {
+	// mu guards activeCount and window.
+	mu          sync.Mutex
+	activeCount int // number of in-flight SAT tasks
+	// window is the currently open event window; nil while no SAT task is running.
+	window      *kmsgWindow
+	// statusDB receives the flushed events; when nil, flushing is a no-op.
+	statusDB    *app.ComponentStatusDB
+}
+
+// kmsgWindow collects the kernel error events observed between the start of the
+// first SAT task and the end of the last one.
+type kmsgWindow struct {
+	targets   []string // SAT targets running concurrently
+	// startedAt is the time the window was opened.
+	startedAt time.Time
+	// seen marks the (id, category) pairs already recorded, for deduplication.
+	seen      map[kmsgEventKey]bool
+	// events holds the recorded events in arrival order.
+	events    []kmsgEvent
+}
+
+// kmsgEventKey is the deduplication key for events inside one window. Events that
+// carry no id use the empty string as id.
+type kmsgEventKey struct {
+	id       string // BDF or device name
+	category string
+}
+
+// kmsgEvent is one kernel log message that matched a hardware error pattern.
+type kmsgEvent struct {
+	// timestamp is the wall-clock time at which the line was parsed (not the
+	// kernel's own timestamp field).
+	timestamp time.Time
+	// raw is the message text with the kmsg record prefix removed.
+	raw       string
+	ids       []string // BDF addresses or device names extracted
+	// category is copied from the matching pattern's Category.
+	category  string
+}
+
+// newKmsgWatcher returns an idle watcher (no open window, no active tasks) that
+// will report to statusDB. It does not start reading; call start for that.
+func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
+	return &kmsgWatcher{statusDB: statusDB}
+}
+
+// start launches the background kmsg reading goroutine by handing w.run to
+// goRecoverLoop under the name "kmsg watcher" with a 5-second interval.
+// NOTE(review): goRecoverLoop is defined elsewhere; presumably it recovers panics
+// and restarts run after that interval — confirm.
+func (w *kmsgWatcher) start() {
+	goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
+}
+
+// run is the watcher's read loop and never returns. It opens /dev/kmsg, seeks to
+// the end, scans it line by line and records every line that parseKmsgLine
+// recognizes while an event window is open. When the device cannot be opened it
+// retries after 30 seconds; when the scan ends it closes the file and reopens it
+// after 2 seconds.
+func (w *kmsgWatcher) run() {
+	for {
+		f, err := os.Open("/dev/kmsg")
+		if err != nil {
+			slog.Warn("kmsg watcher unavailable", "err", err)
+			time.Sleep(30 * time.Second)
+			continue
+		}
+		// Best-effort seek to end so we only capture events from now forward.
+		_, _ = f.Seek(0, io.SeekEnd)
+
+		scanner := bufio.NewScanner(f)
+		// Allow lines up to 64 KiB instead of the scanner's default limit.
+		scanner.Buffer(make([]byte, 64*1024), 64*1024)
+		for scanner.Scan() {
+			line := scanner.Text()
+			evt, ok := parseKmsgLine(line)
+			if !ok {
+				continue
+			}
+			// Matching lines seen while no SAT task is running are dropped.
+			w.mu.Lock()
+			if w.window != nil {
+				w.recordEvent(evt)
+			}
+			w.mu.Unlock()
+		}
+		if err := scanner.Err(); err != nil {
+			slog.Warn("kmsg watcher stopped", "err", err)
+		}
+		// Closed explicitly rather than deferred: this function never returns.
+		_ = f.Close()
+		time.Sleep(2 * time.Second)
+	}
+}
+
+// recordEvent stores evt in the active window once for every (id, category) pair
+// that has not been recorded yet; an event without ids is keyed by the empty id.
+// The caller must hold w.mu and must have checked that w.window is non-nil.
+func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
+	ids := evt.ids
+	if len(ids) == 0 {
+		ids = []string{""}
+	}
+	win := w.window
+	for _, id := range ids {
+		key := kmsgEventKey{id: id, category: evt.category}
+		if win.seen[key] {
+			continue
+		}
+		win.seen[key] = true
+		win.events = append(win.events, evt)
+	}
+}
+
+// NotifyTaskStarted registers one more running SAT task. The first task to start
+// opens a fresh shared event window; every task adds its target to the window's
+// target list. taskID is accepted for symmetry with NotifyTaskFinished and is not
+// used.
+func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	firstTask := w.activeCount == 0
+	w.activeCount++
+	if firstTask {
+		w.window = &kmsgWindow{
+			startedAt: time.Now(),
+			seen:      map[kmsgEventKey]bool{},
+		}
+	}
+	if w.window == nil {
+		return
+	}
+	w.window.targets = append(w.window.targets, target)
+}
+
+// NotifyTaskFinished unregisters one running SAT task. Once the counter reaches
+// zero (it is clamped so unmatched calls cannot drive it negative) the shared
+// window is detached, and if it holds any events they are flushed to the status
+// DB on a separate goroutine, outside the lock.
+func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
+	var finished *kmsgWindow
+
+	w.mu.Lock()
+	if w.activeCount--; w.activeCount <= 0 {
+		w.activeCount = 0
+		finished, w.window = w.window, nil
+	}
+	w.mu.Unlock()
+
+	if finished != nil && len(finished.events) > 0 {
+		goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(finished) })
+	}
+}
+
+// flushWindow turns the events of a closed window into component status records.
+// Each event is mapped to one or more component keys:
+//   - no ids: "memory:all" for category "memory", otherwise "cpu:all";
+//   - category "storage": "storage:<id>";
+//   - anything else: "pcie:<normalized BDF>".
+//
+// Only the first raw message per key is kept. Every key is then recorded with
+// source "watchdog:kmsg", status "Warning" and a detail string naming the SAT
+// targets plus the message truncated to 120 bytes. Does nothing without a
+// status DB.
+func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
+	if w.statusDB == nil {
+		return
+	}
+	source := "watchdog:kmsg"
+
+	firstRaw := map[string]string{} // componentKey → first raw line
+	remember := func(key, raw string) {
+		if _, dup := firstRaw[key]; !dup {
+			firstRaw[key] = raw
+		}
+	}
+	for _, evt := range window.events {
+		if len(evt.ids) == 0 {
+			// MCE or un-identified error.
+			if evt.category == "memory" {
+				remember("memory:all", evt.raw)
+			} else {
+				remember("cpu:all", evt.raw)
+			}
+			continue
+		}
+		for _, id := range evt.ids {
+			if evt.category == "storage" {
+				remember("storage:"+id, evt.raw)
+			} else {
+				remember("pcie:"+normalizeBDF(id), evt.raw)
+			}
+		}
+	}
+
+	prefix := "kernel error during SAT (" + strings.Join(window.targets, ",") + "): "
+	for key, raw := range firstRaw {
+		w.statusDB.Record(key, source, "Warning", prefix+truncate(raw, 120))
+	}
+}
+
+// parseKmsgLine extracts a hardware error event from one /dev/kmsg record.
+// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
+//
+// Everything up to and including the first ';' is discarded (a line without ';'
+// is used as-is). The message is matched against platform.HardwareErrorPatterns
+// in order and the first match wins; its BDF capture group (normalized) and
+// device capture group, when configured and present, become the event ids.
+// Returns false for an empty message or when no pattern matches.
+func parseKmsgLine(raw string) (kmsgEvent, bool) {
+	msg := raw
+	if sep := strings.Index(raw, ";"); sep != -1 {
+		msg = strings.TrimSpace(raw[sep+1:])
+	}
+	if msg == "" {
+		return kmsgEvent{}, false
+	}
+
+	for _, pat := range platform.HardwareErrorPatterns {
+		groups := pat.Re.FindStringSubmatch(msg)
+		if groups == nil {
+			continue
+		}
+		seenAt := time.Now()
+		var ids []string
+		if g := pat.BDFGroup; g > 0 && g < len(groups) {
+			ids = append(ids, normalizeBDF(groups[g]))
+		}
+		if g := pat.DevGroup; g > 0 && g < len(groups) {
+			ids = append(ids, groups[g])
+		}
+		return kmsgEvent{timestamp: seenAt, raw: msg, ids: ids, category: pat.Category}, true
+	}
+	return kmsgEvent{}, false
+}
+
+// normalizeBDF returns bdf trimmed and lower-cased; a value containing exactly one
+// ':' (the short "c8:00.0" form) additionally gets the "0000:" domain prefix, so
+// PCIe addresses end up in the 4-part form "0000:c8:00.0". Any other input is
+// only trimmed and lower-cased.
+func normalizeBDF(bdf string) string {
+	canonical := strings.TrimSpace(bdf)
+	canonical = strings.ToLower(canonical)
+	if strings.Count(canonical, ":") != 1 {
+		return canonical
+	}
+	return "0000:" + canonical
+}
+
+// truncate shortens s to at most max bytes and appends "..." when anything was
+// cut off; strings that already fit are returned unchanged.
+//
+// The cut point is moved back to the nearest UTF-8 rune boundary so a multi-byte
+// character is never split into invalid bytes, and a negative max is treated as
+// 0 instead of panicking in the slice expression.
+func truncate(s string, max int) string {
+	if max < 0 {
+		max = 0
+	}
+	if len(s) <= max {
+		return s
+	}
+	cut := max
+	// Bytes of the form 0b10xxxxxx are UTF-8 continuation bytes; step back until
+	// the cut lands on the first byte of a rune.
+	for cut > 0 && s[cut]&0xC0 == 0x80 {
+		cut--
+	}
+	return s[:cut] + "..."
+}
+
+// isSATTarget reports whether target names a task that runs a hardware
+// acceptance test. Matching is exact and case-sensitive.
+func isSATTarget(target string) bool {
+	switch target {
+	case "nvidia", "nvidia-stress":
+	case "amd", "amd-mem", "amd-bandwidth", "amd-stress":
+	case "memory", "memory-stress", "storage", "cpu":
+	case "sat-stress", "platform-stress":
+	default:
+		return false
+	}
+	return true
+}
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -0,0 +1,372 @@
+package webui
+
+import (
+	"database/sql"
+	"encoding/csv"
+	"io"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+
+	"bee/audit/internal/platform"
+	_ "modernc.org/sqlite"
+)
+
+// metricsDBPath is the filesystem path of the SQLite metrics database.
+// NOTE(review): its user is outside this chunk; presumably passed to openMetricsDB.
+const metricsDBPath = "/appdata/bee/metrics.db"
+
+// MetricsDB persists live metric samples to SQLite. Samples are stored
+// normalized across the sys_metrics, gpu_metrics, fan_metrics and temp_metrics
+// tables, all keyed by the sample's Unix timestamp in seconds.
+type MetricsDB struct {
+	db *sql.DB
+}
+
+// openMetricsDB opens (or creates) the metrics database at the given path. The
+// parent directory is created first, and the schema is created or migrated
+// before the handle is returned; on schema failure the database is closed again.
+//
+// The modernc.org/sqlite driver imported by this file takes connection pragmas
+// through repeated `_pragma=` DSN parameters. The `_journal` / `_busy_timeout`
+// keys are another driver's spelling and are not recognized here, so WAL
+// journaling and the 5000 ms busy timeout are requested via `_pragma`.
+func openMetricsDB(path string) (*MetricsDB, error) {
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return nil, err
+	}
+	db, err := sql.Open("sqlite", path+"?_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)")
+	if err != nil {
+		return nil, err
+	}
+	// Limit the pool to a single connection so all access is serialized.
+	db.SetMaxOpenConns(1)
+	if err := initMetricsSchema(db); err != nil {
+		_ = db.Close()
+		return nil, err
+	}
+	return &MetricsDB{db: db}, nil
+}
+
+// initMetricsSchema creates the four metrics tables when they do not exist yet
+// and then makes sure gpu_metrics has the clock_mhz and mem_clock_mhz columns,
+// adding them to databases created before those columns were introduced.
+func initMetricsSchema(db *sql.DB) error {
+	_, err := db.Exec(`
+CREATE TABLE IF NOT EXISTS sys_metrics (
+  ts           INTEGER NOT NULL,
+  cpu_load_pct REAL,
+  mem_load_pct REAL,
+  power_w      REAL,
+  PRIMARY KEY (ts)
+);
+CREATE TABLE IF NOT EXISTS gpu_metrics (
+  ts            INTEGER NOT NULL,
+  gpu_index     INTEGER NOT NULL,
+  temp_c        REAL,
+  usage_pct     REAL,
+  mem_usage_pct REAL,
+  power_w       REAL,
+  clock_mhz     REAL,
+  mem_clock_mhz REAL,
+  PRIMARY KEY (ts, gpu_index)
+);
+CREATE TABLE IF NOT EXISTS fan_metrics (
+  ts   INTEGER NOT NULL,
+  name TEXT NOT NULL,
+  rpm  REAL,
+  PRIMARY KEY (ts, name)
+);
+CREATE TABLE IF NOT EXISTS temp_metrics (
+  ts      INTEGER NOT NULL,
+  name    TEXT NOT NULL,
+  grp     TEXT NOT NULL,
+  celsius REAL,
+  PRIMARY KEY (ts, name)
+);
+`)
+	if err != nil {
+		return err
+	}
+	// CREATE TABLE IF NOT EXISTS leaves an existing table untouched, so columns
+	// added later have to be back-filled explicitly.
+	if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
+		return err
+	}
+	return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
+}
+
+// ensureMetricsColumn adds column (with the given type definition) to table
+// unless PRAGMA table_info already lists a column of that name, compared
+// case-insensitively.
+//
+// table, column and definition are concatenated straight into SQL text, so they
+// must be trusted identifiers; the visible callers pass string literals only.
+func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
+	rows, err := db.Query("PRAGMA table_info(" + table + ")")
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		// Scan targets follow the six-column order of the table_info result.
+		var cid int
+		var name, ctype string
+		var notNull, pk int
+		var dflt sql.NullString
+		if err := rows.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); err != nil {
+			return err
+		}
+		if strings.EqualFold(name, column) {
+			return nil
+		}
+	}
+	if err := rows.Err(); err != nil {
+		return err
+	}
+	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
+	return err
+}
+
+// Write stores one sample in a single transaction: one sys_metrics row plus one
+// row per GPU, fan and temperature reading. Rows are keyed by the sample time in
+// whole Unix seconds and written with INSERT OR REPLACE, so a second sample in
+// the same second overwrites the first. Any failure rolls the whole sample back.
+func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
+	unixSec := s.Timestamp.Unix()
+
+	tx, err := m.db.Begin()
+	if err != nil {
+		return err
+	}
+	// Rollback after a successful Commit is a harmless no-op.
+	defer func() { _ = tx.Rollback() }()
+
+	if _, err := tx.Exec(
+		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
+		unixSec, s.CPULoadPct, s.MemLoadPct, s.PowerW,
+	); err != nil {
+		return err
+	}
+	for _, gpu := range s.GPUs {
+		if _, err := tx.Exec(
+			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
+			unixSec, gpu.GPUIndex, gpu.TempC, gpu.UsagePct, gpu.MemUsagePct, gpu.PowerW, gpu.ClockMHz, gpu.MemClockMHz,
+		); err != nil {
+			return err
+		}
+	}
+	for _, fan := range s.Fans {
+		if _, err := tx.Exec(
+			`INSERT OR REPLACE INTO fan_metrics(ts,name,rpm) VALUES(?,?,?)`,
+			unixSec, fan.Name, fan.RPM,
+		); err != nil {
+			return err
+		}
+	}
+	for _, temp := range s.Temps {
+		if _, err := tx.Exec(
+			`INSERT OR REPLACE INTO temp_metrics(ts,name,grp,celsius) VALUES(?,?,?,?)`,
+			unixSec, temp.Name, temp.Group, temp.Celsius,
+		); err != nil {
+			return err
+		}
+	}
+	return tx.Commit()
+}
+
+// LoadRecent returns up to n samples in chronological order (oldest first).
+// The inner query picks the n newest sys_metrics rows (ts DESC, LIMIT n); the
+// outer query puts them back into ascending ts order for loadSamples.
+func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
+}
+
+// LoadAll returns all persisted samples in chronological order (oldest first).
+// The query has no placeholders, so no bind arguments are passed; a literal nil
+// here would be forwarded to the driver as one stray argument.
+func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`)
+}
+
+// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
+//
+// query (with args) must select ts, cpu_load_pct, mem_load_pct and power_w from
+// sys_metrics in ascending ts order: the first and last returned rows bound the
+// ts range used to fetch the matching GPU, fan and temperature rows, which are
+// then attached to the sample with the same ts.
+//
+// It returns (nil, nil) when the query yields no rows. Individual rows that fail
+// to scan are skipped, and a failing GPU/fan/temp query is tolerated (the samples
+// simply lack that data). An error that aborts iteration of the primary query is
+// returned rather than being mistaken for the end of the result set.
+func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
+	rows, err := m.db.Query(query, args...)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	type sysRow struct {
+		ts            int64
+		cpu, mem, pwr float64
+	}
+	var sysRows []sysRow
+	for rows.Next() {
+		var r sysRow
+		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
+			continue
+		}
+		sysRows = append(sysRows, r)
+	}
+	// Next returns false both at the end of the data and on failure; only Err
+	// tells the two apart.
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	if len(sysRows) == 0 {
+		return nil, nil
+	}
+	// Collect min/max ts for range query
+	minTS := sysRows[0].ts
+	maxTS := sysRows[len(sysRows)-1].ts
+
+	// Load GPU rows in range
+	type gpuKey struct {
+		ts  int64
+		idx int
+	}
+	gpuData := map[gpuKey]platform.GPUMetricRow{}
+	gRows, err := m.db.Query(
+		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
+		minTS, maxTS,
+	)
+	if err == nil {
+		defer gRows.Close()
+		for gRows.Next() {
+			var ts int64
+			var g platform.GPUMetricRow
+			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
+				gpuData[gpuKey{ts, g.GPUIndex}] = g
+			}
+		}
+	}
+
+	// Load fan rows in range
+	type fanKey struct {
+		ts   int64
+		name string
+	}
+	fanData := map[fanKey]float64{}
+	fRows, err := m.db.Query(
+		`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
+	)
+	if err == nil {
+		defer fRows.Close()
+		for fRows.Next() {
+			var ts int64
+			var name string
+			var rpm float64
+			if err := fRows.Scan(&ts, &name, &rpm); err == nil {
+				fanData[fanKey{ts, name}] = rpm
+			}
+		}
+	}
+
+	// Load temp rows in range
+	type tempKey struct {
+		ts   int64
+		name string
+	}
+	tempData := map[tempKey]platform.TempReading{}
+	tRows, err := m.db.Query(
+		`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
+	)
+	if err == nil {
+		defer tRows.Close()
+		for tRows.Next() {
+			var ts int64
+			var t platform.TempReading
+			if err := tRows.Scan(&ts, &t.Name, &t.Group, &t.Celsius); err == nil {
+				tempData[tempKey{ts, t.Name}] = t
+			}
+		}
+	}
+
+	// Collect unique GPU indices and fan/temp names from loaded data.
+	// Sort each list so that sample reconstruction is deterministic regardless
+	// of Go's non-deterministic map iteration order.
+	seenGPU := map[int]bool{}
+	var gpuIndices []int
+	for k := range gpuData {
+		if !seenGPU[k.idx] {
+			seenGPU[k.idx] = true
+			gpuIndices = append(gpuIndices, k.idx)
+		}
+	}
+	sort.Ints(gpuIndices)
+
+	seenFan := map[string]bool{}
+	var fanNames []string
+	for k := range fanData {
+		if !seenFan[k.name] {
+			seenFan[k.name] = true
+			fanNames = append(fanNames, k.name)
+		}
+	}
+	sort.Strings(fanNames)
+
+	seenTemp := map[string]bool{}
+	var tempNames []string
+	for k := range tempData {
+		if !seenTemp[k.name] {
+			seenTemp[k.name] = true
+			tempNames = append(tempNames, k.name)
+		}
+	}
+	sort.Strings(tempNames)
+
+	// Rebuild one sample per sys row, attaching whatever GPU/fan/temp rows share
+	// its timestamp.
+	samples := make([]platform.LiveMetricSample, len(sysRows))
+	for i, r := range sysRows {
+		s := platform.LiveMetricSample{
+			Timestamp:  time.Unix(r.ts, 0).UTC(),
+			CPULoadPct: r.cpu,
+			MemLoadPct: r.mem,
+			PowerW:     r.pwr,
+		}
+		for _, idx := range gpuIndices {
+			if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
+				s.GPUs = append(s.GPUs, g)
+			}
+		}
+		for _, name := range fanNames {
+			if rpm, ok := fanData[fanKey{r.ts, name}]; ok {
+				s.Fans = append(s.Fans, platform.FanReading{Name: name, RPM: rpm})
+			}
+		}
+		for _, name := range tempNames {
+			if t, ok := tempData[tempKey{r.ts, name}]; ok {
+				s.Temps = append(s.Temps, t)
+			}
+		}
+		samples[i] = s
+	}
+	return samples, nil
+}
+
+// ExportCSV writes all sys+gpu data as CSV to w: a header row, then one row per
+// (sample, GPU) pair ordered by timestamp and GPU index. A sample without GPU
+// rows still produces one row, with the seven GPU columns left empty.
+//
+// Rows that fail to scan are skipped. It returns the first error reported by the
+// CSV writer, or otherwise the error (if any) that ended iteration over the query
+// result early, so a truncated export is not reported as success.
+func (m *MetricsDB) ExportCSV(w io.Writer) error {
+	rows, err := m.db.Query(`
+		SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
+		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
+		       g.clock_mhz, g.mem_clock_mhz
+		FROM sys_metrics s
+		LEFT JOIN gpu_metrics g ON g.ts = s.ts
+		ORDER BY s.ts, g.gpu_index
+	`)
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	cw := csv.NewWriter(w)
+	// Write errors are sticky inside csv.Writer and surface through cw.Error()
+	// below, so the individual results can be dropped here.
+	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
+	for rows.Next() {
+		var ts int64
+		var cpu, mem, pwr float64
+		// The GPU side of the LEFT JOIN is NULL for samples without GPU rows.
+		var gpuIdx sql.NullInt64
+		var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
+		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
+			continue
+		}
+		row := []string{
+			strconv.FormatInt(ts, 10),
+			strconv.FormatFloat(cpu, 'f', 2, 64),
+			strconv.FormatFloat(mem, 'f', 2, 64),
+			strconv.FormatFloat(pwr, 'f', 1, 64),
+		}
+		if gpuIdx.Valid {
+			row = append(row,
+				strconv.FormatInt(gpuIdx.Int64, 10),
+				strconv.FormatFloat(gpuTemp.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
+			)
+		} else {
+			row = append(row, "", "", "", "", "", "", "")
+		}
+		_ = cw.Write(row)
+	}
+	cw.Flush()
+	if err := cw.Error(); err != nil {
+		return err
+	}
+	// Next returns false both at the end of the data and on failure; only Err
+	// tells the two apart.
+	return rows.Err()
+}
+
+// Close releases the underlying database handle. The error from closing is
+// deliberately discarded, which is why Close has no return value.
+func (m *MetricsDB) Close() {
+	_ = m.db.Close()
+}
+
+// nullFloat wraps v in a sql.NullFloat64 marked as valid (non-NULL).
+func nullFloat(v float64) sql.NullFloat64 {
+	var out sql.NullFloat64
+	out.Float64, out.Valid = v, true
+	return out
+}
--- a/audit/internal/webui/metricsdb_test.go
+++ b/audit/internal/webui/metricsdb_test.go
@@ -0,0 +1,145 @@
+package webui
+
+import (
+	"database/sql"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"bee/audit/internal/platform"
+	_ "modernc.org/sqlite"
+)
+
+// TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs writes 3 samples with 2 GPU rows each and checks LoadAll and LoadRecent(2).
+func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
+	db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	defer db.Close()
+	base := time.Unix(1_700_000_000, 0).UTC()
+	for i := 0; i < 3; i++ {
+		err := db.Write(platform.LiveMetricSample{
+			Timestamp:  base.Add(time.Duration(i) * time.Second),
+			CPULoadPct: float64(10 + i),
+			MemLoadPct: float64(20 + i),
+			PowerW:     float64(300 + i),
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, PowerW: float64(100 + i)},
+				{GPUIndex: 2, PowerW: float64(200 + i)},
+			},
+		})
+		if err != nil {
+			t.Fatalf("Write(%d): %v", i, err)
+		}
+	}
+
+	all, err := db.LoadAll()
+	if err != nil {
+		t.Fatalf("LoadAll: %v", err)
+	}
+	if len(all) != 3 {
+		t.Fatalf("LoadAll len=%d want 3", len(all))
+	}
+	for i, sample := range all {
+		if len(sample.GPUs) != 2 {
+			t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
+		}
+		if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
+			t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
+		}
+		if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
+			t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
+		}
+	}
+
+	recent, err := db.LoadRecent(2)
+	if err != nil {
+		t.Fatalf("LoadRecent: %v", err)
+	}
+	if len(recent) != 2 {
+		t.Fatalf("LoadRecent len=%d want 2", len(recent))
+	}
+	if !recent[0].Timestamp.Before(recent[1].Timestamp) {
+		t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
+	}
+	for i, sample := range recent {
+		if len(sample.GPUs) != 2 {
+			t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
+		}
+	}
+}
+
+// TestMetricsDBMigratesLegacyGPUSchema opens a DB whose gpu_metrics table has no clock columns and checks that clocks round-trip.
+func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "metrics.db")
+	raw, err := sql.Open("sqlite", path)
+	if err != nil {
+		t.Fatalf("sql.Open: %v", err)
+	}
+	_, err = raw.Exec(`
+CREATE TABLE gpu_metrics (
+  ts            INTEGER NOT NULL,
+  gpu_index     INTEGER NOT NULL,
+  temp_c        REAL,
+  usage_pct     REAL,
+  mem_usage_pct REAL,
+  power_w       REAL,
+  PRIMARY KEY (ts, gpu_index)
+);
+CREATE TABLE sys_metrics (
+  ts           INTEGER NOT NULL,
+  cpu_load_pct REAL,
+  mem_load_pct REAL,
+  power_w      REAL,
+  PRIMARY KEY (ts)
+);
+CREATE TABLE fan_metrics (
+  ts   INTEGER NOT NULL,
+  name TEXT NOT NULL,
+  rpm  REAL,
+  PRIMARY KEY (ts, name)
+);
+CREATE TABLE temp_metrics (
+  ts      INTEGER NOT NULL,
+  name    TEXT NOT NULL,
+  grp     TEXT NOT NULL,
+  celsius REAL,
+  PRIMARY KEY (ts, name)
+);
+`)
+	if err != nil {
+		t.Fatalf("create legacy schema: %v", err)
+	}
+	_ = raw.Close()
+	// Reopen the legacy file through openMetricsDB, the constructor under test.
+	db, err := openMetricsDB(path)
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	defer db.Close()
+	now := time.Unix(1_700_000_100, 0).UTC()
+	err = db.Write(platform.LiveMetricSample{
+		Timestamp: now,
+		GPUs: []platform.GPUMetricRow{
+			{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
+		},
+	})
+	if err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+
+	samples, err := db.LoadAll()
+	if err != nil {
+		t.Fatalf("LoadAll: %v", err)
+	}
+	if len(samples) != 1 || len(samples[0].GPUs) != 1 {
+		t.Fatalf("samples=%+v", samples)
+	}
+	if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
+		t.Fatalf("ClockMHz=%v want 1410", got)
+	}
+	if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
+		t.Fatalf("MemClockMHz=%v want 2600", got)
+	}
+}
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -7,9 +7,368 @@ import (
 	"path/filepath"
 	"strings"
 	"testing"
+	"time"
+
+	"bee/audit/internal/platform"
 )

-func TestRootRendersShellWithIframe(t *testing.T) {
+// TestChartLegendNumber table-checks chartLegendNumber: integers below 1000, a comma-decimal "k" form from 1000 up.
+func TestChartLegendNumber(t *testing.T) {
+	for _, c := range []struct {
+		in   float64
+		want string
+	}{
+		{in: 0.4, want: "0"},
+		{in: 61.5, want: "62"},
+		{in: 999.4, want: "999"},
+		{in: 1200, want: "1,2k"},
+		{in: 1250, want: "1,25k"},
+		{in: 1310, want: "1,31k"},
+		{in: 1500, want: "1,5k"},
+		{in: 2600, want: "2,6k"},
+		{in: 10200, want: "10k"},
+	} {
+		if got := chartLegendNumber(c.in); got != c.want {
+			t.Fatalf("chartLegendNumber(%v)=%q want %q", c.in, got, c.want)
+		}
+	}
+}
+
+// TestRecoverMiddlewareReturns500OnPanic checks that a panicking handler yields a 500 with an "internal server error" body.
+func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
+	handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		panic("boom")
+	}))
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/panic", nil)
+	handler.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusInternalServerError {
+		t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
+	}
+	if !strings.Contains(rec.Body.String(), "internal server error") {
+		t.Fatalf("body=%q", rec.Body.String())
+	}
+}
+
+// TestRecoverMiddlewarePreservesStreamingInterfaces checks that sseStart/sseWrite still work on the writer recoverMiddleware passes on.
+func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
+	handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if !sseStart(w) {
+			return
+		}
+		if !sseWrite(w, "tick", "ok") {
+			t.Fatal("expected sse write to succeed")
+		}
+	}))
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/stream", nil)
+	handler.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
+		t.Fatalf("content-type=%q", got)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
+		t.Fatalf("body=%q", body)
+	}
+}
+
+// TestChartDataFromSamplesUsesFullHistory checks that "gpu-all-power" yields one point per input sample, first to last.
+func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
+	samples := []platform.LiveMetricSample{
+		{
+			Timestamp:  time.Now().Add(-3 * time.Minute),
+			CPULoadPct: 10,
+			MemLoadPct: 20,
+			PowerW:     300,
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
+			},
+		},
+		{
+			Timestamp:  time.Now().Add(-2 * time.Minute),
+			CPULoadPct: 30,
+			MemLoadPct: 40,
+			PowerW:     320,
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
+			},
+		},
+		{
+			Timestamp:  time.Now().Add(-1 * time.Minute),
+			CPULoadPct: 50,
+			MemLoadPct: 60,
+			PowerW:     340,
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
+			},
+		},
+	}
+	datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	if !ok {
+		t.Fatal("chartDataFromSamples returned ok=false")
+	}
+	if title != "GPU Power" {
+		t.Fatalf("title=%q", title)
+	}
+	if len(names) != 1 || names[0] != "GPU 0" {
+		t.Fatalf("names=%v", names)
+	}
+	if len(labels) != len(samples) {
+		t.Fatalf("labels len=%d want %d", len(labels), len(samples))
+	}
+	if len(datasets) != 1 || len(datasets[0]) != len(samples) {
+		t.Fatalf("datasets shape=%v", datasets)
+	}
+	if got := datasets[0][0]; got != 120 {
+		t.Fatalf("datasets[0][0]=%v want 120", got)
+	}
+	if got := datasets[0][2]; got != 130 {
+		t.Fatalf("datasets[0][2]=%v want 130", got)
+	}
+}
+
+// TestChartDataFromSamplesKeepsStableGPUSeriesOrder feeds GPU rows in shuffled order and expects series sorted by GPU index.
+func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
+	samples := []platform.LiveMetricSample{
+		{
+			Timestamp: time.Now().Add(-2 * time.Minute),
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 7, PowerW: 170},
+				{GPUIndex: 2, PowerW: 120},
+				{GPUIndex: 0, PowerW: 100},
+			},
+		},
+		{
+			Timestamp: time.Now().Add(-1 * time.Minute),
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, PowerW: 101},
+				{GPUIndex: 7, PowerW: 171},
+				{GPUIndex: 2, PowerW: 121},
+			},
+		},
+	}
+	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	if !ok {
+		t.Fatal("chartDataFromSamples returned ok=false")
+	}
+	if title != "GPU Power" {
+		t.Fatalf("title=%q", title)
+	}
+	wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
+	if len(names) != len(wantNames) {
+		t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
+	}
+	for i := range wantNames {
+		if names[i] != wantNames[i] {
+			t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
+		}
+	}
+	if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
+		t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
+	}
+	if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
+		t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
+	}
+	if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
+		t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
+	}
+}
+
+// TestChartDataFromSamplesIncludesGPUClockCharts checks the "gpu-all-clock" and "gpu-all-memclock" charts: title, names, values.
+func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
+	samples := []platform.LiveMetricSample{
+		{
+			Timestamp: time.Now().Add(-2 * time.Minute),
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, ClockMHz: 1400, MemClockMHz: 2600},
+				{GPUIndex: 3, ClockMHz: 1500, MemClockMHz: 2800},
+			},
+		},
+		{
+			Timestamp: time.Now().Add(-1 * time.Minute),
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2610},
+				{GPUIndex: 3, ClockMHz: 1510, MemClockMHz: 2810},
+			},
+		},
+	}
+	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
+	if !ok {
+		t.Fatal("gpu-all-clock returned ok=false")
+	}
+	if title != "GPU Core Clock" {
+		t.Fatalf("title=%q", title)
+	}
+	if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
+		t.Fatalf("names=%v", names)
+	}
+	if got := datasets[1][1]; got != 1510 {
+		t.Fatalf("GPU 3 core clock=%v want 1510", got)
+	}
+
+	datasets, names, _, title, _, _, ok = chartDataFromSamples("gpu-all-memclock", samples)
+	if !ok {
+		t.Fatal("gpu-all-memclock returned ok=false")
+	}
+	if title != "GPU Memory Clock" {
+		t.Fatalf("title=%q", title)
+	}
+	if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
+		t.Fatalf("names=%v", names)
+	}
+	if got := datasets[0][0]; got != 2600 {
+		t.Fatalf("GPU 0 memory clock=%v want 2600", got)
+	}
+}
+
+// TestNormalizePowerSeriesHoldsLastPositive checks that zero readings repeat the last positive value; a leading zero stays 0.
+func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
+	got, want := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0}), []float64{0, 480, 480, 480, 510, 510}
+	if len(got) != len(want) {
+		t.Fatalf("len=%d want %d", len(got), len(want))
+	}
+	for i, w := range want {
+		if got[i] != w {
+			t.Fatalf("got[%d]=%v want %v", i, got[i], w)
+		}
+	}
+}
+
+func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) { // pins the JS/HTML markers renderMetrics output must contain
+	body := renderMetrics()
+	if !strings.Contains(body, "const probe = new Image();") {
+		t.Fatalf("metrics page should preload chart images before swap: %s", body)
+	}
+	if !strings.Contains(body, "el.dataset.loading === '1'") {
+		t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
+	}
+	if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
+		t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
+	}
+	if !strings.Contains(body, `id="gpu-chart-toggle"`) {
+		t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
+	}
+	if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
+		t.Fatalf("metrics page should include GPU core clock chart: %s", body)
+	}
+	if !strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
+		t.Fatalf("metrics page should include GPU memory clock chart: %s", body)
+	}
+	if !strings.Contains(body, `renderGPUOverviewCards(indices)`) {
+		t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
+	}
+}
+
+func TestChartLegendVisible(t *testing.T) { // the legend is expected for 8 series and not for 9
+	switch {
+	case !chartLegendVisible(8):
+		t.Fatal("legend should stay visible for charts with up to 8 series")
+	case chartLegendVisible(9):
+		t.Fatal("legend should be hidden for charts with more than 8 series")
+	}
+}
+
+// TestChartYAxisNumber table-checks chartYAxisNumber; values from 1000 up use a comma decimal and the Cyrillic "к" suffix.
+func TestChartYAxisNumber(t *testing.T) {
+	for _, c := range []struct {
+		in   float64
+		want string
+	}{
+		{in: 999, want: "999"},
+		{in: 1000, want: "1к"},
+		{in: 1370, want: "1,4к"},
+		{in: 1500, want: "1,5к"},
+		{in: 1700, want: "1,7к"},
+		{in: 2000, want: "2к"},
+		{in: 9999, want: "10к"},
+		{in: 10200, want: "10к"},
+		{in: -1500, want: "-1,5к"},
+	} {
+		if got := chartYAxisNumber(c.in); got != c.want {
+			t.Fatalf("chartYAxisNumber(%v)=%q want %q", c.in, got, c.want)
+		}
+	}
+}
+
+func TestChartCanvasHeight(t *testing.T) { // pins chartCanvasHeight: 360 for input 4, 288 for input 12
+	if got := chartCanvasHeight(4); got != 360 { // NOTE(review): the argument is presumably a series count — confirm
+		t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
+	}
+	if got := chartCanvasHeight(12); got != 288 {
+		t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
+	}
+}
+
+// TestNormalizeFanSeriesHoldsLastPositive checks that zero RPM readings repeat the last positive value seen before them.
+func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
+	got, want := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0}), []float64{4200, 4200, 4200, 4300, 4300}
+	if len(got) != len(want) {
+		t.Fatalf("len=%d want %d", len(got), len(want))
+	}
+	for i, w := range want {
+		if got[i] != w {
+			t.Fatalf("got[%d]=%v want %v", i, got[i], w)
+		}
+	}
+}
+
+// TestChartYAxisOption checks that chartYAxisOption passes both bounds through, asks for 11 labels and formats 1000 as "1к".
+func TestChartYAxisOption(t *testing.T) {
+	lo, hi := floatPtr(0), floatPtr(100) // not named min/max, which would shadow the Go builtins
+	opt := chartYAxisOption(lo, hi)
+	if opt.Min != lo || opt.Max != hi {
+		t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
+	}
+	if opt.LabelCount != 11 {
+		t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
+	}
+	if got := opt.ValueFormatter(1000); got != "1к" {
+		t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
+	}
+}
+
+// TestSnapshotFanRingsUsesTimelineLabels checks that snapshotFanRings names series "<fan> RPM" and returns non-empty labels.
+func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
+	r1 := newMetricsRing(4)
+	r2 := newMetricsRing(4)
+	r1.push(1000)
+	r1.push(1100)
+	r2.push(1200)
+	r2.push(1300)
+	datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
+	if len(datasets) != 2 {
+		t.Fatalf("datasets=%d want 2", len(datasets))
+	}
+	if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
+		t.Fatalf("names=%v", names)
+	}
+	if len(labels) != 2 {
+		t.Fatalf("labels=%v want 2 entries", labels)
+	}
+	if labels[0] == "" || labels[1] == "" {
+		t.Fatalf("labels should contain timeline values, got %v", labels)
+	}
+}
+
+func TestRenderNetworkInlineSyncsPendingState(t *testing.T) { // pins JS snippets the inline network UI must contain
+	body := renderNetworkInline()
+	if !strings.Contains(body, "d.pending_change") {
+		t.Fatalf("network UI should read pending network state from API: %s", body)
+	}
+	if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
+		t.Fatalf("network UI should periodically refresh network state: %s", body)
+	}
+	if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
+		t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
+	}
+}
+
+func TestRootRendersDashboard(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
 	exportDir := filepath.Join(dir, "export")
@@ -21,9 +380,10 @@ func TestRootRendersShellWithIframe(t *testing.T) {
 	}

 	handler := NewHandler(HandlerOptions{
-		Title:     "Bee Hardware Audit",
-		AuditPath: path,
-		ExportDir: exportDir,
+		Title:      "Bee Hardware Audit",
+		BuildLabel: "1.2.3",
+		AuditPath:  path,
+		ExportDir:  exportDir,
 	})

 	first := httptest.NewRecorder()
@@ -31,11 +391,17 @@ func TestRootRendersShellWithIframe(t *testing.T) {
 	if first.Code != http.StatusOK {
 		t.Fatalf("first status=%d", first.Code)
 	}
-	if !strings.Contains(first.Body.String(), `iframe`) || !strings.Contains(first.Body.String(), `src="/viewer"`) {
-		t.Fatalf("first body missing iframe viewer: %s", first.Body.String())
+	// Dashboard should contain the audit nav link and hardware summary
+	if !strings.Contains(first.Body.String(), `href="/audit"`) {
+		t.Fatalf("first body missing audit nav link: %s", first.Body.String())
 	}
-	if !strings.Contains(first.Body.String(), "/export/support.tar.gz") {
-		t.Fatalf("first body missing support bundle link: %s", first.Body.String())
+	if !strings.Contains(first.Body.String(), `/viewer`) {
+		t.Fatalf("first body missing viewer link: %s", first.Body.String())
+	}
+	versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
+	navIdx := strings.Index(first.Body.String(), `href="/"`)
+	if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
+		t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
 	}
 	if got := first.Header().Get("Cache-Control"); got != "no-store" {
 		t.Fatalf("first cache-control=%q", got)
@@ -50,8 +416,135 @@ func TestRootRendersShellWithIframe(t *testing.T) {
 	if second.Code != http.StatusOK {
 		t.Fatalf("second status=%d", second.Code)
 	}
-	if !strings.Contains(second.Body.String(), `src="/viewer"`) {
-		t.Fatalf("second body missing iframe viewer: %s", second.Body.String())
+	if !strings.Contains(second.Body.String(), `Hardware Summary`) {
+		t.Fatalf("second body missing hardware summary: %s", second.Body.String())
+	}
+}
+
+// TestRootShowsRunAuditButtonWhenSnapshotMissing requests "/" with a nonexistent audit file and expects a "Run Audit" button.
+func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
+	dir := t.TempDir()
+	exportDir := filepath.Join(dir, "export")
+	if err := os.MkdirAll(exportDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	handler := NewHandler(HandlerOptions{
+		Title:     "Bee Hardware Audit",
+		AuditPath: filepath.Join(dir, "missing-audit.json"),
+		ExportDir: exportDir,
+	})
+
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `Run Audit`) {
+		t.Fatalf("dashboard missing run audit button: %s", body)
+	}
+	if strings.Contains(body, `No audit data`) {
+		t.Fatalf("dashboard still shows empty audit badge: %s", body)
+	}
+}
+
+// TestAuditPageRendersViewerFrameAndActions expects "/audit" to embed the /viewer iframe and the openAuditModal() trigger.
+func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "audit.json")
+	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
+		t.Fatal(err)
+	}
+	handler := NewHandler(HandlerOptions{AuditPath: path})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `iframe class="viewer-frame" src="/viewer"`) {
+		t.Fatalf("audit page missing viewer frame: %s", body)
+	}
+	if !strings.Contains(body, `openAuditModal()`) {
+		t.Fatalf("audit page missing action modal trigger: %s", body)
+	}
+}
+
+func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) { // "/tasks" must carry the log overlay and paging controls
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `id="task-log-overlay"`) {
+		t.Fatalf("tasks page missing log modal overlay: %s", body)
+	}
+	if !strings.Contains(body, `_taskPageSize = 50`) {
+		t.Fatalf("tasks page missing pagination size config: %s", body)
+	}
+	if !strings.Contains(body, `Previous</button>`) || !strings.Contains(body, `Next</button>`) {
+		t.Fatalf("tasks page missing pagination controls: %s", body)
+	}
+}
+
+func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) { // "/tools" must carry driver restart, boot source and USB export UI
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `Restart GPU Drivers`) {
+		t.Fatalf("tools page missing restart gpu drivers button: %s", body)
+	}
+	if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
+		t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
+	}
+	if !strings.Contains(body, `id="boot-source-text"`) {
+		t.Fatalf("tools page missing boot source field: %s", body)
+	}
+	if !strings.Contains(body, `Export to USB`) {
+		t.Fatalf("tools page missing export to usb section: %s", body)
+	}
+	if !strings.Contains(body, `Support Bundle</button>`) {
+		t.Fatalf("tools page missing support bundle usb button: %s", body)
+	}
+}
+
+func TestTasksPageRendersScrollableLogModal(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "audit.json")
+	exportDir := filepath.Join(dir, "export")
+	if err := os.MkdirAll(exportDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	handler := NewHandler(HandlerOptions{
+		Title:     "Bee Hardware Audit",
+		AuditPath: path,
+		ExportDir: exportDir,
+	})
+
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `height:calc(100vh - 32px)`) {
+		t.Fatalf("tasks page missing bounded log modal height: %s", body)
+	}
+	if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
+		t.Fatalf("tasks page missing log modal overflow guard: %s", body)
+	}
+	if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
+		t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
 	}
 }

@@ -103,8 +596,8 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
-	if got := strings.TrimSpace(rec.Body.String()); got != body {
-		t.Fatalf("body=%q want %q", got, body)
+	if !strings.Contains(rec.Body.String(), "SERIAL-API") {
+		t.Fatalf("body missing expected serial: %s", rec.Body.String())
 	}
 	if got := rec.Header().Get("Content-Type"); !strings.Contains(got, "application/json") {
 		t.Fatalf("content-type=%q", got)
@@ -129,6 +622,17 @@ func TestSupportBundleEndpointReturnsArchive(t *testing.T) {
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
 		t.Fatal(err)
 	}
+	archive, err := os.CreateTemp(os.TempDir(), "bee-support-server-test-*.tar.gz")
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = os.Remove(archive.Name()) })
+	if _, err := archive.WriteString("support-bundle"); err != nil {
+		t.Fatal(err)
+	}
+	if err := archive.Close(); err != nil {
+		t.Fatal(err)
+	}

 	handler := NewHandler(HandlerOptions{ExportDir: exportDir})
 	rec := httptest.NewRecorder()
--- a/audit/internal/webui/stability.go
+++ b/audit/internal/webui/stability.go
@@ -0,0 +1,42 @@
+package webui
+
+import (
+	"fmt"
+	"log/slog"
+	"runtime/debug"
+	"time"
+)
+
+// goRecoverLoop starts a goroutine that runs fn and runs it again after every
+// recovered panic, sleeping restartDelay between runs when it is positive.
+// The goroutine ends as soon as fn returns without panicking.
+func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
+	go func() {
+		for runRecoverable(name, fn) {
+			if restartDelay > 0 {
+				time.Sleep(restartDelay)
+			}
+		}
+	}()
+}
+
+// goRecoverOnce runs fn a single time in a new goroutine through runRecoverable,
+// so a panic in fn is recovered there rather than propagating.
+func goRecoverOnce(name string, fn func()) {
+	go runRecoverable(name, fn) // the panicked result is not needed: fn is never restarted
+}
+
+// runRecoverable calls fn synchronously and reports whether it panicked.
+// A panic is recovered and logged via slog.Error with the component name, panic value and stack.
+func runRecoverable(name string, fn func()) (panicked bool) {
+	defer func() {
+		rec := recover()
+		if rec == nil {
+			return
+		}
+		panicked = true
+		slog.Error("recovered panic", "component", name, "panic", fmt.Sprint(rec), "stack", string(debug.Stack()))
+	}()
+	fn()
+	return false
+}
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -0,0 +1,518 @@
+package webui
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"bee/audit/internal/app"
+)
+
+// TestTaskQueuePersistsAndRecoversPendingTasks persists one pending and one running task, reloads them and checks both outcomes.
+func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
+	dir := t.TempDir()
+	q := &taskQueue{
+		statePath: filepath.Join(dir, "tasks-state.json"),
+		logsDir:   filepath.Join(dir, "tasks"),
+		trigger:   make(chan struct{}, 1),
+	}
+	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	started := time.Now().Add(-time.Minute)
+
+	// A task that was pending (not yet started) must be re-queued on restart.
+	pendingTask := &Task{
+		ID:        "task-pending",
+		Name:      "Memory Burn-in",
+		Target:    "memory-stress",
+		Priority:  2,
+		Status:    TaskPending,
+		CreatedAt: time.Now().Add(-2 * time.Minute),
+		params:    taskParams{Duration: 300, BurnProfile: "smoke"},
+	}
+	// A task that was running when bee-web crashed must NOT be re-queued —
+	// its child processes (e.g. gpu-burn-worker) survive the restart in
+	// their own process groups and can't be cancelled retroactively.
+	runningTask := &Task{
+		ID:        "task-running",
+		Name:      "NVIDIA GPU Stress",
+		Target:    "nvidia-stress",
+		Priority:  1,
+		Status:    TaskRunning,
+		CreatedAt: time.Now().Add(-3 * time.Minute),
+		StartedAt: &started,
+		params:    taskParams{Duration: 86400},
+	}
+	for _, task := range []*Task{pendingTask, runningTask} {
+		q.tasks = append(q.tasks, task)
+		q.assignTaskLogPathLocked(task)
+	}
+	q.persistLocked()
+
+	recovered := &taskQueue{
+		statePath: q.statePath,
+		logsDir:   q.logsDir,
+		trigger:   make(chan struct{}, 1),
+	}
+	recovered.loadLocked()
+
+	if len(recovered.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(recovered.tasks))
+	}
+
+	byID := map[string]*Task{}
+	for i := range recovered.tasks {
+		byID[recovered.tasks[i].ID] = recovered.tasks[i]
+	}
+
+	// Pending task must be re-queued as pending with params intact.
+	p := byID["task-pending"]
+	if p == nil {
+		t.Fatal("task-pending not found")
+	}
+	if p.Status != TaskPending {
+		t.Fatalf("pending task: status=%q want %q", p.Status, TaskPending)
+	}
+	if p.StartedAt != nil {
+		t.Fatalf("pending task: started_at=%v want nil", p.StartedAt)
+	}
+	if p.params.Duration != 300 || p.params.BurnProfile != "smoke" {
+		t.Fatalf("pending task: params=%+v", p.params)
+	}
+	if p.LogPath == "" {
+		t.Fatal("pending task: expected log path")
+	}
+
+	// Running task must be marked failed, not re-queued, to prevent
+	// launching duplicate workers (e.g. a second set of gpu-burn-workers).
+	r := byID["task-running"]
+	if r == nil {
+		t.Fatal("task-running not found")
+	}
+	if r.Status != TaskFailed {
+		t.Fatalf("running task: status=%q want %q", r.Status, TaskFailed)
+	}
+	if r.ErrMsg == "" {
+		t.Fatal("running task: expected non-empty error message")
+	}
+	if r.DoneAt == nil {
+		t.Fatal("running task: expected done_at to be set")
+	}
+}
+
+// TestNewTaskJobStateLoadsExistingLog checks that newTaskJobState preloads an existing log file's lines for subscribers.
+func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "task.log")
+	if err := os.WriteFile(path, []byte("line1\nline2\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+	j := newTaskJobState(path)
+	existing, ch := j.subscribe()
+	if ch == nil {
+		t.Fatal("expected live subscription channel")
+	}
+	if len(existing) != 2 || existing[0] != "line1" || existing[1] != "line2" {
+		t.Fatalf("existing=%v", existing)
+	}
+}
+
+// TestTaskQueueSnapshotSortsNewestFirst checks that snapshot returns tasks ordered by CreatedAt, newest first.
+func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
+	now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
+	q := &taskQueue{
+		tasks: []*Task{
+			{
+				ID:        "old-running",
+				Name:      "Old Running",
+				Status:    TaskRunning,
+				Priority:  10,
+				CreatedAt: now.Add(-3 * time.Minute),
+			},
+			{
+				ID:        "new-done",
+				Name:      "New Done",
+				Status:    TaskDone,
+				Priority:  0,
+				CreatedAt: now.Add(-1 * time.Minute),
+			},
+			{
+				ID:        "mid-pending",
+				Name:      "Mid Pending",
+				Status:    TaskPending,
+				Priority:  1,
+				CreatedAt: now.Add(-2 * time.Minute),
+			},
+		},
+	}
+	got := q.snapshot()
+	if len(got) != 3 {
+		t.Fatalf("snapshot len=%d want 3", len(got))
+	}
+	if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
+		t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
+	}
+}
+
+// TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob streams a finished task with only a log file and expects its lines plus "done".
+func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
+	dir := t.TempDir()
+	logPath := filepath.Join(dir, "task.log")
+	if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "done-1",
+		Name:      "Done Task",
+		Status:    TaskDone,
+		CreatedAt: time.Now(),
+		LogPath:   logPath,
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+	})
+
+	req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
+	req.SetPathValue("id", "done-1")
+	rec := httptest.NewRecorder()
+	h := &handler{}
+	h.handleAPITasksStream(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
+		t.Fatalf("body=%q", body)
+	}
+	if !strings.Contains(body, "event: done\n") {
+		t.Fatalf("missing done event: %q", body)
+	}
+}
+
+// TestHandleAPITasksStreamPendingTaskStartsSSEImmediately expects the "Task is queued" line within 2s for a pending task.
+func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "pending-1",
+		Name:      "Pending Task",
+		Status:    TaskPending,
+		CreatedAt: time.Now(),
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+	})
+	ctx, cancel := context.WithCancel(context.Background())
+	req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
+	req.SetPathValue("id", "pending-1")
+	rec := httptest.NewRecorder()
+
+	done := make(chan struct{})
+	go func() {
+		h := &handler{}
+		h.handleAPITasksStream(rec, req)
+		close(done)
+	}()
+	// NOTE(review): rec.Body is polled below while the handler goroutine may still write to it; this likely trips -race — confirm.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
+			cancel()
+			<-done
+			if rec.Code != http.StatusOK {
+				t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+			}
+			return
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	cancel()
+	<-done
+	t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
+}
+
+// TestResolveBurnPreset table-checks resolveBurnPreset; the empty profile must match the "smoke" preset.
+func TestResolveBurnPreset(t *testing.T) {
+	for _, c := range []struct {
+		profile string
+		want    burnPreset
+	}{
+		{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
+		{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
+		{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
+		{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
+	} {
+		if got := resolveBurnPreset(c.profile); got != c.want {
+			t.Fatalf("resolveBurnPreset(%q)=%+v want %+v", c.profile, got, c.want)
+		}
+	}
+}
+
+// TestTaskDisplayNameUsesNvidiaStressLoader table-checks the "nvidia-stress" display name for each loader value.
+func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
+	for _, c := range []struct {
+		loader string
+		want   string
+	}{
+		{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
+		{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
+		{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
+		{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
+	} {
+		if got := taskDisplayName("nvidia-stress", "acceptance", c.loader); got != c.want {
+			t.Fatalf("taskDisplayName(loader=%q)=%q want %q", c.loader, got, c.want)
+		}
+	}
+}
+
+// TestRunTaskHonorsCancel stubs the CPU pack with a blocking function and expects j.abort() to unblock it and runTask to return.
+func TestRunTaskHonorsCancel(t *testing.T) {
+	blocked := make(chan struct{})
+	released := make(chan struct{})
+	aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
+		close(blocked)
+		select {
+		case <-ctx.Done():
+			close(released)
+			return "", ctx.Err()
+		case <-time.After(5 * time.Second):
+			close(released)
+			return "unexpected", nil
+		}
+	}
+	q := &taskQueue{
+		opts: &HandlerOptions{App: &app.App{}},
+	}
+	tk := &Task{
+		ID:        "cpu-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{Duration: 60},
+	}
+	j := &jobState{}
+	ctx, cancel := context.WithCancel(context.Background())
+	j.cancel = cancel
+	tk.job = j
+
+	orig := runCPUAcceptancePackCtx
+	runCPUAcceptancePackCtx = func(_ *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+		return aRun(nil, ctx, baseDir, durationSec, logFunc)
+	}
+	defer func() { runCPUAcceptancePackCtx = orig }()
+
+	done := make(chan struct{})
+	go func() {
+		q.runTask(tk, j, ctx)
+		close(done)
+	}()
+
+	<-blocked
+	j.abort()
+
+	select {
+	case <-released:
+	case <-time.After(2 * time.Second):
+		t.Fatal("task did not observe cancel")
+	}
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("runTask did not return after cancel")
+	}
+}
+
+// TestRunTaskUsesBurnProfileDurationForCPU runs a CPU task that carries only a
+// "smoke" burn profile (no explicit duration) and checks that the stubbed CPU
+// acceptance runner receives a duration of five minutes.
+func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
+	const wantDuration = 5 * 60
+
+	// Swap in a stub that only records the duration it was called with.
+	saved := runCPUAcceptancePackCtx
+	defer func() { runCPUAcceptancePackCtx = saved }()
+	observed := 0
+	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
+		observed = durationSec
+		return "/tmp/cpu-burn.tar.gz", nil
+	}
+
+	queue := &taskQueue{opts: &HandlerOptions{App: &app.App{}}}
+	task := &Task{
+		ID:        "cpu-burn-1",
+		Name:      "CPU Burn-in",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{BurnProfile: "smoke"},
+	}
+
+	queue.runTask(task, &jobState{}, context.Background())
+
+	if observed != wantDuration {
+		t.Fatalf("duration=%d want %d", observed, wantDuration)
+	}
+}
+
+// TestRunTaskBuildsSupportBundleWithoutApp runs a "support-bundle" task on a
+// queue whose options carry only an ExportDir (no App). It checks that the
+// stubbed bundle builder receives that directory, that the job records no
+// error, and that the job log contains the "Archive: <path>" line.
+func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
+	exportRoot := t.TempDir()
+	wantLine := "Archive: " + filepath.Join(exportRoot, "bundle.tar.gz")
+
+	// Replace the real bundle builder with a stub that records its argument.
+	saved := buildSupportBundle
+	defer func() { buildSupportBundle = saved }()
+	var seenExportDir string
+	buildSupportBundle = func(exportDir string) (string, error) {
+		seenExportDir = exportDir
+		return filepath.Join(exportDir, "bundle.tar.gz"), nil
+	}
+
+	queue := &taskQueue{opts: &HandlerOptions{ExportDir: exportRoot}}
+	task := &Task{
+		ID:        "support-bundle-1",
+		Name:      "Support Bundle",
+		Target:    "support-bundle",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+	}
+	job := &jobState{}
+
+	queue.runTask(task, job, context.Background())
+
+	if seenExportDir != exportRoot {
+		t.Fatalf("exportDir=%q want %q", seenExportDir, exportRoot)
+	}
+	if job.err != "" {
+		t.Fatalf("unexpected error: %q", job.err)
+	}
+	joined := strings.Join(job.lines, "\n")
+	if !strings.Contains(joined, wantLine) {
+		t.Fatalf("lines=%v", job.lines)
+	}
+}
+
+func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
+	now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
+	created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
+	started := time.Time{}
+	task := &Task{
+		Status:    TaskRunning,
+		CreatedAt: created,
+		StartedAt: &started,
+	}
+	if got := taskElapsedSec(task, now); got != 0 {
+		t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
+	}
+
+	stale := created.Add(-24 * time.Hour)
+	task.StartedAt = &stale
+	if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
+		t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
+	}
+}
+
+// TestRunTaskInstallUsesSharedCommandStreaming runs an "install" task with the
+// install command replaced by a shell one-liner that prints two lines. It
+// checks that the stub receives the task's device and a non-empty log path,
+// that the job log contains an "Install log: " line plus both printed lines,
+// and that no job error is recorded.
+func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
+	// Stub the command factory and capture the arguments it is given.
+	saved := installCommand
+	defer func() { installCommand = saved }()
+	var seenDevice, seenLogPath string
+	installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
+		seenDevice, seenLogPath = device, logPath
+		return exec.CommandContext(ctx, "sh", "-c", "printf 'line1\nline2\n'")
+	}
+
+	queue := &taskQueue{opts: &HandlerOptions{}}
+	task := &Task{
+		ID:        "install-1",
+		Name:      "Install to Disk",
+		Target:    "install",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{Device: "/dev/sda"},
+	}
+	job := &jobState{}
+
+	queue.runTask(task, job, context.Background())
+
+	if seenDevice != "/dev/sda" {
+		t.Fatalf("device=%q want /dev/sda", seenDevice)
+	}
+	if seenLogPath == "" {
+		t.Fatal("expected install log path")
+	}
+	output := strings.Join(job.lines, "\n")
+	if !strings.Contains(output, "Install log: ") {
+		t.Fatalf("missing install log line: %v", job.lines)
+	}
+	for _, want := range []string{"line1", "line2"} {
+		if !strings.Contains(output, want) {
+			t.Fatalf("missing streamed output: %v", job.lines)
+		}
+	}
+	if job.err != "" {
+		t.Fatalf("unexpected error: %q", job.err)
+	}
+}
+
+func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
+	dir := t.TempDir()
+	q := &taskQueue{
+		opts:        &HandlerOptions{App: &app.App{}},
+		statePath:   filepath.Join(dir, "tasks-state.json"),
+		logsDir:     filepath.Join(dir, "tasks"),
+		kmsgWatcher: newKmsgWatcher(nil),
+	}
+	tk := &Task{
+		ID:        "cpu-panic-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+	}
+	j := &jobState{}
+
+	orig := runCPUAcceptancePackCtx
+	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
+		panic("boom")
+	}
+	defer func() { runCPUAcceptancePackCtx = orig }()
+
+	q.executeTask(tk, j, context.Background())
+
+	if tk.Status != TaskFailed {
+		t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
+	}
+	if tk.DoneAt == nil {
+		t.Fatal("expected done_at to be set")
+	}
+	if !strings.Contains(tk.ErrMsg, "task panic: boom") {
+		t.Fatalf("task error=%q", tk.ErrMsg)
+	}
+	if !strings.Contains(j.err, "task panic: boom") {
+		t.Fatalf("job error=%q", j.err)
+	}
+	q.kmsgWatcher.mu.Lock()
+	activeCount := q.kmsgWatcher.activeCount
+	window := q.kmsgWatcher.window
+	q.kmsgWatcher.mu.Unlock()
+	if activeCount != 0 {
+		t.Fatalf("activeCount=%d want 0", activeCount)
+	}
+	if window != nil {
+		t.Fatalf("expected kmsg window to be cleared, got %+v", window)
+	}
+}
--- a/audit/scripts/resolve-version.sh
+++ b/audit/scripts/resolve-version.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+set -eu
+
+tag="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
+
+case "${tag}" in
+	v*)
+		printf '%s\n' "${tag#v}"
+		;;
+	"")
+		printf 'dev\n'
+		;;
+	*)
+		printf '%s\n' "${tag}"
+		;;
+esac
--- a/2
+++ b/2
--- a/bible-local/architecture/charting.md
+++ b/bible-local/architecture/charting.md
@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
 and polled by the browser every 2 seconds via `<img src="...?t=now">`.
 There is no client-side canvas or JS chart library.

+## Rule: live charts must be visually uniform
+
+Live charts are a single UI family, not a set of one-off widgets. New charts and
+changes to existing charts must keep the same rendering model and presentation
+rules unless there is an explicit architectural decision to diverge.
+
+Default expectations:
+
+- same server-side SVG pipeline for all live metrics charts
+- same refresh behaviour and failure handling in the browser
+- same canvas size class and card layout
+- same legend placement policy across charts
+- same axis, title, and summary conventions
+- no chart-specific visual exceptions added as a quick fix
+
+Current default for live charts:
+
+- legend below the plot area when a chart has 8 series or fewer
+- legend hidden when a chart has more than 8 series
+- 10 equal Y-axis steps across the chart height
+- 1400 x 360 SVG canvas with legend
+- 1400 x 288 SVG canvas without legend
+- full-width card rendering in a single-column stack
+
+If one chart needs a different layout or legend behaviour, treat that as a
+design-level decision affecting the whole chart family, not as a local tweak to
+just one endpoint.
+
 ### Why go-analyze/charts

 - Pure Go, no CGO — builds cleanly inside the live-build container
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
 | `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
 | `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |

-Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
+Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
+the legend is hidden. The page renders them at `width: 100%` in a
 single-column layout so they always fill the viewport width.

 ### Ring buffers
--- a/bible-local/architecture/runtime-flows.md
+++ b/bible-local/architecture/runtime-flows.md
@@ -60,6 +60,8 @@ Rules:
 - Chromium opens `http://localhost/` — the full interactive web UI
 - SSH is independent from the desktop path
 - serial console support is enabled for VM boot debugging
+- Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
+- Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`

 ## ISO build sequence

@@ -81,9 +83,9 @@ build-in-container.sh [--authorized-keys /path/to/keys]
  7. `build-cublas.sh`:
       a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
       b. verify packages against repo `Packages.gz`
-       c. extract headers for `bee-gpu-stress` build
+       c. extract headers for `bee-gpu-burn` worker build
       d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
-  8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
+  8. build `bee-gpu-burn` worker against extracted cuBLASLt/cudart headers
  9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
  10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
  11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
@@ -104,7 +106,7 @@ Build host notes:
  1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
  2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
 - NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
+- `bee-gpu-burn` worker must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
 - The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
 - The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
 - The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
@@ -153,18 +155,17 @@ Current validation state:
 Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.

 Acceptance flows:
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
+- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-burn`
+- NVIDIA GPU burn-in can use either `bee-gpu-burn` or `bee-john-gpu-stress` (John the Ripper jumbo via OpenCL)
 - `bee sat memory` → `memtester` archive
 - `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
 - SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
+- `bee-gpu-burn` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
  - Ampere: `fp16` + `fp32`/TF32 tensor-core load
  - Ada / Hopper: add `fp8`
  - Blackwell+: add `fp4`
  - PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
 - Runtime overrides:
-  - `BEE_GPU_STRESS_SECONDS`
-  - `BEE_GPU_STRESS_SIZE_MB`
  - `BEE_MEMTESTER_SIZE_MB`
  - `BEE_MEMTESTER_PASSES`

@@ -179,6 +180,6 @@ Web UI: Acceptance Tests page → Run Test button
 ```

 **Critical invariants:**
- `bee-gpu-stress` uses `exec.CommandContext` — killed on job context cancel.
+- `bee-gpu-burn` / `bee-john-gpu-stress` use `exec.CommandContext` — killed on job context cancel.
 - Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
 - SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
--- a/bible-local/architecture/system-overview.md
+++ b/bible-local/architecture/system-overview.md
@@ -21,8 +21,8 @@ Fills gaps where Redfish/logpile is blind:
 - Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
 - Machine-readable health summary derived from collector verdicts
 - Operator-triggered acceptance tests for NVIDIA, memory, and storage
- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
+- NVIDIA SAT includes diagnostic collection plus a lightweight in-image GPU stress step via `bee-gpu-burn`
+- `bee-gpu-burn` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
 - Automatic boot audit with operator-facing local console and SSH access
 - NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
 - SSH access (OpenSSH) always available for inspection and debugging
@@ -70,7 +70,7 @@ Fills gaps where Redfish/logpile is blind:
 | SSH | OpenSSH server |
 | NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
 | NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
-| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
+| GPU stress backend | `bee-gpu-burn` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
 | Builder | Debian 12 host/VM or Debian 12 container image |

 ## Operator UX
--- a/bible-local/decisions/2026-03-05-nvidia-proprietary-driver.md
+++ b/bible-local/decisions/2026-03-05-nvidia-proprietary-driver.md
@@ -18,6 +18,8 @@ Use the official proprietary NVIDIA `.run` installer for both kernel modules and
 - Kernel modules and nvidia-smi come from a single verified source.
 - NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
 - Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
+- DCGM must track the CUDA user-mode driver major version exposed by `nvidia-smi`.
+- For NVIDIA driver branch `590` with CUDA `13.x`, use DCGM 4 package family `datacenter-gpu-manager-4-cuda13`; legacy `datacenter-gpu-manager` 3.x does not provide a working path for this stack.
 - Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
 - Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
 - ISO size increases by ~50MB for .ko files + nvidia-smi.
--- a/bible-local/decisions/2026-04-01-memtest-build-strategy.md
+++ b/bible-local/decisions/2026-04-01-memtest-build-strategy.md
@@ -0,0 +1,224 @@
+# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
+
+**Date:** 2026-04-01
+**Status:** resolved
+
+## Context
+
+We have already iterated on `memtest` multiple times and kept cycling between the same ideas.
+The commit history shows several distinct attempts:
+
+- `f91bce8` — fixed Bookworm memtest file names to `memtest86+x64.bin` / `memtest86+x64.efi`
+- `5857805` — added a binary hook to copy memtest files from the build tree into the ISO root
+- `f96b149` — added fallback extraction from the cached `.deb` when `chroot/boot/` stayed empty
+- `d43a9ae` — removed the custom hook and switched back to live-build built-in memtest integration
+- `60cb8f8` — restored explicit memtest menu entries and added ISO validation
+- `3dbc218` / `3869788` — added archived build logs and better memtest diagnostics
+
+Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026-04-01:
+
+- `lb binary_memtest` does run and installs `memtest86+`
+- but the final ISO still does **not** contain `boot/memtest86+x64.bin`
+- the final ISO also does **not** contain memtest menu entries in `boot/grub/grub.cfg` or `isolinux/live.cfg`
+
+So the assumption "live-build built-in memtest integration is enough on this stack" is currently false for this project until proven otherwise by a real built ISO.
+
+Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
+
+- the build now completes successfully because memtest is non-blocking by default
+- `lb binary_memtest` still runs and installs `memtest86+`
+- the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
+- but it executes too early for its current target paths:
+  - `binary/boot/grub/grub.cfg` is still missing at hook time
+  - `binary/isolinux/live.cfg` is still missing at hook time
+  - memtest binaries are also still absent in `binary/boot/`
+- later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
+- but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
+
+So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
+
+Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
+artifact dated 2026-04-01:
+
+- the final ISO does contain `boot/memtest86+x64.bin`
+- the final ISO does contain `boot/memtest86+x64.efi`
+- the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
+  and `isolinux/live.cfg`
+- so `v3.20-5-g76a9100` was **not** another real memtest regression in the
+  shipped ISO
+- the regression was in the build-time validator/debug path in `build.sh`
+
+Root cause of the false alarm:
+
+- `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
+  successfully listed/extracted members"
+- `iso_list_files` / `iso_extract_file` failures were collapsed into the same
+  observable output as "memtest content missing"
+- this made a reader failure look identical to a missing memtest payload
+- as a result, we re-entered the same memtest investigation loop even though
+  the real ISO was already correct
+
+Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
+
+- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
+  still carried live-build's default memtest layout (`live/memtest.bin`,
+  `live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
+- that mismatch is expected to trigger project recovery, because `bee` requires
+  `boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
+- however, `build.sh` exited before recovery because `set -e` treated a direct
+  `iso_memtest_present` return code of `1` as fatal
+- so the next repeated loop was caused by shell control flow, not by proof that
+  the recovery design itself was wrong
+
+## Known Failed Attempts
+
+These approaches were already tried and should not be repeated blindly:
+
+1. Built-in live-build memtest only.
+Reason it failed:
+- `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
+
+2. Fixing only the memtest file names for Debian Bookworm.
+Reason it failed:
+- correct file names alone do not make the files appear in the final ISO.
+
+3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
+Reason it failed:
+- in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
+
+4. Fallback extraction from cached `memtest86+` `.deb`.
+Reason it failed:
+- this was explored already and was not enough to stabilize the final ISO path end-to-end.
+
+5. Restoring explicit memtest menu entries in source bootloader templates only.
+Reason it failed:
+- memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
+
+6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
+Reason it failed:
+- the hook runs before those files exist, so the hook cannot patch them there.
+
+## What This Means
+
+When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
+
+- do not assume the built-in memtest stage is sufficient
+- do not assume `chroot/boot/` will contain memtest payloads
+- do not assume source bootloader templates are the last writer of final ISO configs
+- do not assume the current normal binary hook timing is late enough for final patching
+
+Any future memtest fix must explicitly identify:
+
+- where the memtest binaries are reliably available at build time
+- which exact build stage writes the final bootloader configs that land in the ISO
+- what post-build proof from a real ISO (not only from intermediate workdir files) confirms the result
+- whether the ISO inspection step itself succeeded, rather than merely whether
+  the validator printed a memtest warning
+- whether a non-zero probe is intentionally handled inside an `if` / `case`
+  context rather than accidentally tripping `set -e`
+
+## Decision
+
+For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
+
+Project rules from now on:
+
+- Do **not** trust `--memtest memtest86+` by itself.
+- A memtest implementation is considered valid only if the produced ISO actually contains:
+  - `boot/memtest86+x64.bin`
+  - `boot/memtest86+x64.efi`
+  - a GRUB menu entry
+  - an isolinux menu entry
+- If live-build built-in integration does not produce those artifacts, use an explicit project-owned mechanism such as:
+  - a binary hook copying files into `binary/boot/`
+  - extraction from the cached `memtest86+` `.deb`
+  - another deterministic build-time copy step
+- Do **not** remove such explicit logic later unless a fresh real ISO build proves that built-in integration alone produces all required files and menu entries.
+
+Current implementation direction:
+
+- keep the live-build memtest stage enabled if it helps package acquisition
+- do not rely on the current early `binary_hooks` timing for final patching
+- prefer a post-`lb build` recovery step in `build.sh` that:
+  - patches the fully materialized `LB_DIR/binary` tree
+  - injects memtest binaries there
+  - ensures final bootloader entries there
+  - reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
+- also treat ISO validation tooling as part of the critical path:
+  - install a stable ISO reader in the builder image
+  - fail with an explicit reader error if ISO listing/extraction fails
+  - do not treat reader failure as evidence that memtest is missing
+  - do not call a probe that may return "needs recovery" as a bare command under
+    `set -e`; wrap it in explicit control flow
+
+## Consequences
+
+- Future memtest changes must begin by reading this ADR and the commits listed above.
+- Future memtest changes must also begin by reading the failed-attempt list above.
+- We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
+- Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
+- But validation output is only trustworthy if ISO reading itself succeeded. A
+  "missing memtest" warning without a successful ISO read is not evidence.
+- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
+
+## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
+
+This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
+and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
+
+### Components
+
+**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
+
+Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
+those files may not exist yet. Instead:
+
+- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
+- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
+- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
+  If they do not exist, the hook warns and continues (does not fail).
+
+Set the `BEE_REQUIRE_MEMTEST=1` env var to turn these warnings into hard errors when needed.
+
+**2. Post-`lb build` recovery step in `build.sh`**
+
+After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
+contains all required memtest artifacts. If not:
+
+- Copies/extracts memtest binaries into `binary/boot/`.
+- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
+- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
+  the ISO with the patched tree.
+
+This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
+step handles the final `binary/` tree after live-build has written all bootloader configs.
+
+**3. ISO validation hardening**
+
+The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
+as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
+handled — it does not abort the build prematurely.
+
+ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
+If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
+This prevents the false-negative loop that consumed builds v3.14–v3.19 on 2026-04-01.
+
+### Why this works when earlier attempts did not
+
+The earlier patterns all shared a single flaw: they assumed a single build-time point
+(hook or source template) would be the last writer of bootloader configs and memtest payloads.
+In live-build on Debian Bookworm that assumption is false — live-build continues writing
+bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
+
+The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
+`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
+There is no ordering dependency to get wrong.
+
+### Do not revert
+
+Do not remove the recovery step or the hook without a fresh real ISO build proving
+live-build alone produces all four required artifacts:
+- `boot/memtest86+x64.bin`
+- `boot/memtest86+x64.efi`
+- memtest entry in `boot/grub/grub.cfg`
+- memtest entry in `isolinux/live.cfg`
--- a/bible-local/decisions/README.md
+++ b/bible-local/decisions/README.md
@@ -5,3 +5,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
 | Date | Decision | Status |
 |---|---|---|
 | 2026-03-05 | Use NVIDIA proprietary driver | active |
+| 2026-04-01 | Treat memtest as explicit ISO content | active |
--- a/bible-local/docs/iso-build-rules.md
+++ b/bible-local/docs/iso-build-rules.md
@@ -0,0 +1,62 @@
+# ISO Build Rules
+
+## Verify package names before use
+
+ISO builds take 30–60 minutes. A wrong package name wastes an entire build cycle.
+
+**Rule: before adding any Debian package name to the ISO config, verify it exists and check its file list.**
+
+Use one of:
+- `https://packages.debian.org/bookworm/<package-name>` — existence + description
+- `https://packages.debian.org/bookworm/amd64/<package-name>/filelist` — exact files installed
+- `apt-cache show <package>` inside a Debian bookworm container
+
+This applies to:
+- `iso/builder/config/package-lists/*.list.chroot`
+- Any package referenced in bootloader configs, hooks, or overlay scripts
+
+## Memtest rule
+
+Do not assume live-build's built-in memtest integration is sufficient for `bee`.
+We already tried that path and regressed again on 2026-04-01: `lb binary_memtest`
+ran, but the final ISO still lacked memtest binaries and menu entries.
+
+For this project, memtest is accepted only when the produced ISO actually
+contains all of the following:
+
+- `boot/memtest86+x64.bin`
+- `boot/memtest86+x64.efi`
+- a memtest entry in `boot/grub/grub.cfg`
+- a memtest entry in `isolinux/live.cfg`
+
+Rules:
+
+- Keep explicit post-build memtest validation in `build.sh`.
+- Treat ISO reader success as a separate prerequisite from memtest content.
+  If the reader cannot list or extract from the ISO, that is a validator
+  failure, not proof that memtest is missing.
+- If built-in integration does not produce the artifacts above, use a
+  deterministic project-owned copy/extract step instead of hoping live-build
+  will "start working".
+- Do not switch back to built-in-only memtest without fresh build evidence from
+  a real ISO.
+- If you reference memtest files manually, verify the exact package file list
+  first for the target Debian release.
+
+Known bad loops for this repository:
+
+- Do not retry built-in-only memtest without new evidence. We already proved
+  that `lb binary_memtest` can run while the final ISO still has no memtest.
+- Do not assume fixing memtest file names is enough. Correct names did not fix
+  the final artifact path.
+- Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
+- Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
+  bootloader configs.
+- Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
+  timing is late enough to patch final `binary/boot/grub/grub.cfg` or
+  `binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
+  present yet when the hook executed.
+- Do not treat a validator warning as ground truth until you have confirmed the
+  ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
+  regression because the final ISO was correct but the validator produced a
+  false negative.
--- a/bible-local/docs/validate-vs-burn.md
+++ b/bible-local/docs/validate-vs-burn.md
@@ -0,0 +1,35 @@
+# Validate vs Burn: Hardware Impact Policy
+
+## Validate Tests (non-destructive)
+
+Tests on the **Validate** page are purely diagnostic. They:
+
+- **Do not write to disks** — no data is written to storage devices, so the tests themselves do not advance SMART wear counters (load cycle count, reallocated sectors); power-on hours grow only with normal uptime.
+- **Do not run sustained high load** — commands complete quickly (seconds to minutes) and do not push hardware to thermal or electrical limits.
+- **Do not increment hardware wear counters** — GPU memory ECC counters, NVMe wear leveling counters, and similar endurance metrics are unaffected.
+- **Are safe to run repeatedly** — on new, production-bound, or already-deployed hardware without concern for reducing lifespan.
+
+### What Validate tests actually do
+
+| Test | What it runs |
+|---|---|
+| NVIDIA GPU | `nvidia-smi`, `dcgmi diag` (levels 1–4 read-only diagnostics) |
+| Memory | `memtester` on a limited allocation; reads/writes to RAM only |
+| Storage | `smartctl -a`, `nvme smart-log` — reads SMART data only |
+| CPU | `stress-ng` for a bounded duration; CPU-only, no I/O |
+| AMD GPU | `rocm-smi --showallinfo`, `dmidecode` — read-only queries |
+
+## Burn Tests (hardware wear)
+
+Tests on the **Burn** page run hardware at maximum or near-maximum load for extended durations. They:
+
+- **Wear storage**: write-intensive patterns can reduce SSD endurance (P/E cycles).
+- **Stress GPU memory**: extended ECC stress tests may surface latent defects but also exercise memory cells.
+- **Accelerate thermal cycling**: repeated heat/cool cycles degrade solder joints and capacitors over time.
+- **May increment wear counters**: GPU power-on hours, NVMe media wear indicator, and similar metrics will advance.
+
+### Rule
+
+> Run **Validate** freely on any server, at any time, before or after deployment.
+> Run **Burn** only when explicitly required (e.g., initial acceptance after repair, or per customer SLA).
+> Document when and why Burn tests were run.
--- a/iso/README.md
+++ b/iso/README.md
@@ -48,6 +48,7 @@ sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
 - The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
 - The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
 - Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
+- The NVIDIA variant installs DCGM 4 packages matched to the CUDA user-mode driver major version. For driver branch `590` / CUDA `13.x`, the package family is `datacenter-gpu-manager-4-cuda13` rather than the legacy `datacenter-gpu-manager`.
 - Override the container platform only if you know why:

 ```sh
--- a/iso/builder/Dockerfile
+++ b/iso/builder/Dockerfile
@@ -17,12 +17,23 @@ RUN apt-get update -qq && apt-get install -y \
    wget \
    curl \
    tar \
+    libarchive-tools \
    xz-utils \
    rsync \
    build-essential \
    gcc \
    make \
    perl \
+    pkg-config \
+    yasm \
+    libssl-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libgmp-dev \
+    libpcap-dev \
+    libsqlite3-dev \
+    libcurl4-openssl-dev \
+    ocl-icd-opencl-dev \
    linux-headers-amd64 \
    && rm -rf /var/lib/apt/lists/*

--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -8,8 +8,16 @@ NCCL_TESTS_VERSION=2.13.10
 NVCC_VERSION=12.8
 CUBLAS_VERSION=13.0.2.14-1
 CUDA_USERSPACE_VERSION=13.0.96-1
-DCGM_VERSION=3.3.9
+DCGM_VERSION=4.5.3-1
+JOHN_JUMBO_COMMIT=67fcf9fe5a
 ROCM_VERSION=6.3.4
 ROCM_SMI_VERSION=7.4.0.60304-76~22.04
+ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
+ROCM_VALIDATION_SUITE_VERSION=1.1.0.60304-76~22.04
+ROCBLAS_VERSION=4.3.0.60304-76~22.04
+ROCRAND_VERSION=3.2.0.60304-76~22.04
+HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
+HIPBLASLT_VERSION=0.10.0.60304-76~22.04
+COMGR_VERSION=2.8.0.60304-76~22.04
 GO_VERSION=1.24.0
 AUDIT_VERSION=1.0.0
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -29,10 +29,10 @@ lb config noauto \
    --security true \
    --linux-flavours "amd64" \
    --linux-packages "${LB_LINUX_PACKAGES}" \
-    --memtest none \
-    --iso-volume "EASY-BEE" \
-    --iso-application "EASY-BEE" \
-    --bootappend-live "boot=live components nomodeset video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
+    --memtest memtest86+ \
+    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
+    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
+    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -29,8 +29,14 @@ typedef void *CUfunction;
 typedef void *CUstream;

 #define CU_SUCCESS 0
+#define CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT 16
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
+#define MAX_STRESS_STREAMS 16
+#define MAX_CUBLAS_PROFILES 5
+#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
+#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
+#define STRESS_LAUNCH_DEPTH 8

 static const char *ptx_source =
    ".version 6.0\n"
@@ -97,6 +103,9 @@ typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
                                      CUstream,
                                      void **,
                                      void **);
+typedef CUresult (*cuMemGetInfo_fn)(size_t *, size_t *);
+typedef CUresult (*cuStreamCreate_fn)(CUstream *, unsigned int);
+typedef CUresult (*cuStreamDestroy_fn)(CUstream);
 typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
 typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);

@@ -118,6 +127,9 @@ struct cuda_api {
    cuModuleLoadDataEx_fn cuModuleLoadDataEx;
    cuModuleGetFunction_fn cuModuleGetFunction;
    cuLaunchKernel_fn cuLaunchKernel;
+    cuMemGetInfo_fn cuMemGetInfo;
+    cuStreamCreate_fn cuStreamCreate;
+    cuStreamDestroy_fn cuStreamDestroy;
    cuGetErrorName_fn cuGetErrorName;
    cuGetErrorString_fn cuGetErrorString;
 };
@@ -128,9 +140,10 @@ struct stress_report {
    int cc_major;
    int cc_minor;
    int buffer_mb;
+    int stream_count;
    unsigned long iterations;
    uint64_t checksum;
-    char details[1024];
+    char details[16384];
 };

 static int load_symbol(void *lib, const char *name, void **out) {
@@ -144,7 +157,7 @@ static int load_cuda(struct cuda_api *api) {
    if (!api->lib) {
        return 0;
    }
-    return
+    if (!(
        load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
        load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
        load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
@@ -160,7 +173,17 @@ static int load_cuda(struct cuda_api *api) {
        load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
        load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
        load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
-        load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
+        load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel))) {
+        dlclose(api->lib);
+        memset(api, 0, sizeof(*api));
+        return 0;
+    }
+    load_symbol(api->lib, "cuMemGetInfo_v2", (void **)&api->cuMemGetInfo);
+    load_symbol(api->lib, "cuStreamCreate", (void **)&api->cuStreamCreate);
+    if (!load_symbol(api->lib, "cuStreamDestroy_v2", (void **)&api->cuStreamDestroy)) {
+        load_symbol(api->lib, "cuStreamDestroy", (void **)&api->cuStreamDestroy);
+    }
+    return 1;
 }

 static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
@@ -193,14 +216,12 @@ static double now_seconds(void) {
    return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
 }

-#if HAVE_CUBLASLT_HEADERS
 static size_t round_down_size(size_t value, size_t multiple) {
    if (multiple == 0 || value < multiple) {
        return value;
    }
    return value - (value % multiple);
 }
-#endif

 static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) {
    int cc_major = 0;
@@ -220,6 +241,75 @@ static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *maj
    return 1;
 }

+/* Stores the multiprocessor count of dev in *count; returns 1 on success, 0 (leaving *count untouched) on failure. */
+static int query_multiprocessor_count(struct cuda_api *api, CUdevice dev, int *count) {
+    int sm_total = 0;
+    CUresult rc = api->cuDeviceGetAttribute(&sm_total, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+    if (check_rc(api, "cuDeviceGetAttribute(multiprocessors)", rc)) {
+        *count = sm_total;
+        return 1;
+    }
+    return 0;
+}
+
+/*
+ * Caps requested_bytes at 90% of the free device memory reported by cuMemGetInfo;
+ * the cap itself is never lower than MIN_PROFILE_BUDGET_BYTES. Returns
+ * requested_bytes unchanged when cuMemGetInfo was not loaded, fails, or reports 0.
+ */
+static size_t clamp_budget_to_free_memory(struct cuda_api *api, size_t requested_bytes) {
+    size_t free_bytes = 0;
+    size_t total_bytes = 0; /* required out-parameter; its value is not used */
+    size_t max_bytes;
+
+    if (!api->cuMemGetInfo ||
+        api->cuMemGetInfo(&free_bytes, &total_bytes) != CU_SUCCESS || free_bytes == 0) {
+        return requested_bytes;
+    }
+
+    max_bytes = (free_bytes * 9u) / 10u; /* leave 10% of free memory unclaimed */
+    if (max_bytes < MIN_PROFILE_BUDGET_BYTES) {
+        max_bytes = MIN_PROFILE_BUDGET_BYTES;
+    }
+    return requested_bytes > max_bytes ? max_bytes : requested_bytes;
+}
+
+/*
+ * Picks the number of streams to drive: one per 8 multiprocessors, bounded to
+ * [2, MAX_STRESS_STREAMS], then reduced until total_budget split across
+ * planned_profiles * streams is at least MIN_STREAM_BUDGET_BYTES (or 1 remains).
+ */
+static int choose_stream_count(int mp_count, int planned_profiles, size_t total_budget, int have_streams) {
+    int lanes;
+
+    if (!(have_streams && mp_count > 0 && planned_profiles > 0)) {
+        return 1;
+    }
+
+    lanes = mp_count / 8;
+    lanes = lanes < 2 ? 2 : lanes;
+    lanes = lanes > MAX_STRESS_STREAMS ? MAX_STRESS_STREAMS : lanes;
+
+    for (; lanes > 1; lanes--) {
+        if (total_budget / ((size_t)planned_profiles * (size_t)lanes) >= MIN_STREAM_BUDGET_BYTES) {
+            break;
+        }
+    }
+    return lanes;
+}
+
+/* Destroys each non-NULL stream in streams[0..count) and clears its slot; does nothing if cuStreamDestroy is absent. */
+static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
+    int idx = 0;
+    while (api->cuStreamDestroy && idx < count) {
+        CUstream *slot = &streams[idx++];
+        if (*slot != NULL) {
+            api->cuStreamDestroy(*slot);
+            *slot = NULL;
+        }
+    }
+}
+
 #if HAVE_CUBLASLT_HEADERS
 static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
    size_t len = strlen(buf);
@@ -242,12 +332,19 @@ static int run_ptx_fallback(struct cuda_api *api,
                            int size_mb,
                            struct stress_report *report) {
    CUcontext ctx = NULL;
-    CUdeviceptr device_mem = 0;
    CUmodule module = NULL;
    CUfunction kernel = NULL;
    uint32_t sample[256];
-    uint32_t words = 0;
+    CUdeviceptr device_mem[MAX_STRESS_STREAMS] = {0};
+    CUstream streams[MAX_STRESS_STREAMS] = {0};
+    uint32_t words[MAX_STRESS_STREAMS] = {0};
+    uint32_t rounds[MAX_STRESS_STREAMS] = {0};
+    void *params[MAX_STRESS_STREAMS][3];
+    size_t bytes_per_stream[MAX_STRESS_STREAMS] = {0};
    unsigned long iterations = 0;
+    int mp_count = 0;
+    int stream_count = 1;
+    int launches_per_wave = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "driver-ptx");
@@ -260,64 +357,109 @@ static int run_ptx_fallback(struct cuda_api *api,
        return 0;
    }

-    size_t bytes = (size_t)size_mb * 1024u * 1024u;
-    if (bytes < 4u * 1024u * 1024u) {
-        bytes = 4u * 1024u * 1024u;
+    size_t requested_bytes = (size_t)size_mb * 1024u * 1024u;
+    if (requested_bytes < MIN_PROFILE_BUDGET_BYTES) {
+        requested_bytes = MIN_PROFILE_BUDGET_BYTES;
    }
-    if (bytes > (size_t)1024u * 1024u * 1024u) {
-        bytes = (size_t)1024u * 1024u * 1024u;
+    size_t total_bytes = clamp_budget_to_free_memory(api, requested_bytes);
+    if (total_bytes < MIN_PROFILE_BUDGET_BYTES) {
+        total_bytes = MIN_PROFILE_BUDGET_BYTES;
    }
-    words = (uint32_t)(bytes / sizeof(uint32_t));
+    report->buffer_mb = (int)(total_bytes / (1024u * 1024u));

-    if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem, bytes))) {
-        api->cuCtxDestroy(ctx);
-        return 0;
+    if (query_multiprocessor_count(api, dev, &mp_count) &&
+        api->cuStreamCreate &&
+        api->cuStreamDestroy) {
+        stream_count = choose_stream_count(mp_count, 1, total_bytes, 1);
    }
-    if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem, 0, bytes))) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
+    if (stream_count > 1) {
+        int created = 0;
+        for (; created < stream_count; created++) {
+            if (!check_rc(api, "cuStreamCreate", api->cuStreamCreate(&streams[created], 0))) {
+                destroy_streams(api, streams, created);
+                stream_count = 1;
+                break;
+            }
+        }
    }
+    report->stream_count = stream_count;
+
+    for (int lane = 0; lane < stream_count; lane++) {
+        size_t slice = total_bytes / (size_t)stream_count;
+        if (lane == stream_count - 1) {
+            slice = total_bytes - ((size_t)lane * (total_bytes / (size_t)stream_count));
+        }
+        slice = round_down_size(slice, sizeof(uint32_t));
+        if (slice < MIN_PROFILE_BUDGET_BYTES) {
+            slice = MIN_PROFILE_BUDGET_BYTES;
+        }
+        bytes_per_stream[lane] = slice;
+        words[lane] = (uint32_t)(slice / sizeof(uint32_t));
+
+        if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem[lane], slice))) {
+            goto fail;
+        }
+        if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem[lane], 0, slice))) {
+            goto fail;
+        }
+        rounds[lane] = 2048;
+        params[lane][0] = &device_mem[lane];
+        params[lane][1] = &words[lane];
+        params[lane][2] = &rounds[lane];
+    }
+
    if (!check_rc(api,
                  "cuModuleLoadDataEx",
                  api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
+        goto fail;
    }
    if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
+        goto fail;
    }

    unsigned int threads = 256;
-    unsigned int blocks = (unsigned int)((words + threads - 1) / threads);
-    uint32_t rounds = 1024;
-    void *params[] = {&device_mem, &words, &rounds};

    double start = now_seconds();
    double deadline = start + (double)seconds;
    while (now_seconds() < deadline) {
-        if (!check_rc(api,
-                      "cuLaunchKernel",
-                      api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
-            api->cuMemFree(device_mem);
-            api->cuCtxDestroy(ctx);
-            return 0;
+        launches_per_wave = 0;
+        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
+            int launched_this_batch = 0;
+            for (int lane = 0; lane < stream_count; lane++) {
+                unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
+                if (!check_rc(api,
+                              "cuLaunchKernel",
+                              api->cuLaunchKernel(kernel,
+                                                  blocks,
+                                                  1,
+                                                  1,
+                                                  threads,
+                                                  1,
+                                                  1,
+                                                  0,
+                                                  streams[lane],
+                                                  params[lane],
+                                                  NULL))) {
+                    goto fail;
+                }
+                launches_per_wave++;
+                launched_this_batch++;
+            }
+            if (launched_this_batch <= 0) {
+                break;
+            }
        }
-        iterations++;
+        if (launches_per_wave <= 0) {
+            goto fail;
+        }
+        if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
+            goto fail;
+        }
+        iterations += (unsigned long)launches_per_wave;
    }

-    if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
-    }
-    if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem, sizeof(sample)))) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
+    if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
+        goto fail;
    }

    for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) {
@@ -326,12 +468,34 @@ static int run_ptx_fallback(struct cuda_api *api,
    report->iterations = iterations;
    snprintf(report->details,
             sizeof(report->details),
-             "profile_int32_fallback=OK iterations=%lu\n",
+             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
+             size_mb,
+             report->buffer_mb,
+             report->stream_count,
+             STRESS_LAUNCH_DEPTH,
+             bytes_per_stream[0] / (1024u * 1024u),
             iterations);

-    api->cuMemFree(device_mem);
+    for (int lane = 0; lane < stream_count; lane++) {
+        if (device_mem[lane]) {
+            api->cuMemFree(device_mem[lane]);
+        }
+    }
+    destroy_streams(api, streams, stream_count);
    api->cuCtxDestroy(ctx);
    return 1;
+
+fail:
+    for (int lane = 0; lane < MAX_STRESS_STREAMS; lane++) {
+        if (device_mem[lane]) {
+            api->cuMemFree(device_mem[lane]);
+        }
+    }
+    destroy_streams(api, streams, MAX_STRESS_STREAMS);
+    if (ctx) {
+        api->cuCtxDestroy(ctx);
+    }
+    return 0;
 }

 #if HAVE_CUBLASLT_HEADERS
@@ -418,6 +582,7 @@ struct profile_desc {

 struct prepared_profile {
    struct profile_desc desc;
+    CUstream stream;
    cublasLtMatmulDesc_t op_desc;
    cublasLtMatrixLayout_t a_layout;
    cublasLtMatrixLayout_t b_layout;
@@ -617,8 +782,8 @@ static uint64_t choose_square_dim(size_t budget_bytes, size_t bytes_per_cell, in
    if (dim < (uint64_t)multiple) {
        dim = (uint64_t)multiple;
    }
-    if (dim > 8192u) {
-        dim = 8192u;
+    if (dim > 65536u) {
+        dim = 65536u;
    }
    return dim;
 }
@@ -704,10 +869,12 @@ static int prepare_profile(struct cublaslt_api *cublas,
                           cublasLtHandle_t handle,
                           struct cuda_api *cuda,
                           const struct profile_desc *desc,
+                           CUstream stream,
                           size_t profile_budget_bytes,
                           struct prepared_profile *out) {
    memset(out, 0, sizeof(*out));
    out->desc = *desc;
+    out->stream = stream;

    size_t bytes_per_cell = 0;
    bytes_per_cell += bytes_for_elements(desc->a_type, 1);
@@ -935,7 +1102,7 @@ static int run_cublas_profile(cublasLtHandle_t handle,
                                               &profile->heuristic.algo,
                                               (void *)(uintptr_t)profile->workspace_dev,
                                               profile->workspace_size,
-                                               (cudaStream_t)0));
+                                               profile->stream));
 }

 static int run_cublaslt_stress(struct cuda_api *cuda,
@@ -947,13 +1114,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                               int size_mb,
                               struct stress_report *report) {
    struct cublaslt_api cublas;
-    struct prepared_profile prepared[sizeof(k_profiles) / sizeof(k_profiles[0])];
+    struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
    cublasLtHandle_t handle = NULL;
    CUcontext ctx = NULL;
+    CUstream streams[MAX_STRESS_STREAMS] = {0};
    uint16_t sample[256];
    int cc = cc_major * 10 + cc_minor;
    int planned = 0;
    int active = 0;
+    int mp_count = 0;
+    int stream_count = 1;
+    int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
+    int prepared_count = 0;
+    int wave_launches = 0;
+    size_t requested_budget = 0;
+    size_t total_budget = 0;
+    size_t per_profile_budget = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -986,16 +1162,46 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        return 0;
    }

-    size_t total_budget = (size_t)size_mb * 1024u * 1024u;
-    if (total_budget < (size_t)planned * 4u * 1024u * 1024u) {
-        total_budget = (size_t)planned * 4u * 1024u * 1024u;
+    requested_budget = (size_t)size_mb * 1024u * 1024u;
+    if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
+        requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
    }
-    size_t per_profile_budget = total_budget / (size_t)planned;
-    if (per_profile_budget < 4u * 1024u * 1024u) {
-        per_profile_budget = 4u * 1024u * 1024u;
+    total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
+    if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
+        total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
    }
+    if (query_multiprocessor_count(cuda, dev, &mp_count) &&
+        cuda->cuStreamCreate &&
+        cuda->cuStreamDestroy) {
+        stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
+    }
+    if (stream_count > 1) {
+        int created = 0;
+        for (; created < stream_count; created++) {
+            if (!check_rc(cuda, "cuStreamCreate", cuda->cuStreamCreate(&streams[created], 0))) {
+                destroy_streams(cuda, streams, created);
+                stream_count = 1;
+                break;
+            }
+        }
+    }
+    report->stream_count = stream_count;
+    per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
+    if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
+        per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
+    }
+    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
+    append_detail(report->details,
+                  sizeof(report->details),
+                  "requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
+                  size_mb,
+                  report->buffer_mb,
+                  report->stream_count,
+                  STRESS_LAUNCH_DEPTH,
+                  mp_count,
+                  per_profile_budget / (1024u * 1024u));

-    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
+    for (int i = 0; i < profile_count; i++) {
        const struct profile_desc *desc = &k_profiles[i];
        if (!(desc->enabled && cc >= desc->min_cc)) {
            append_detail(report->details,
@@ -1005,63 +1211,87 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                          desc->min_cc);
            continue;
        }
-        if (prepare_profile(&cublas, handle, cuda, desc, per_profile_budget, &prepared[i])) {
-            active++;
-            append_detail(report->details,
-                          sizeof(report->details),
-                          "%s=READY dim=%llux%llux%llu block=%s\n",
-                          desc->name,
-                          (unsigned long long)prepared[i].m,
-                          (unsigned long long)prepared[i].n,
-                          (unsigned long long)prepared[i].k,
-                          desc->block_label);
-        } else {
-            append_detail(report->details, sizeof(report->details), "%s=SKIPPED unsupported\n", desc->name);
+        for (int lane = 0; lane < stream_count; lane++) {
+            CUstream stream = streams[lane];
+            if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
+                break;
+            }
+            if (prepare_profile(&cublas, handle, cuda, desc, stream, per_profile_budget, &prepared[prepared_count])) {
+                active++;
+                append_detail(report->details,
+                              sizeof(report->details),
+                              "%s[%d]=READY dim=%llux%llux%llu block=%s stream=%d\n",
+                              desc->name,
+                              lane,
+                              (unsigned long long)prepared[prepared_count].m,
+                              (unsigned long long)prepared[prepared_count].n,
+                              (unsigned long long)prepared[prepared_count].k,
+                              desc->block_label,
+                              lane);
+                prepared_count++;
+            } else {
+                append_detail(report->details,
+                              sizeof(report->details),
+                              "%s[%d]=SKIPPED unsupported\n",
+                              desc->name,
+                              lane);
+            }
        }
    }

    if (active <= 0) {
        cublas.cublasLtDestroy(handle);
+        destroy_streams(cuda, streams, stream_count);
        cuda->cuCtxDestroy(ctx);
        return 0;
    }

    double deadline = now_seconds() + (double)seconds;
    while (now_seconds() < deadline) {
-        for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
-            if (!prepared[i].ready) {
-                continue;
-            }
-            if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
-                append_detail(report->details,
-                              sizeof(report->details),
-                              "%s=FAILED runtime\n",
-                              prepared[i].desc.name);
-                for (size_t j = 0; j < sizeof(prepared) / sizeof(prepared[0]); j++) {
-                    destroy_profile(&cublas, cuda, &prepared[j]);
+        wave_launches = 0;
+        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
+            int launched_this_batch = 0;
+            for (int i = 0; i < prepared_count; i++) {
+                if (!prepared[i].ready) {
+                    continue;
                }
-                cublas.cublasLtDestroy(handle);
-                cuda->cuCtxDestroy(ctx);
-                return 0;
+                if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
+                    append_detail(report->details,
+                                  sizeof(report->details),
+                                  "%s=FAILED runtime\n",
+                                  prepared[i].desc.name);
+                    for (int j = 0; j < prepared_count; j++) {
+                        destroy_profile(&cublas, cuda, &prepared[j]);
+                    }
+                    cublas.cublasLtDestroy(handle);
+                    destroy_streams(cuda, streams, stream_count);
+                    cuda->cuCtxDestroy(ctx);
+                    return 0;
+                }
+                prepared[i].iterations++;
+                report->iterations++;
+                wave_launches++;
+                launched_this_batch++;
            }
-            prepared[i].iterations++;
-            report->iterations++;
-            if (now_seconds() >= deadline) {
+            if (launched_this_batch <= 0) {
                break;
            }
        }
-    }
-
-    if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
-        for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
-            destroy_profile(&cublas, cuda, &prepared[i]);
+        if (wave_launches <= 0) {
+            break;
+        }
+        if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
+            for (int i = 0; i < prepared_count; i++) {
+                destroy_profile(&cublas, cuda, &prepared[i]);
+            }
+            cublas.cublasLtDestroy(handle);
+            destroy_streams(cuda, streams, stream_count);
+            cuda->cuCtxDestroy(ctx);
+            return 0;
        }
-        cublas.cublasLtDestroy(handle);
-        cuda->cuCtxDestroy(ctx);
-        return 0;
    }

-    for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+    for (int i = 0; i < prepared_count; i++) {
        if (!prepared[i].ready) {
            continue;
        }
@@ -1072,7 +1302,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                      prepared[i].iterations);
    }

-    for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+    for (int i = 0; i < prepared_count; i++) {
        if (prepared[i].ready) {
            if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) {
                for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
@@ -1083,10 +1313,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        }
    }

-    for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+    for (int i = 0; i < prepared_count; i++) {
        destroy_profile(&cublas, cuda, &prepared[i]);
    }
    cublas.cublasLtDestroy(handle);
+    destroy_streams(cuda, streams, stream_count);
    cuda->cuCtxDestroy(ctx);
    return 1;
 }
@@ -1095,13 +1326,16 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
 int main(int argc, char **argv) {
    int seconds = 5;
    int size_mb = 64;
+    int device_index = 0;
    for (int i = 1; i < argc; i++) {
        if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
            seconds = atoi(argv[++i]);
        } else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
            size_mb = atoi(argv[++i]);
+        } else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
+            device_index = atoi(argv[++i]);
        } else {
-            fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
+            fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
            return 2;
        }
    }
@@ -1111,6 +1345,9 @@ int main(int argc, char **argv) {
    if (size_mb <= 0) {
        size_mb = 64;
    }
+    if (device_index < 0) {
+        device_index = 0;
+    }

    struct cuda_api cuda;
    if (!load_cuda(&cuda)) {
@@ -1133,8 +1370,13 @@ int main(int argc, char **argv) {
        return 1;
    }

+    if (device_index >= count) {
+        fprintf(stderr, "device index %d out of range (found %d CUDA device(s))\n", device_index, count);
+        return 1;
+    }
+
    CUdevice dev = 0;
-    if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) {
+    if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, device_index))) {
        return 1;
    }

@@ -1162,10 +1404,12 @@ int main(int argc, char **argv) {
    }

    printf("device=%s\n", report.device);
+    printf("device_index=%d\n", device_index);
    printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
    printf("backend=%s\n", report.backend);
    printf("duration_s=%d\n", seconds);
    printf("buffer_mb=%d\n", report.buffer_mb);
+    printf("streams=%d\n", report.stream_count);
    printf("iterations=%lu\n", report.iterations);
    printf("checksum=%llu\n", (unsigned long long)report.checksum);
    if (report.details[0] != '\0') {
--- a/iso/builder/build-cublas.sh
+++ b/iso/builder/build-cublas.sh
@@ -1,9 +1,9 @@
 #!/bin/sh
-# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
+# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for the bee-gpu-burn worker.
 #
 # Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
 # verifies them against Packages.gz, and extracts the small subset we need:
-#   - headers for compiling bee-gpu-stress against cuBLASLt
+#   - headers for compiling the bee-gpu-burn worker against cuBLASLt
 #   - runtime libs for libcublas, libcublasLt, libcudart inside the ISO

 set -e
--- a/iso/builder/build-in-container.sh
+++ b/iso/builder/build-in-container.sh
@@ -12,6 +12,7 @@ CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
 AUTH_KEYS=""
 REBUILD_IMAGE=0
 CLEAN_CACHE=0
+VARIANT="all"

 . "${BUILDER_DIR}/VERSIONS"

@@ -34,14 +35,23 @@ while [ $# -gt 0 ]; do
            REBUILD_IMAGE=1
            shift
            ;;
+        --variant)
+            VARIANT="$2"
+            shift 2
+            ;;
        *)
            echo "unknown arg: $1" >&2
-            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys]" >&2
+            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|nogpu|all]" >&2
            exit 1
            ;;
    esac
 done

+case "$VARIANT" in
+    nvidia|amd|nogpu|all) ;;
+    *) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
+esac
+
 if [ "$CLEAN_CACHE" = "1" ]; then
    echo "=== cleaning build cache: ${CACHE_DIR} ==="
    rm -rf "${CACHE_DIR:?}/go-build" \
@@ -49,8 +59,10 @@ if [ "$CLEAN_CACHE" = "1" ]; then
           "${CACHE_DIR:?}/tmp" \
           "${CACHE_DIR:?}/bee" \
           "${CACHE_DIR:?}/lb-packages"
-    echo "=== cleaning live-build work dir: ${REPO_ROOT}/dist/live-build-work ==="
-    rm -rf "${REPO_ROOT}/dist/live-build-work"
+    echo "=== cleaning live-build work dirs ==="
+    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
+    rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
+    rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
    echo "=== caches cleared, proceeding with build ==="
 fi

@@ -108,34 +120,51 @@ else
    echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
 fi

-set -- \
-    run --rm --privileged \
-    --platform "${BUILDER_PLATFORM}" \
-    -v "${REPO_ROOT}:/work" \
-    -v "${CACHE_DIR}:/cache" \
-    -e BEE_CONTAINER_BUILD=1 \
-    -e GOCACHE=/cache/go-build \
-    -e GOMODCACHE=/cache/go-mod \
-    -e TMPDIR=/cache/tmp \
-    -e BEE_CACHE_DIR=/cache/bee \
-    -w /work \
-    "${IMAGE_REF}" \
-    sh /work/iso/builder/build.sh
-
-if [ -n "$AUTH_KEYS" ]; then
-    set -- run --rm --privileged \
-        --platform "${BUILDER_PLATFORM}" \
-        -v "${REPO_ROOT}:/work" \
-        -v "${CACHE_DIR}:/cache" \
-        -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
+# run_variant VARIANT
+# Build one ISO variant inside the builder container.
+# The argument vector is assembled once with `set --` so every value stays a
+# single quoted word (paths containing spaces survive) and the optional
+# authorized-keys mount/flag is added without duplicating the whole command.
+run_variant() {
+    _v="$1"
+    echo "=== building variant: ${_v} ==="
+    set -- run --rm --privileged \
+        --platform "${BUILDER_PLATFORM}" \
+        -v "${REPO_ROOT}:/work" \
+        -v "${CACHE_DIR}:/cache"
+    if [ -n "$AUTH_KEYS" ]; then
+        # Expose the keys directory read-only at a fixed path inside the container.
+        set -- "$@" -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro"
+    fi
+    set -- "$@" \
        -e BEE_CONTAINER_BUILD=1 \
        -e GOCACHE=/cache/go-build \
        -e GOMODCACHE=/cache/go-mod \
        -e TMPDIR=/cache/tmp \
        -e BEE_CACHE_DIR=/cache/bee \
        -w /work \
-        "${IMAGE_REF}" \
-        sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
-fi
+        "${IMAGE_REF}" \
+        sh /work/iso/builder/build.sh --variant "${_v}"
+    if [ -n "$AUTH_KEYS" ]; then
+        set -- "$@" --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
+    fi

-"$CONTAINER_TOOL" "$@"
+    "$CONTAINER_TOOL" "$@"
+}
+
+case "$VARIANT" in
+    nvidia)
+        run_variant nvidia
+        ;;
+    amd)
+        run_variant amd
+        ;;
+    nogpu)
+        run_variant nogpu
+        ;;
+    all)
+        run_variant nvidia
+        run_variant amd
+        run_variant nogpu
+        ;;
+esac
--- a/iso/builder/build-john.sh
+++ b/iso/builder/build-john.sh
@@ -0,0 +1,71 @@
+#!/bin/sh
+# build-john.sh — build John the Ripper jumbo with OpenCL support for the LiveCD.
+#
+# Downloads a pinned source snapshot from the official openwall/john repository,
+# builds it inside the builder container, and caches the resulting run/ tree.
+#
+# Usage: build-john.sh <john-commit> <dist-dir>
+#   john-commit  commit of openwall/john to fetch as a source tarball
+#   dist-dir     directory that receives the john-<commit>/run cache tree
+
+set -e
+
+JOHN_COMMIT="$1"
+DIST_DIR="$2"
+
+# Usage errors are diagnostics: print them on stderr.
+[ -n "$JOHN_COMMIT" ] || { echo "usage: $0 <john-commit> <dist-dir>" >&2; exit 1; }
+[ -n "$DIST_DIR" ] || { echo "usage: $0 <john-commit> <dist-dir>" >&2; exit 1; }
+
+echo "=== John the Ripper jumbo ${JOHN_COMMIT} ==="
+
+CACHE_DIR="${DIST_DIR}/john-${JOHN_COMMIT}"
+CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
+DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/john-downloads"
+SRC_TAR="${DOWNLOAD_CACHE_DIR}/john-${JOHN_COMMIT}.tar.gz"
+SRC_URL="https://github.com/openwall/john/archive/${JOHN_COMMIT}.tar.gz"
+
+# Reuse a previous build only when both the binary and its config are present.
+if [ -x "${CACHE_DIR}/run/john" ] && [ -f "${CACHE_DIR}/run/john.conf" ]; then
+    echo "=== john cached, skipping build ==="
+    echo "run dir: ${CACHE_DIR}/run"
+    exit 0
+fi
+
+mkdir -p "${DOWNLOAD_CACHE_DIR}"
+if [ ! -f "${SRC_TAR}" ]; then
+    echo "=== downloading john source snapshot ==="
+    # Fetch under a temporary name and rename on success, so an interrupted
+    # download is never mistaken for a complete cached tarball on the next run.
+    rm -f "${SRC_TAR}.part"
+    wget --show-progress -O "${SRC_TAR}.part" "${SRC_URL}"
+    mv "${SRC_TAR}.part" "${SRC_TAR}"
+fi
+
+BUILD_TMP=$(mktemp -d)
+# Clean up in the EXIT trap only; INT/TERM just exit so the script stops
+# instead of continuing with a deleted build directory.
+trap 'rm -rf "${BUILD_TMP}"' EXIT
+trap 'exit 1' INT TERM
+
+cd "${BUILD_TMP}"
+tar xf "${SRC_TAR}"
+SRC_DIR=$(find . -maxdepth 1 -type d -name 'john-*' | head -1)
+[ -n "${SRC_DIR}" ] || { echo "ERROR: john source directory not found"; exit 1; }
+
+cd "${SRC_DIR}/src"
+echo "=== configuring john ==="
+./configure
+echo "=== building john ==="
+make clean >/dev/null 2>&1 || true
+make -j"$(nproc)"
+
+mkdir -p "${CACHE_DIR}"
+# Drop any partial run/ tree left by an interrupted build; with it in place
+# `cp -a` would nest the result as run/run and the cache check would never pass.
+rm -rf "${CACHE_DIR}/run"
+cp -a "../run" "${CACHE_DIR}/run"
+chmod +x "${CACHE_DIR}/run/john"
+
+echo "=== john build complete ==="
+echo "run dir: ${CACHE_DIR}/run"
--- a/iso/builder/build-nccl-tests.sh
+++ b/iso/builder/build-nccl-tests.sh
@@ -9,6 +9,7 @@
 #
 # Output layout:
 #   $CACHE_DIR/bin/all_reduce_perf
+#   $CACHE_DIR/lib/libcudart.so* copied from the nvcc toolchain used to build nccl-tests

 set -e

@@ -30,7 +31,7 @@ CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"

-if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
+if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ] && [ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' 2>/dev/null | wc -l)" -gt 0 ]; then
    echo "=== nccl-tests cached, skipping build ==="
    echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
    exit 0
@@ -52,6 +53,23 @@ echo "nvcc: $NVCC"
 CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
 echo "CUDA_HOME: $CUDA_HOME"

+find_cudart_dir() {
+    for dir in \
+        "${CUDA_HOME}/targets/x86_64-linux/lib" \
+        "${CUDA_HOME}/targets/x86_64-linux/lib/stubs" \
+        "${CUDA_HOME}/lib64" \
+        "${CUDA_HOME}/lib"; do
+        if [ -d "$dir" ] && find "$dir" -maxdepth 1 -name 'libcudart.so*' -type f | grep -q .; then
+            printf '%s\n' "$dir"
+            return 0
+        fi
+    done
+    return 1
+}
+
+CUDART_DIR="$(find_cudart_dir)" || { echo "ERROR: libcudart.so* not found under ${CUDA_HOME}"; exit 1; }
+echo "cudart dir: $CUDART_DIR"
+
 # Download libnccl-dev for nccl.h
 REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
 DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
@@ -136,6 +154,11 @@ mkdir -p "${CACHE_DIR}/bin"
 cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
 chmod +x "${CACHE_DIR}/bin/all_reduce_perf"

+mkdir -p "${CACHE_DIR}/lib"
+find "${CUDART_DIR}" -maxdepth 1 -name 'libcudart.so*' -type f -exec cp -a {} "${CACHE_DIR}/lib/" \;
+[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' -type f | wc -l)" -gt 0 ] || { echo "ERROR: libcudart runtime copy failed"; exit 1; }
+
 echo "=== nccl-tests build complete ==="
 echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
 ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
+ls -lh "${CACHE_DIR}/lib/"libcudart.so* 2>/dev/null || true
--- a/iso/builder/build-nvidia-module.sh
+++ b/iso/builder/build-nvidia-module.sh
@@ -10,7 +10,7 @@
 # Output layout:
 #   $CACHE_DIR/modules/   — nvidia*.ko files
 #   $CACHE_DIR/bin/       — nvidia-smi, nvidia-debugdump
-#   $CACHE_DIR/lib/       — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
+#   $CACHE_DIR/lib/       — libnvidia-ml.so*, libcuda.so*, OpenCL-related libs

 set -e

@@ -46,7 +46,10 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
 EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
+CACHE_LAYOUT_VERSION="2"
+CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
 if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
+        && [ -f "$CACHE_LAYOUT_MARKER" ] \
        && [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
    echo "=== NVIDIA cached, skipping build ==="
    echo "cache: $CACHE_DIR"
@@ -130,17 +133,30 @@ else
    echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
 fi

-# Copy ALL userspace library files.
-# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
-# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
-for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
-    count=0
-    for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
-        cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
-    done
-    if [ "$count" -eq 0 ]; then
-        echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
-        ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
+# Copy NVIDIA userspace libraries broadly instead of whitelisting a few names.
+# Newer driver branches add extra runtime deps (for example OpenCL/compiler side
+# libraries). If we only copy a narrow allowlist, clinfo/John can see nvidia.icd
+# but still fail with "no OpenCL platforms" because one dependent .so is absent.
+copied_libs=0
+for f in $(find "$EXTRACT_DIR" -maxdepth 1 \( -name 'libnvidia*.so.*' -o -name 'libcuda.so.*' \) -type f 2>/dev/null | sort); do
+    cp "$f" "$CACHE_DIR/lib/"
+    copied_libs=$((copied_libs+1))
+done
+
+if [ "$copied_libs" -eq 0 ]; then
+    echo "ERROR: no NVIDIA userspace libraries found in $EXTRACT_DIR"
+    ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -40 || true
+    exit 1
+fi
+
+for lib in \
+    libnvidia-ml \
+    libcuda \
+    libnvidia-ptxjitcompiler \
+    libnvidia-opencl; do
+    if ! ls "$CACHE_DIR/lib/${lib}.so."* >/dev/null 2>&1; then
+        echo "ERROR: required ${lib}.so.* not found in extracted userspace libs"
+        ls "$CACHE_DIR/lib/" | sort >&2 || true
        exit 1
    fi
 done
@@ -149,16 +165,24 @@ done
 ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
 [ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }

-# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
-for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
-    versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
-    [ -n "$versioned" ] || continue
+# Create soname symlinks for every copied versioned library.
+# Only real files are used as link targets, and an existing real file named
+# <stem>.so.1 is never replaced: linking it to itself would create a circular
+# symlink and destroy the library.
+for versioned in "$CACHE_DIR"/lib/*.so.*; do
+    [ -f "$versioned" ] || continue
+    [ ! -L "$versioned" ] || continue
    base=$(basename "$versioned")
-    ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
-    ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
-    echo "${lib}: .so.1 -> $base"
+    stem=${base%%.so.*}
+    soname_link="$CACHE_DIR/lib/${stem}.so.1"
+    if [ -L "$soname_link" ] || [ ! -e "$soname_link" ]; then
+        ln -sf "$base" "$soname_link"
+    fi
+    ln -sf "${stem}.so.1" "$CACHE_DIR/lib/${stem}.so" 2>/dev/null || true
 done

+touch "$CACHE_LAYOUT_MARKER"
+
 echo "=== NVIDIA build complete ==="
 echo "cache: $CACHE_DIR"
 echo "modules: $ko_count .ko files"
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
--- a/iso/builder/config/bootloaders/grub-pc/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg
@@ -10,25 +10,45 @@ echo "  ╚══════╝╚═╝  ╚═╝╚══════╝
 echo ""

 menuentry "EASY-BEE" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+    initrd  @INITRD_LIVE@
+}
+
+menuentry "EASY-BEE (graphics/KMS)" {
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }

 menuentry "EASY-BEE (load to RAM)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }

 menuentry "EASY-BEE (NVIDIA GSP=off)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+    initrd  @INITRD_LIVE@
+}
+
+menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }

 menuentry "EASY-BEE (fail-safe)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
    initrd  @INITRD_LIVE@
 }

+if [ "${grub_platform}" = "efi" ]; then
+    menuentry "Memory Test (memtest86+)" {
+        chainloader /boot/memtest86+x64.efi
+    }
+else
+    menuentry "Memory Test (memtest86+)" {
+        linux16 /boot/memtest86+x64.bin
+    }
+fi
+
 if [ "${grub_platform}" = "efi" ]; then
    menuentry "UEFI Firmware Settings" {
        fwsetup
--- a/iso/builder/config/bootloaders/isolinux/live.cfg.in
+++ b/iso/builder/config/bootloaders/isolinux/live.cfg.in
@@ -5,6 +5,12 @@ label live-@FLAVOUR@-normal
    initrd @INITRD@
    append @APPEND_LIVE@ bee.nvidia.mode=normal

+label live-@FLAVOUR@-kms
+    menu label EASY-BEE (^graphics/KMS)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
+
 label live-@FLAVOUR@-toram
    menu label EASY-BEE (^load to RAM)
    linux @LINUX@
@@ -15,10 +21,20 @@ label live-@FLAVOUR@-gsp-off
    menu label EASY-BEE (^NVIDIA GSP=off)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.nvidia.mode=gsp-off
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
+
+label live-@FLAVOUR@-kms-gsp-off
+    menu label EASY-BEE (g^raphics/KMS, GSP=off)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off

 label live-@FLAVOUR@-failsafe
    menu label EASY-BEE (^fail-safe)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
+
+label memtest
+    menu label ^Memory Test (memtest86+)
+    linux /boot/memtest86+x64.bin
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -5,29 +5,32 @@ set -e

 echo "=== bee chroot setup ==="

+GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)
+echo "=== GPU vendor: ${GPU_VENDOR} ==="
+
 ensure_bee_console_user() {
    if id bee >/dev/null 2>&1; then
-        usermod -d /home/bee -s /bin/sh bee 2>/dev/null || true
+        usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true
    else
-        useradd -d /home/bee -m -s /bin/sh -U bee
+        useradd -d /home/bee -m -s /bin/bash -U bee
    fi

    mkdir -p /home/bee
    chown -R bee:bee /home/bee
    echo "bee:eeb" | chpasswd
-    usermod -aG sudo,video,input bee 2>/dev/null || true
+    groupadd -f ipmi 2>/dev/null || true
+    usermod -aG sudo,video,input,render,ipmi bee 2>/dev/null || true
 }

 ensure_bee_console_user

-# Enable bee services
-systemctl enable nvidia-dcgm.service 2>/dev/null || true
+# Enable common bee services
 systemctl enable bee-network.service
-systemctl enable bee-nvidia.service
 systemctl enable bee-preflight.service
 systemctl enable bee-audit.service
 systemctl enable bee-web.service
 systemctl enable bee-sshsetup.service
+systemctl enable bee-selfheal.timer
 systemctl enable ssh.service
 systemctl enable lightdm.service 2>/dev/null || true
 systemctl enable qemu-guest-agent.service 2>/dev/null || true
@@ -35,23 +38,38 @@ systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
 systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
 systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true

+# Enable GPU-vendor specific services
+if [ "$GPU_VENDOR" = "nvidia" ]; then
+    systemctl enable nvidia-dcgm.service 2>/dev/null || true
+    systemctl enable bee-nvidia.service
+elif [ "$GPU_VENDOR" = "amd" ]; then
+    # ROCm symlinks (packages install to /opt/rocm-*/bin/)
+    for tool in rocm-smi rocm-bandwidth-test rvs; do
+        if [ ! -e /usr/local/bin/${tool} ]; then
+            bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
+            [ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
+        fi
+    done
+fi
+# nogpu: no GPU services needed
+
 # Ensure scripts are executable
 chmod +x /usr/local/bin/bee-network.sh  2>/dev/null || true
-chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
 chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
+chmod +x /usr/local/bin/bee-selfheal   2>/dev/null || true
+if [ "$GPU_VENDOR" = "nvidia" ]; then
+    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
+    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
+    chmod +x /usr/local/bin/bee-john-gpu-stress 2>/dev/null || true
+    chmod +x /usr/local/bin/bee-nccl-gpu-stress 2>/dev/null || true
+fi

 # Reload udev rules
 udevadm control --reload-rules 2>/dev/null || true

-# rocm-smi symlink (package installs to /opt/rocm-*/bin/rocm-smi)
-if [ ! -e /usr/local/bin/rocm-smi ]; then
-    smi_path="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
-    [ -n "${smi_path}" ] && ln -sf "${smi_path}" /usr/local/bin/rocm-smi
-fi
-
 # Create export directory
 mkdir -p /appdata/bee/export

@@ -59,4 +77,4 @@ if [ -f /etc/sudoers.d/bee ]; then
    chmod 0440 /etc/sudoers.d/bee
 fi

-echo "=== bee chroot setup complete ==="
+echo "=== bee chroot setup complete (${GPU_VENDOR}) ==="
--- a/iso/builder/config/hooks/normal/9100-memtest.hook.binary
+++ b/iso/builder/config/hooks/normal/9100-memtest.hook.binary
@@ -0,0 +1,139 @@
+#!/bin/sh
+# Ensure memtest is present in the final ISO even if live-build's built-in
+# memtest stage does not copy the binaries or expose menu entries.
+set -e
+
+: "${BEE_REQUIRE_MEMTEST:=0}"
+
+MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
+BINARY_BOOT_DIR="binary/boot"
+GRUB_CFG="binary/boot/grub/grub.cfg"
+ISOLINUX_CFG="binary/isolinux/live.cfg"
+
+log() {
+    echo "memtest hook: $*"
+}
+
+fail_or_warn() {
+    msg="$1"
+    if [ "${BEE_REQUIRE_MEMTEST}" = "1" ]; then
+        log "ERROR: ${msg}"
+        exit 1
+    fi
+    log "WARNING: ${msg}"
+    return 0
+}
+
+copy_memtest_file() {
+    src="$1"
+    base="$(basename "$src")"
+    dst="${BINARY_BOOT_DIR}/${base}"
+
+    [ -f "$src" ] || return 1
+    mkdir -p "${BINARY_BOOT_DIR}"
+    cp "$src" "$dst"
+    log "copied ${base} from ${src}"
+}
+
+extract_memtest_from_deb() {
+    deb="$1"
+    tmpdir="$(mktemp -d)"
+    # On extraction failure clean up and defer to fail_or_warn instead of letting `set -e` abort the build.
+    log "extracting memtest payload from ${deb}"
+    dpkg-deb -x "$deb" "$tmpdir" || { rm -rf "$tmpdir"; fail_or_warn "cannot extract ${deb}"; return 0; }
+    for f in ${MEMTEST_FILES}; do
+        if [ -f "${tmpdir}/boot/${f}" ]; then
+            copy_memtest_file "${tmpdir}/boot/${f}"
+        fi
+    done
+    rm -rf "$tmpdir"
+}
+
+ensure_memtest_binaries() {
+    missing=0
+    for f in ${MEMTEST_FILES}; do
+        [ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
+    done
+    [ "$missing" -eq 1 ] || return 0
+
+    for root in chroot/boot /boot; do
+        for f in ${MEMTEST_FILES}; do
+            [ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
+        done
+    done
+
+    missing=0
+    for f in ${MEMTEST_FILES}; do
+        [ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
+    done
+    [ "$missing" -eq 1 ] || return 0
+
+    for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
+        [ -d "$root" ] || continue
+        deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
+        [ -n "$deb" ] || continue
+        extract_memtest_from_deb "$deb"
+        break
+    done
+
+    missing=0
+    for f in ${MEMTEST_FILES}; do
+        if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
+            fail_or_warn "missing ${BINARY_BOOT_DIR}/${f}"
+            missing=1
+        fi
+    done
+    [ "$missing" -eq 0 ] || return 0
+}
+
+ensure_grub_entry() {
+    [ -f "$GRUB_CFG" ] || {
+        fail_or_warn "missing ${GRUB_CFG}"
+        return 0
+    }
+    # Idempotent: skip when our marker or any memtest86+ entry (e.g. one from the grub.cfg template) already exists.
+    grep -Fq -e '### BEE MEMTEST ###' -e 'memtest86+x64' "$GRUB_CFG" && return 0
+
+    cat >> "$GRUB_CFG" <<'EOF'
+
+### BEE MEMTEST ###
+if [ "${grub_platform}" = "efi" ]; then
+    menuentry "Memory Test (memtest86+)" {
+        chainloader /boot/memtest86+x64.efi
+    }
+else
+    menuentry "Memory Test (memtest86+)" {
+        linux16 /boot/memtest86+x64.bin
+    }
+fi
+### /BEE MEMTEST ###
+EOF
+
+    log "appended memtest entry to ${GRUB_CFG}"
+}
+
+ensure_isolinux_entry() {
+    [ -f "$ISOLINUX_CFG" ] || {
+        fail_or_warn "missing ${ISOLINUX_CFG}"
+        return 0
+    }
+    # Idempotent: skip when our marker or any memtest86+ entry (e.g. one from live.cfg.in) already exists.
+    grep -Fq -e '### BEE MEMTEST ###' -e 'memtest86+x64' "$ISOLINUX_CFG" && return 0
+
+    cat >> "$ISOLINUX_CFG" <<'EOF'
+
+# ### BEE MEMTEST ###
+label memtest
+    menu label ^Memory Test (memtest86+)
+    linux /boot/memtest86+x64.bin
+# ### /BEE MEMTEST ###
+EOF
+
+    log "appended memtest entry to ${ISOLINUX_CFG}"
+}
+
+log "ensuring memtest binaries and menu entries in binary image"
+ensure_memtest_binaries
+ensure_grub_entry
+ensure_isolinux_entry
+log "memtest assets ready"
--- a/iso/builder/config/package-lists/bee-amd.list.chroot
+++ b/iso/builder/config/package-lists/bee-amd.list.chroot
@@ -0,0 +1,12 @@
+# AMD GPU firmware
+firmware-amd-graphics
+
+# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
+rocm-smi-lib=%%ROCM_SMI_VERSION%%
+rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
+rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
+rocblas=%%ROCBLAS_VERSION%%
+rocrand=%%ROCRAND_VERSION%%
+hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
+hipblaslt=%%HIPBLASLT_VERSION%%
+comgr=%%COMGR_VERSION%%
--- a/iso/builder/config/package-lists/bee-nogpu.list.chroot
+++ b/iso/builder/config/package-lists/bee-nogpu.list.chroot
@@ -0,0 +1 @@
+# No GPU variant — no NVIDIA, no AMD/ROCm packages
--- a/iso/builder/config/package-lists/bee-nvidia.list.chroot
+++ b/iso/builder/config/package-lists/bee-nvidia.list.chroot
@@ -0,0 +1,8 @@
+# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
+# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
+# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
+datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
+datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
+datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
+ocl-icd-libopencl1
+clinfo
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -21,8 +21,15 @@ openssh-server
 # Disk installer
 squashfs-tools
 parted
+# Keep GRUB install tools without selecting a single active platform package.
+# grub-pc and grub-efi-amd64 conflict with each other, but grub2-common
+# provides grub-install/update-grub and the *-bin packages provide BIOS/UEFI modules.
+grub2-common
 grub-pc-bin
 grub-efi-amd64-bin
+grub-efi-amd64-signed
+shim-signed
+efibootmgr

 # Filesystem support for USB export targets
 exfatprogs
@@ -39,11 +46,13 @@ vim-tiny
 mc
 htop
 nvtop
+btop
 sudo
 zstd
 mstflint
 memtester
 stress-ng
+stressapptest

 # QR codes (for displaying audit results)
 qrencode
@@ -62,19 +71,11 @@ lightdm
 firmware-linux-free
 firmware-linux-nonfree
 firmware-misc-nonfree
-firmware-amd-graphics
 firmware-realtek
-firmware-intel-sound
 firmware-bnx2
 firmware-bnx2x
 firmware-cavium
 firmware-qlogic

-# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
-datacenter-gpu-manager=1:%%DCGM_VERSION%%
-
-# AMD ROCm SMI — GPU monitoring for Instinct cards (repo: rocm/apt/6.3.4 jammy)
-rocm-smi-lib=%%ROCM_SMI_VERSION%%
-
 # glibc compat helpers (for any external binaries that need it)
 libc6
--- a/iso/builder/smoketest.sh
+++ b/iso/builder/smoketest.sh
@@ -39,7 +39,7 @@ info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
 # --- PATH & binaries ---
 echo "-- PATH & binaries --"
 for tool in dmidecode smartctl nvme ipmitool lspci bee; do
-    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
+    if p=$(PATH="/usr/local/bin:/usr/sbin:/sbin:$PATH" command -v "$tool" 2>/dev/null); then
        ok "$tool found: $p"
    else
        fail "$tool: NOT FOUND"
@@ -52,6 +52,14 @@ else
    fail "nvidia-smi: NOT FOUND"
 fi

+for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
+    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
+        ok "$tool found: $p"
+    else
+        fail "$tool: NOT FOUND"
+    fi
+done
+
 echo ""
 echo "-- NVIDIA modules --"
 KO_DIR="/usr/local/lib/nvidia"
@@ -109,6 +117,40 @@ else
    fail "nvidia-smi: not found in PATH"
 fi

+echo ""
+echo "-- OpenCL / John --"
+if [ -f /etc/OpenCL/vendors/nvidia.icd ]; then
+    ok "OpenCL ICD present: /etc/OpenCL/vendors/nvidia.icd"
+else
+    fail "OpenCL ICD missing: /etc/OpenCL/vendors/nvidia.icd"
+fi
+
+if ldconfig -p 2>/dev/null | grep -q "libnvidia-opencl.so.1"; then
+    ok "libnvidia-opencl.so.1 present in linker cache"
+else
+    fail "libnvidia-opencl.so.1 missing from linker cache"
+fi
+
+if command -v clinfo >/dev/null 2>&1; then
+    if clinfo -l 2>/dev/null | grep -q "Platform"; then
+        ok "clinfo: OpenCL platform detected"
+    else
+        fail "clinfo: no OpenCL platform detected"
+    fi
+else
+    fail "clinfo: not found in PATH"
+fi
+
+if command -v john >/dev/null 2>&1; then
+    if john --list=opencl-devices 2>/dev/null | grep -q "Device #"; then
+        ok "john: OpenCL devices detected"
+    else
+        fail "john: no OpenCL devices detected"
+    fi
+else
+    fail "john: not found in PATH"
+fi
+
 echo ""
 echo "-- lib symlinks --"
 for lib in libnvidia-ml libcuda; do
@@ -129,6 +171,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
    fi
 done

+if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
+    ok "timer active: bee-selfheal.timer"
+else
+    fail "timer NOT active: bee-selfheal.timer"
+fi
+
 echo ""
 echo "-- runtime health --"
 if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
--- a/iso/overlay/etc/modules-load.d/bee-ipmi.conf
+++ b/iso/overlay/etc/modules-load.d/bee-ipmi.conf
@@ -0,0 +1,3 @@
+# Load IPMI modules for fan/sensor/power monitoring via ipmitool
+ipmi_si
+ipmi_devintf
--- a/iso/overlay/etc/profile.d/bee.sh
+++ b/iso/overlay/etc/profile.d/bee.sh
@@ -1,4 +1,4 @@
-export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
+export PATH="$PATH:/usr/local/bin:/usr/sbin:/sbin:/opt/rocm/bin:/opt/rocm/sbin"

 # Print web UI URLs on the local console at login.
 if [ -z "${SSH_CONNECTION:-}" ] \
--- a/iso/overlay/etc/systemd/system/bee-audit.service
+++ b/iso/overlay/etc/systemd/system/bee-audit.service
@@ -1,14 +1,13 @@
 [Unit]
-Description=Bee: run hardware audit
-After=bee-network.service bee-nvidia.service bee-preflight.service
-Before=bee-web.service
+Description=Bee: hardware audit
+After=bee-preflight.service bee-network.service bee-nvidia.service

 [Service]
 Type=oneshot
-ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
+RemainAfterExit=yes
+ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /usr/local/bin/bee audit --runtime auto --output file:/appdata/bee/export/bee-audit.json
 StandardOutput=journal
 StandardError=journal
-RemainAfterExit=yes

 [Install]
 WantedBy=multi-user.target
--- a/iso/overlay/etc/systemd/system/bee-selfheal.service
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.service
@@ -0,0 +1,9 @@
+[Unit]
+Description=Bee: periodic runtime self-heal
+After=bee-web.service bee-audit.service bee-preflight.service
+
+[Service]
+Type=oneshot
+ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
+StandardOutput=journal
+StandardError=journal
--- a/iso/overlay/etc/systemd/system/bee-selfheal.timer
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Bee: run self-heal checks periodically
+
+[Timer]
+OnBootSec=45sec
+OnUnitActiveSec=60sec
+AccuracySec=15sec
+Unit=bee-selfheal.service
+
+[Install]
+WantedBy=timers.target
--- a/iso/overlay/etc/systemd/system/bee-web.service
+++ b/iso/overlay/etc/systemd/system/bee-web.service
@@ -1,15 +1,18 @@
 [Unit]
 Description=Bee: hardware audit web viewer
-After=bee-network.service bee-audit.service
-Wants=bee-audit.service
+StartLimitIntervalSec=0

 [Service]
 Type=simple
 ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
 Restart=always
-RestartSec=2
+RestartSec=3
 StandardOutput=journal
 StandardError=journal
+LimitMEMLOCK=infinity
+# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
+# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
+Nice=0

 [Install]
 WantedBy=multi-user.target
--- a/iso/overlay/etc/systemd/system/lightdm.service.d/bee-display-mode.conf
+++ b/iso/overlay/etc/systemd/system/lightdm.service.d/bee-display-mode.conf
@@ -0,0 +1,6 @@
+[Unit]
+Wants=bee-preflight.service
+After=bee-preflight.service
+
+[Service]
+ExecStartPre=/usr/local/bin/bee-display-mode
--- a/iso/overlay/etc/systemd/system/lightdm.service.d/bee-limits.conf
+++ b/iso/overlay/etc/systemd/system/lightdm.service.d/bee-limits.conf
@@ -4,3 +4,6 @@
 RestartSec=10
 StartLimitIntervalSec=60
 StartLimitBurst=3
+# Raise scheduling priority of the X server so the graphical console (KVM/IPMI)
+# stays responsive during GPU/CPU stress tests running at nice+10.
+Nice=-5
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`# No GPU variant — no NVIDIA, no AMD/ROCm packages`