Fix benchmark report methodology and rebuild gpu burn worker on toolchain changes

Unify benchmark exports and drop ASCII charts
Add per-precision benchmark phases, weighted TOPS scoring, and ECC tracking
2026-04-13 23:43:12 +03:00 · 2026-04-13 21:38:28 +03:00 · 2026-04-13 10:49:49 +03:00 · 2026-04-12 22:46:42 +03:00 · 2026-04-12 22:36:51 +03:00 · 2026-04-12 22:33:17 +03:00
166 changed files with 28200 additions and 4487 deletions
--- a/PLAN.md
+++ b/PLAN.md
@@ -343,9 +343,9 @@ Planned code shape:
 - `bee tui` can rerun the audit manually
 - `bee tui` can export the latest audit JSON to removable media
 - `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
+- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
 - SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
+- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
 - removable export requires explicit target selection, mount, confirmation, copy, and cleanup

 ### 2.6 — Vendor utilities and optional assets
--- a/audit/Makefile
+++ b/audit/Makefile
@@ -0,0 +1,22 @@
+LISTEN ?= :8080
+AUDIT_PATH ?=
+EXPORT_DIR ?= $(CURDIR)/.tmp/export
+VERSION ?= $(shell sh ./scripts/resolve-version.sh)
+GO_LDFLAGS := -X main.Version=$(VERSION)
+
+RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
+ifneq ($(AUDIT_PATH),)
+RUN_ARGS += --audit-path $(AUDIT_PATH)
+endif
+
+.PHONY: run build test # none of these targets produce a file named after themselves
+
+run: # create the export dir, then start "bee web" from source with the version stamped in
+	mkdir -p $(EXPORT_DIR)
+	go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
+
+build: # build ./cmd/bee into ./bee with main.Version set through -ldflags
+	go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
+
+test: # run the tests of every package in this module
+	go test ./...
--- a/audit/bee
+++ b/audit/bee
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -1,30 +1,49 @@
 package main

 import (
+	"context"
 	"flag"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
+	"runtime/debug"
+	"strconv"
 	"strings"

 	"bee/audit/internal/app"
 	"bee/audit/internal/platform"
 	"bee/audit/internal/runtimeenv"
-	"bee/audit/internal/tui"
 	"bee/audit/internal/webui"
 )

 var Version = "dev"

+func buildLabel() string { // buildLabel returns Version with surrounding whitespace trimmed, or "dev" if nothing is left.
+	label := strings.TrimSpace(Version)
+	if label == "" { // Version was empty or whitespace-only
+		return "dev"
+	}
+	return label
+}
+
 func main() {
 	os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
 }

-func run(args []string, stdout, stderr io.Writer) int {
+func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level: slog.LevelInfo,
 	})))
+	defer func() {
+		if rec := recover(); rec != nil {
+			slog.Error("fatal panic",
+				"panic", fmt.Sprint(rec),
+				"stack", string(debug.Stack()),
+			)
+			exitCode = 1
+		}
+	}()

 	if len(args) == 0 {
 		printRootUsage(stderr)
@@ -40,8 +59,6 @@ func run(args []string, stdout, stderr io.Writer) int {
 		return 0
 	case "audit":
 		return runAudit(args[1:], stdout, stderr)
-	case "tui":
-		return runTUI(args[1:], stdout, stderr)
 	case "export":
 		return runExport(args[1:], stdout, stderr)
 	case "preflight":
@@ -52,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
 		return runWeb(args[1:], stdout, stderr)
 	case "sat":
 		return runSAT(args[1:], stdout, stderr)
+	case "benchmark":
+		return runBenchmark(args[1:], stdout, stderr)
 	case "version", "--version", "-version":
 		fmt.Fprintln(stdout, Version)
 		return 0
@@ -66,11 +85,11 @@ func printRootUsage(w io.Writer) {
 	fmt.Fprintln(w, `bee commands:
  bee audit   --runtime auto|local|livecd --output stdout|file:<path>
  bee preflight --output stdout|file:<path>
-  bee tui     --runtime auto|local|livecd
  bee export  --target <device>
  bee support-bundle --output stdout|file:<path>
-  bee web     --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
+  bee web     --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
  bee sat nvidia|memory|storage|cpu [--duration <seconds>]
+  bee benchmark nvidia [--profile standard|stability|overnight]
  bee version
  bee help [command]`)
 }
@@ -79,8 +98,6 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 	switch args[0] {
 	case "audit":
 		return runAudit([]string{"--help"}, stdout, stdout)
-	case "tui":
-		return runTUI([]string{"--help"}, stdout, stdout)
 	case "export":
 		return runExport([]string{"--help"}, stdout, stdout)
 	case "preflight":
@@ -91,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 		return runWeb([]string{"--help"}, stdout, stdout)
 	case "sat":
 		return runSAT([]string{"--help"}, stdout, stderr)
+	case "benchmark":
+		return runBenchmark([]string{"--help"}, stdout, stderr)
 	case "version":
 		fmt.Fprintln(stdout, "usage: bee version")
 		return 0
@@ -145,43 +164,6 @@ func runAudit(args []string, stdout, stderr io.Writer) int {
 	return 0
 }

-func runTUI(args []string, stdout, stderr io.Writer) int {
-	fs := flag.NewFlagSet("tui", flag.ContinueOnError)
-	fs.SetOutput(stderr)
-	runtimeFlag := fs.String("runtime", "auto", "runtime environment: auto, local, livecd")
-	fs.Usage = func() {
-		fmt.Fprintln(stderr, "usage: bee tui [--runtime auto|local|livecd]")
-		fs.PrintDefaults()
-	}
-	if err := fs.Parse(args); err != nil {
-		if err == flag.ErrHelp {
-			return 0
-		}
-		return 2
-	}
-	if fs.NArg() != 0 {
-		fs.Usage()
-		return 2
-	}
-
-	runtimeInfo, err := runtimeenv.Detect(*runtimeFlag)
-	if err != nil {
-		slog.Error("resolve runtime", "err", err)
-		return 1
-	}
-
-	slog.SetDefault(slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{
-		Level: slog.LevelInfo,
-	})))
-
-	application := app.New(platform.New())
-	if err := tui.Run(application, runtimeInfo.Mode); err != nil {
-		slog.Error("run tui", "err", err)
-		return 1
-	}
-	return 0
-}
-
 func runExport(args []string, stdout, stderr io.Writer) int {
 	fs := flag.NewFlagSet("export", flag.ContinueOnError)
 	fs.SetOutput(stderr)
@@ -314,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
 	fs := flag.NewFlagSet("web", flag.ContinueOnError)
 	fs.SetOutput(stderr)
 	listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
-	auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
+	auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
 	exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
 	title := fs.String("title", "Bee Hardware Audit", "page title")
 	fs.Usage = func() {
@@ -333,10 +315,19 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
 	}

 	slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)
+
+	runtimeInfo, err := runtimeenv.Detect("auto")
+	if err != nil {
+		slog.Warn("resolve runtime for web", "err", err)
+	}
+
 	if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
-		Title:     *title,
-		AuditPath: *auditPath,
-		ExportDir: *exportDir,
+		Title:       *title,
+		BuildLabel:  buildLabel(),
+		AuditPath:   *auditPath,
+		ExportDir:   *exportDir,
+		App:         app.New(platform.New()),
+		RuntimeMode: runtimeInfo.Mode,
 	}); err != nil {
 		slog.Error("run web", "err", err)
 		return 1
@@ -357,6 +348,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	fs := flag.NewFlagSet("sat", flag.ContinueOnError)
 	fs.SetOutput(stderr)
 	duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
+	diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
 	if err := fs.Parse(args[1:]); err != nil {
 		if err == flag.ErrHelp {
 			return 0
@@ -371,7 +363,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	target := args[0]
 	if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
 		fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
-		fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
+		fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
 		return 2
 	}

@@ -380,19 +372,25 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 		archive string
 		err     error
 	)
+	logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
 	switch target {
 	case "nvidia":
-		archive, err = application.RunNvidiaAcceptancePack("")
+		level := *diagLevel
+		if level > 0 {
+			_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
+		} else {
+			archive, err = application.RunNvidiaAcceptancePack("", logLine)
+		}
 	case "memory":
-		archive, err = application.RunMemoryAcceptancePack("")
+		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
 	case "storage":
-		archive, err = application.RunStorageAcceptancePack("")
+		archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
 	case "cpu":
 		dur := *duration
 		if dur <= 0 {
 			dur = 60
 		}
-		archive, err = application.RunCPUAcceptancePack("", dur)
+		archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
 	}
 	if err != nil {
 		slog.Error("run sat", "target", target, "err", err)
@@ -401,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	slog.Info("sat archive written", "target", target, "path", archive)
 	return 0
 }
+
+func runBenchmark(args []string, stdout, stderr io.Writer) int { // runBenchmark implements "bee benchmark"; it returns 0 on success or help, 2 on usage errors, 1 when the run fails.
+	if len(args) == 0 { // no target given
+		fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
+		return 2
+	}
+	if args[0] == "help" || args[0] == "--help" || args[0] == "-h" { // help is answered before target validation and goes to stdout
+		fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
+		return 0
+	}
+	target := args[0]
+	if target != "nvidia" { // nvidia is the only accepted target
+		fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
+		fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
+		return 2
+	}
+
+	fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+	profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
+	devices := fs.String("devices", "", "comma-separated GPU indices to include")
+	exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
+	sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
+	skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
+	if err := fs.Parse(args[1:]); err != nil { // flags follow the target, so args[0] is skipped
+		if err == flag.ErrHelp { // a help flag after the target is not treated as an error
+			return 0
+		}
+		return 2
+	}
+	if fs.NArg() != 0 { // positional arguments left over after the flags are rejected
+		fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
+		return 2
+	}
+
+	includeIndices, err := parseBenchmarkIndexCSV(*devices)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
+		return 2
+	}
+	excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
+		return 2
+	}
+
+	application := app.New(platform.New())
+	logLine := func(s string) { fmt.Fprintln(os.Stderr, s) } // writes to the process os.Stderr, not to the stderr writer passed in
+	archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{ // "" base dir: presumably resolved to a default by the callee (TODO confirm)
+		Profile:           *profile,
+		SizeMB:            *sizeMB,
+		GPUIndices:        includeIndices,
+		ExcludeGPUIndices: excludeIndices,
+		RunNCCL:           !*skipNCCL, // NCCL runs unless --skip-nccl was given
+	}, logLine)
+	if err != nil {
+		slog.Error("run benchmark", "target", target, "err", err)
+		return 1
+	}
+	slog.Info("benchmark archive written", "target", target, "path", archive)
+	return 0
+}
+
+func parseBenchmarkIndexCSV(raw string) ([]int, error) { // parseBenchmarkIndexCSV parses a comma-separated list of non-negative GPU indices.
+	raw = strings.TrimSpace(raw)
+	if raw == "" { // blank input yields a nil slice and no error
+		return nil, nil
+	}
+	var indices []int // stays nil when every element turns out to be empty, e.g. ","
+	for _, part := range strings.Split(raw, ",") {
+		part = strings.TrimSpace(part)
+		if part == "" { // empty elements ("0,,1", trailing comma) are skipped rather than rejected
+			continue
+		}
+		value, err := strconv.Atoi(part)
+		if err != nil || value < 0 { // non-numeric and negative entries produce the same error
+			return nil, fmt.Errorf("bad gpu index %q", part)
+		}
+		indices = append(indices, value) // input order and duplicates are kept as given
+	}
+	return indices, nil
+}
--- a/audit/cmd/bee/main_test.go
+++ b/audit/cmd/bee/main_test.go
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
 }

 func TestRunVersion(t *testing.T) {
-	t.Parallel()
-
 	old := Version
 	Version = "test-version"
 	t.Cleanup(func() { Version = old })
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
 	}
 }

+func TestBuildLabelUsesVersionAsIs(t *testing.T) { // checks that buildLabel returns a non-blank Version unchanged; no t.Parallel because it mutates Version
+	old := Version // Version is package-level state, so save it before overriding
+	Version = "1.2.3"
+	t.Cleanup(func() { Version = old }) // restore the original value when the test ends
+
+	if got := buildLabel(); got != "1.2.3" {
+		t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
+	}
+}
+
 func TestRunExportRequiresTarget(t *testing.T) {
 	t.Parallel()

--- a/audit/go.mod
+++ b/audit/go.mod
@@ -1,28 +1,26 @@
 module bee/audit

-go 1.24.0
+go 1.25.0

 replace reanimator/chart => ../internal/chart

-require github.com/charmbracelet/bubbletea v1.3.4
-require github.com/charmbracelet/lipgloss v1.0.0
-require reanimator/chart v0.0.0
+require (
+	github.com/go-analyze/charts v0.5.26
+	reanimator/chart v0.0.0-00010101000000-000000000000
+)

 require (
-	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
-	github.com/charmbracelet/lipgloss v1.0.0 // promoted to direct — used for TUI colors
-	github.com/charmbracelet/x/ansi v0.8.0 // indirect
-	github.com/charmbracelet/x/term v0.2.1 // indirect
-	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
-	github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
+	github.com/dustin/go-humanize v1.0.1 // indirect
+	github.com/go-analyze/bulk v0.1.3 // indirect
+	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
+	github.com/google/uuid v1.6.0 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
-	github.com/mattn/go-localereader v0.0.1 // indirect
-	github.com/mattn/go-runewidth v0.0.16 // indirect
-	github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
-	github.com/muesli/cancelreader v0.2.2 // indirect
-	github.com/muesli/termenv v0.15.2 // indirect
-	github.com/rivo/uniseg v0.4.7 // indirect
-	golang.org/x/sync v0.11.0 // indirect
-	golang.org/x/sys v0.30.0 // indirect
-	golang.org/x/text v0.3.8 // indirect
+	github.com/ncruces/go-strftime v1.0.0 // indirect
+	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
+	golang.org/x/image v0.24.0 // indirect
+	golang.org/x/sys v0.42.0 // indirect
+	modernc.org/libc v1.70.0 // indirect
+	modernc.org/mathutil v1.7.1 // indirect
+	modernc.org/memory v1.11.0 // indirect
+	modernc.org/sqlite v1.48.0 // indirect
 )
--- a/audit/go.sum
+++ b/audit/go.sum
@@ -1,37 +1,37 @@
-github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
-github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
-github.com/charmbracelet/bubbletea v1.3.4 h1:kCg7B+jSCFPLYRA52SDZjr51kG/fMUEoPoZrkaDHyoI=
-github.com/charmbracelet/bubbletea v1.3.4/go.mod h1:dtcUCyCGEX3g9tosuYiut3MXgY/Jsv9nKVdibKKRRXo=
-github.com/charmbracelet/lipgloss v1.0.0 h1:O7VkGDvqEdGi93X+DeqsQ7PKHDgtQfF8j8/O2qFMQNg=
-github.com/charmbracelet/lipgloss v1.0.0/go.mod h1:U5fy9Z+C38obMs+T+tJqst9VGzlOYGj4ri9reL3qUlo=
-github.com/charmbracelet/x/ansi v0.8.0 h1:9GTq3xq9caJW8ZrBTe0LIe2fvfLR/bYXKTx2llXn7xE=
-github.com/charmbracelet/x/ansi v0.8.0/go.mod h1:wdYl/ONOLHLIVmQaxbIYEC/cRKOQyjTkowiI4blgS9Q=
-github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ=
-github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
-github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
-github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
-github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
-github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
+github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
+github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
+github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
+github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
+github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
-github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
-github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
-github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
-github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
-github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
-github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
-github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
-github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
-github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo=
-github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8=
-github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
-github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
-github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
-golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
-golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
-golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
+github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
+github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
+github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
+github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
+golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
+golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
-golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY=
-golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
+golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
+golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
+modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
+modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
+modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
+modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
+modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
+modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
+modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -19,26 +19,30 @@ import (
 )

 var (
-	DefaultExportDir       = "/appdata/bee/export"
-	DefaultAuditJSONPath   = DefaultExportDir + "/bee-audit.json"
-	DefaultAuditLogPath    = DefaultExportDir + "/bee-audit.log"
-	DefaultWebLogPath      = DefaultExportDir + "/bee-web.log"
-	DefaultNetworkLogPath  = DefaultExportDir + "/bee-network.log"
-	DefaultNvidiaLogPath   = DefaultExportDir + "/bee-nvidia.log"
-	DefaultSSHLogPath      = DefaultExportDir + "/bee-sshsetup.log"
-	DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
-	DefaultRuntimeLogPath  = DefaultExportDir + "/runtime-health.log"
-	DefaultTechDumpDir     = DefaultExportDir + "/techdump"
-	DefaultSATBaseDir      = DefaultExportDir + "/bee-sat"
+	DefaultExportDir        = "/appdata/bee/export"
+	DefaultAuditJSONPath    = DefaultExportDir + "/bee-audit.json"
+	DefaultAuditLogPath     = DefaultExportDir + "/bee-audit.log"
+	DefaultWebLogPath       = DefaultExportDir + "/bee-web.log"
+	DefaultNetworkLogPath   = DefaultExportDir + "/bee-network.log"
+	DefaultNvidiaLogPath    = DefaultExportDir + "/bee-nvidia.log"
+	DefaultSSHLogPath       = DefaultExportDir + "/bee-sshsetup.log"
+	DefaultRuntimeJSONPath  = DefaultExportDir + "/runtime-health.json"
+	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
+	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
+	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
+	DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
 )

 type App struct {
-	network  networkManager
-	services serviceManager
-	exports  exportManager
-	tools    toolManager
-	sat      satRunner
-	runtime  runtimeChecker
+	network   networkManager
+	services  serviceManager
+	exports   exportManager
+	tools     toolManager
+	sat       satRunner
+	runtime   runtimeChecker
+	installer installer
+	// StatusDB is the unified component health store (nil if unavailable).
+	StatusDB *ComponentStatusDB
 }

 type ActionResult struct {
@@ -52,10 +56,15 @@ type networkManager interface {
 	DHCPOne(iface string) (string, error)
 	DHCPAll() (string, error)
 	SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
+	SetInterfaceState(iface string, up bool) error
+	GetInterfaceState(iface string) (bool, error)
+	CaptureNetworkSnapshot() (platform.NetworkSnapshot, error)
+	RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error
 }

 type serviceManager interface {
 	ListBeeServices() ([]string, error)
+	ServiceState(name string) string
 	ServiceStatus(name string) (string, error)
 	ServiceDo(name string, action platform.ServiceAction) (string, error)
 }
@@ -70,17 +79,66 @@ type toolManager interface {
 	CheckTools(names []string) []platform.ToolStatus
 }

+type installer interface { // installer is the install / live-media backend held in App.installer; New assigns the *platform.System to it.
+	ListInstallDisks() ([]platform.InstallDisk, error)
+	InstallToDisk(ctx context.Context, device string, logFile string) error
+	IsLiveMediaInRAM() bool                  // exposed through App.IsLiveMediaInRAM
+	LiveBootSource() platform.LiveBootSource // exposed through App.LiveBootSource
+	RunInstallToRAM(ctx context.Context, logFunc func(string)) error // exposed through App.RunInstallToRAM
+}
+
+type GPUPresenceResult struct { // GPUPresenceResult is the value returned by App.DetectGPUPresence.
+	Nvidia bool // set when the detected vendor string is "nvidia"
+	AMD    bool // set when the detected vendor string is "amd"
+}
+
+func (a *App) DetectGPUPresence() GPUPresenceResult { // DetectGPUPresence turns the sat runner's vendor string into per-vendor flags.
+	vendor := a.sat.DetectGPUVendor() // a single string, so at most one of the flags below can be true
+	return GPUPresenceResult{
+		Nvidia: vendor == "nvidia",
+		AMD:    vendor == "amd", // any other vendor value leaves both flags false
+	}
+}
+
+func (a *App) IsLiveMediaInRAM() bool { // IsLiveMediaInRAM forwards to the installer backend.
+	return a.installer.IsLiveMediaInRAM()
+}
+
+func (a *App) LiveBootSource() platform.LiveBootSource { // LiveBootSource forwards to the installer backend.
+	return a.installer.LiveBootSource()
+}
+
+func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error { // RunInstallToRAM forwards ctx and logFunc unchanged to the installer backend.
+	return a.installer.RunInstallToRAM(ctx, logFunc)
+}
+
 type satRunner interface {
-	RunNvidiaAcceptancePack(baseDir string) (string, error)
-	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error)
-	RunMemoryAcceptancePack(baseDir string) (string, error)
-	RunStorageAcceptancePack(baseDir string) (string, error)
-	RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
+	RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
+	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
+	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
+	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
+	ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
+	ResetNvidiaGPU(index int) (string, error)
+	RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
+	RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
+	RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
 	DetectGPUVendor() string
 	ListAMDGPUs() ([]platform.AMDGPUInfo, error)
-	RunAMDAcceptancePack(baseDir string) (string, error)
+	RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
+	RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
+	RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
+	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
+	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 }

 type runtimeChecker interface {
@@ -89,14 +147,39 @@ type runtimeChecker interface {
 }

 func New(platform *platform.System) *App {
-	return &App{
-		network:  platform,
-		services: platform,
-		exports:  platform,
-		tools:    platform,
-		sat:      platform,
-		runtime:  platform,
+	a := &App{ // every role below is served by the same *platform.System value
+		network:   platform,
+		services:  platform,
+		exports:   platform,
+		tools:     platform,
+		sat:       platform,
+		runtime:   platform,
+		installer: platform,
 	}
+	if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil { // best effort: on error StatusDB stays nil and the error is dropped
+		a.StatusDB = db
+	}
+	return a
+}
+
+// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
+// and returns the updated JSON. Used by the web UI to serve always-fresh status.
+func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
+	snap, err := readAuditSnapshot(auditJSON) // decodes and normalizes; a decode error is returned unchanged
+	if err != nil {
+		return nil, err
+	}
+	applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil) // nil status DB here, whereas RunAudit passes a.StatusDB
+	return json.MarshalIndent(snap, "", "  ") // re-encoded with two-space indentation
+}
+
+func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) { // readAuditSnapshot decodes an audit snapshot and normalizes its hardware section.
+	var snap schema.HardwareIngestRequest
+	if err := json.Unmarshal(auditJSON, &snap); err != nil {
+		return schema.HardwareIngestRequest{}, err // zero value on failure, never a partially decoded snapshot
+	}
+	collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt) // receives a pointer; presumably rewrites snap.Hardware in place (TODO confirm)
+	return snap, nil
 }

 func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
@@ -106,7 +189,8 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		}
 	}
 	result := collector.Run(runtimeMode)
-	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
+	writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
@@ -121,10 +205,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
-			return "", err
-		}
-		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -149,10 +230,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
-			return "", err
-		}
-		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -222,6 +300,9 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
 	if err != nil {
 		return "", err
 	}
+	if normalized, normErr := ApplySATOverlay(data); normErr == nil {
+		data = normalized
+	}
 	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
 		return "", err
 	}
@@ -231,8 +312,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)

 func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
 	path, err := a.ExportLatestAudit(target)
-	body := "Audit exported."
-	if path != "" {
+	body := "Audit export failed."
+	if err == nil {
+		body = "Audit exported."
+	}
+	if err == nil && path != "" {
 		body = "Audit exported to " + path
 	}
 	return ActionResult{Title: "Export audit", Body: body}, err
@@ -249,8 +333,11 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro

 func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
 	path, err := a.ExportSupportBundle(target)
-	body := "Support bundle exported. USB target unmounted and safe to remove."
-	if path != "" {
+	body := "Support bundle export failed."
+	if err == nil {
+		body = "Support bundle exported. USB target unmounted and safe to remove."
+	}
+	if err == nil && path != "" {
 		body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
 	}
 	return ActionResult{Title: "Export support bundle", Body: body}, err
@@ -286,6 +373,22 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
 	return a.network.SetStaticIPv4(cfg)
 }

+// SetInterfaceState forwards iface and the requested up flag to the network
+// backend and returns its error unchanged.
+func (a *App) SetInterfaceState(iface string, up bool) error {
+	return a.network.SetInterfaceState(iface, up)
+}
+
+// GetInterfaceState returns whatever state flag and error the network backend
+// reports for iface.
+func (a *App) GetInterfaceState(iface string) (bool, error) {
+	return a.network.GetInterfaceState(iface)
+}
+
+// CaptureNetworkSnapshot returns the snapshot produced by the network backend,
+// together with its error.
+func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
+	return a.network.CaptureNetworkSnapshot()
+}
+
+// RestoreNetworkSnapshot hands snapshot to the network backend's
+// RestoreNetworkSnapshot and returns its error unchanged.
+func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
+	return a.network.RestoreNetworkSnapshot(snapshot)
+}
+
 func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
 	body, err := a.network.SetStaticIPv4(cfg)
 	return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
@@ -342,6 +445,10 @@ func (a *App) ListBeeServices() ([]string, error) {
 	return a.services.ListBeeServices()
 }

+// ServiceState returns the state string the services backend reports for name.
+// Unlike ServiceStatus it has no error return.
+func (a *App) ServiceState(name string) string {
+	return a.services.ServiceState(name)
+}
+
 func (a *App) ServiceStatus(name string) (string, error) {
 	return a.services.ServiceStatus(name)
 }
@@ -397,15 +504,15 @@ func (a *App) AuditLogTailResult() ActionResult {
 	return ActionResult{Title: "Audit log tail", Body: body}
 }

-func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) {
+func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunNvidiaAcceptancePack(baseDir)
+	return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
 }

 func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
-	path, err := a.RunNvidiaAcceptancePack(baseDir)
+	path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
 	body := "Archive written."
 	if path != "" {
 		body = "Archive written to " + path
@@ -417,58 +524,129 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return a.sat.ListNvidiaGPUs()
 }

-func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (ActionResult, error) {
+// ListNvidiaGPUStatuses returns the per-GPU status list reported by the SAT
+// backend, together with its error.
+func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
+	return a.sat.ListNvidiaGPUStatuses()
+}
+
+// ResetNvidiaGPU asks the SAT backend to reset the GPU with the given index and
+// wraps the outcome in an ActionResult titled "Reset NVIDIA GPU <index>". The
+// body is the backend output with surrounding whitespace trimmed. The backend
+// error is returned alongside the result, so Body is populated from the output
+// even when the error is non-nil.
+func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
+	out, err := a.sat.ResetNvidiaGPU(index)
+	return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
+}
+
+func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, durationSec, sizeMB, gpuIndices)
+	path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
 	body := "Archive written."
 	if path != "" {
 		body = "Archive written to " + path
 	}
-	// Include terminal chart if available (runDir = archive path without .tar.gz).
-	if path != "" {
-		termPath := filepath.Join(strings.TrimSuffix(path, ".tar.gz"), "gpu-metrics-term.txt")
-		if chart, readErr := os.ReadFile(termPath); readErr == nil && len(chart) > 0 {
-			body += "\n\n" + string(chart)
-		}
-	}
-	return ActionResult{Title: "NVIDIA SAT", Body: body}, err
+	return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
 }

-func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
+func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunMemoryAcceptancePack(baseDir)
+	return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+}
+
+// RunNvidiaStressPack is RunNvidiaStressPackCtx with context.Background(), so
+// the caller has no way to cancel the run through a context.
+func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
+	return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
+}
+
+// RunNvidiaBenchmark is RunNvidiaBenchmarkCtx with context.Background(), so the
+// caller has no way to cancel the run through a context.
+func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
+}
+
+// RunNvidiaBenchmarkCtx runs the NVIDIA benchmark through the SAT backend.
+// A blank (empty or whitespace-only) baseDir is replaced with
+// DefaultBenchmarkBaseDir; note that this is a different default from the
+// DefaultSATBaseDir used by the SAT pack wrappers. ctx, opts and logFunc are
+// passed through, and the backend's result and error are returned unchanged.
+func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultBenchmarkBaseDir
+	}
+	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
+}
+
+// RunNvidiaOfficialComputePack runs the backend pack of the same name.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir.
+// ctx, durationSec, gpuIndices, staggerSec and logFunc are passed through, and
+// the backend's result and error are returned unchanged.
+func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
+}
+
+// RunNvidiaTargetedPowerPack runs the backend pack of the same name.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir.
+// ctx, durationSec, gpuIndices and logFunc are passed through, and the
+// backend's result and error are returned unchanged.
+func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+}
+
+// RunNvidiaPulseTestPack runs the backend pack of the same name.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir.
+// ctx, durationSec, gpuIndices and logFunc are passed through, and the
+// backend's result and error are returned unchanged.
+func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+}
+
+// RunNvidiaBandwidthPack runs the backend pack of the same name.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir.
+// ctx, gpuIndices and logFunc are passed through, and the backend's result and
+// error are returned unchanged.
+func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
+}
+
+// RunNvidiaStressPackCtx runs the backend's RunNvidiaStressPack.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir.
+// ctx, opts and logFunc are passed through, and the backend's result and error
+// are returned unchanged.
+func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
+}
+
+// RunMemoryAcceptancePack is RunMemoryAcceptancePackCtx with
+// context.Background() and fixed arguments sizeMB=256 and passes=1.
+func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
+}
+
+func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
 }

 func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
-	path, err := a.RunMemoryAcceptancePack(baseDir)
+	path, err := a.RunMemoryAcceptancePack(baseDir, nil)
 	return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
 }

-func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
+// RunCPUAcceptancePack is RunCPUAcceptancePackCtx with context.Background(), so
+// the caller has no way to cancel the run through a context.
+func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
+}
+
+func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunCPUAcceptancePack(baseDir, durationSec)
+	return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
 }

 func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
-	path, err := a.RunCPUAcceptancePack(baseDir, durationSec)
+	path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
 	return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
 }

-func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
+// RunStorageAcceptancePack is RunStorageAcceptancePackCtx with
+// context.Background() and extended=false.
+func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
+}
+
+func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunStorageAcceptancePack(baseDir)
+	return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
 }

 func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
-	path, err := a.RunStorageAcceptancePack(baseDir)
+	path, err := a.RunStorageAcceptancePack(baseDir, nil)
 	return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
 }

@@ -480,18 +658,63 @@ func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
 	return a.sat.ListAMDGPUs()
 }

-func (a *App) RunAMDAcceptancePack(baseDir string) (string, error) {
+// RunAMDAcceptancePack is RunAMDAcceptancePackCtx with context.Background(), so
+// the caller has no way to cancel the run through a context.
+func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
+	return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
+}
+
+func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunAMDAcceptancePack(baseDir)
+	return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
 }

 func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
-	path, err := a.RunAMDAcceptancePack(baseDir)
+	path, err := a.RunAMDAcceptancePack(baseDir, nil)
 	return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
 }

+// RunAMDMemIntegrityPackCtx runs the backend's RunAMDMemIntegrityPack.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir.
+// ctx and logFunc are passed through, and the backend's result and error are
+// returned unchanged.
+func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
+}
+
+// RunAMDMemBandwidthPackCtx runs the backend's RunAMDMemBandwidthPack.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir.
+// ctx and logFunc are passed through, and the backend's result and error are
+// returned unchanged.
+func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
+}
+
+// RunMemoryStressPack is RunMemoryStressPackCtx with context.Background(), so
+// the caller has no way to cancel the run through a context.
+func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
+}
+
+// RunSATStressPack is RunSATStressPackCtx with context.Background(), so the
+// caller has no way to cancel the run through a context.
+func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
+}
+
+// RunAMDStressPack is RunAMDStressPackCtx with context.Background(), so the
+// caller has no way to cancel the run through a context.
+func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
+}
+
+// RunMemoryStressPackCtx runs the backend's RunMemoryStressPack.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir,
+// the same rule the other SAT pack wrappers on App apply, so an empty argument
+// never reaches the backend. ctx, durationSec and logFunc are passed through,
+// and the backend's result and error are returned unchanged.
+func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunSATStressPackCtx runs the backend's RunSATStressPack.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir,
+// the same rule the other SAT pack wrappers on App apply, so an empty argument
+// never reaches the backend. ctx, durationSec and logFunc are passed through,
+// and the backend's result and error are returned unchanged.
+func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
+}
+
+// RunAMDStressPackCtx runs the backend's RunAMDStressPack.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir.
+// ctx, durationSec and logFunc are passed through, and the backend's result and
+// error are returned unchanged.
+func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
+}
+
 func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -499,6 +722,22 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
 	return a.sat.RunFanStressTest(ctx, baseDir, opts)
 }

+// RunPlatformStress runs the backend's RunPlatformStress.
+// A blank (empty or whitespace-only) baseDir falls back to DefaultSATBaseDir.
+// ctx, opts and logFunc are passed through, and the backend's result and error
+// are returned unchanged.
+func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
+}
+
+// RunNCCLTestsResult runs the backend's NCCL tests under DefaultSATBaseDir with
+// a nil log callback and wraps the outcome in an ActionResult titled
+// "NCCL bandwidth test". The body always starts with "Results: " followed by
+// the returned path. Any error other than context.Canceled is appended on an
+// "ERROR: " line. The error itself is returned unchanged in every case.
+//
+// NOTE(review): the cancellation check compares with != rather than errors.Is,
+// so a wrapped context.Canceled would still produce an ERROR line. Confirm
+// whether the backend can return a wrapped cancellation.
+// NOTE(review): path is not checked for emptiness, so a failed run may render a
+// bare "Results: " prefix. Confirm whether the backend returns a path on error.
+func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
+	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
+	body := "Results: " + path
+	if err != nil && err != context.Canceled {
+		body += "\nERROR: " + err.Error()
+	}
+	return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
+}
+
 func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
 	path, err := a.RunFanStressTest(ctx, "", opts)
 	body := formatFanStressResult(path)
@@ -576,6 +815,7 @@ func (a *App) HealthSummaryResult() ActionResult {
 	if err := json.Unmarshal(raw, &snapshot); err != nil {
 		return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
 	}
+	collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)

 	summary := collector.BuildHealthSummary(snapshot.Hardware)
 	var body strings.Builder
@@ -610,6 +850,7 @@ func (a *App) MainBanner() string {
 	if err := json.Unmarshal(raw, &snapshot); err != nil {
 		return ""
 	}
+	collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)

 	var lines []string
 	if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
@@ -686,6 +927,41 @@ func bodyOr(body, fallback string) string {
 	return body
 }

+// writePSUStatusesToDB copies the PSU statuses gathered by the audit into the
+// component-status DB, where the Hardware Summary card reads them. The audit
+// takes PSU status from IPMI (ipmitool fru + sdr), hence the "audit:ipmi"
+// source tag.
+//
+// Each PSU that carries a status gets its own "psu:<slot>" record ("psu:?" when
+// the slot is unknown), with the error description as detail when present.
+// PSUs without a status are skipped. A final "psu:all" record holds the worst
+// status seen, ranked Critical > Warning > OK. It stays "OK" when no PSU
+// reported Critical or Warning, including when none reported a status at all.
+//
+// A nil db or an empty psus slice makes the call a no-op.
+func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
+	if db == nil || len(psus) == 0 {
+		return
+	}
+	const source = "audit:ipmi"
+	aggregate := "OK"
+	for i := range psus {
+		supply := &psus[i]
+		if supply.Status == nil {
+			continue
+		}
+		slot, detail := "?", ""
+		if supply.Slot != nil {
+			slot = *supply.Slot
+		}
+		if supply.ErrorDescription != nil {
+			detail = *supply.ErrorDescription
+		}
+		status := *supply.Status
+		db.Record("psu:"+slot, source, status, detail)
+		// Escalate only: Critical always wins, Warning only replaces OK.
+		switch {
+		case status == "Critical":
+			aggregate = "Critical"
+		case status == "Warning" && aggregate == "OK":
+			aggregate = "Warning"
+		}
+	}
+	db.Record("psu:all", source, aggregate, "")
+}
+
 func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
 	raw, err := os.ReadFile(path)
 	if err != nil {
@@ -704,6 +980,12 @@ func latestSATSummaries() []string {
 		prefix string
 	}{
 		{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
+		{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
+		{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
+		{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
+		{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
+		{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
+		{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
 		{label: "Memory SAT", prefix: "memory-"},
 		{label: "Storage SAT", prefix: "storage-"},
 		{label: "CPU SAT", prefix: "cpu-"},
@@ -994,3 +1276,70 @@ func firstNonEmpty(values ...string) string {
 	}
 	return ""
 }
+
+// ListInstallDisks returns the disk list reported by the installer backend,
+// together with its error.
+func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
+	return a.installer.ListInstallDisks()
+}
+
+// InstallToDisk forwards ctx, device and logFile to the installer backend and
+// returns its error unchanged.
+func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
+	return a.installer.InstallToDisk(ctx, device, logFile)
+}
+
+// formatSATDetail renders a raw key=value SAT summary as a short report.
+//
+// Layout, in order:
+//   - "Run: <run_at_utc>" followed by a blank line, when that key is present;
+//   - one line per step, in order of first appearance of a "<step>_status="
+//     line in raw ("overall" is not a step), tagged PASS for OK, FAIL for
+//     FAILED, SKIP for UNSUPPORTED and "?" for anything else;
+//   - "Overall: <overall_status>  (ok=<job_ok>  failed=<job_failed>)" when
+//     overall_status is present.
+//
+// Step names are passed through cleanSummaryKey for display, and the finished
+// text has leading and trailing whitespace trimmed.
+func formatSATDetail(raw string) string {
+	summary := parseKeyValueSummary(raw)
+	var out strings.Builder
+
+	if runAt, found := summary["run_at_utc"]; found {
+		fmt.Fprintf(&out, "Run: %s\n\n", runAt)
+	}
+
+	// Steps are discovered from the raw text, in order of first appearance;
+	// the status value itself still comes from the parsed summary.
+	printed := map[string]bool{}
+	for _, line := range strings.Split(raw, "\n") {
+		cut := strings.Index(line, "_status=")
+		if cut < 0 {
+			continue
+		}
+		step := line[:cut]
+		if step == "overall" || printed[step] {
+			continue
+		}
+		printed[step] = true
+
+		layout := "?     %s\n"
+		switch summary[step+"_status"] {
+		case "OK":
+			layout = "PASS  %s\n"
+		case "FAILED":
+			layout = "FAIL  %s\n"
+		case "UNSUPPORTED":
+			layout = "SKIP  %s\n"
+		}
+		fmt.Fprintf(&out, layout, cleanSummaryKey(step))
+	}
+
+	if overall, found := summary["overall_status"]; found {
+		fmt.Fprintf(&out, "\nOverall: %s  (ok=%s  failed=%s)", overall, summary["job_ok"], summary["job_failed"])
+	}
+
+	return strings.TrimSpace(out.String())
+}
+
+// cleanSummaryKey drops a leading numeric ordinal from a summary key, so
+// "01-foo" becomes "foo". The key is returned unchanged when it has no dash,
+// starts with a dash, or has any non-digit before its first dash.
+func cleanSummaryKey(key string) string {
+	dash := strings.IndexByte(key, '-')
+	if dash < 1 {
+		return key
+	}
+	for i := 0; i < dash; i++ {
+		if b := key[i]; b < '0' || b > '9' {
+			return key
+		}
+	}
+	return key[dash+1:]
+}
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -43,6 +43,13 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error
 	return f.setStaticIPv4Fn(cfg)
 }

+// SetInterfaceState is a stub that ignores its arguments and always succeeds.
+func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
+// GetInterfaceState is a stub that always returns true with a nil error.
+func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
+// CaptureNetworkSnapshot is a stub that returns a zero-value snapshot and a
+// nil error.
+func (f fakeNetwork) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
+	return platform.NetworkSnapshot{}, nil
+}
+// RestoreNetworkSnapshot is a stub that ignores the snapshot and always succeeds.
+func (f fakeNetwork) RestoreNetworkSnapshot(platform.NetworkSnapshot) error { return nil }
+
 type fakeServices struct {
 	serviceStatusFn func(string) (string, error)
 	serviceDoFn     func(string, platform.ServiceAction) (string, error)
@@ -52,6 +59,10 @@ func (f fakeServices) ListBeeServices() ([]string, error) {
 	return nil, nil
 }

+// ServiceState is a stub that reports "active" for every service name.
+func (f fakeServices) ServiceState(name string) string {
+	return "active"
+}
+
 func (f fakeServices) ServiceStatus(name string) (string, error) {
 	return f.serviceStatusFn(name)
 }
@@ -109,21 +120,79 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 }

 type fakeSAT struct {
-	runNvidiaFn      func(string) (string, error)
-	runMemoryFn      func(string) (string, error)
-	runStorageFn     func(string) (string, error)
-	runCPUFn         func(string, int) (string, error)
-	detectVendorFn   func() string
-	listAMDGPUsFn    func() ([]platform.AMDGPUInfo, error)
-	runAMDPackFn     func(string) (string, error)
-	listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
+	runNvidiaFn               func(string) (string, error)
+	runNvidiaBenchmarkFn      func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaStressFn         func(string, platform.NvidiaStressOptions) (string, error)
+	runNvidiaComputeFn        func(string, int, []int) (string, error)
+	runNvidiaPowerFn          func(string, int, []int) (string, error)
+	runNvidiaPulseFn          func(string, int, []int) (string, error)
+	runNvidiaBandwidthFn      func(string, []int) (string, error)
+	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
+	runMemoryFn               func(string) (string, error)
+	runStorageFn              func(string) (string, error)
+	runCPUFn                  func(string, int) (string, error)
+	detectVendorFn            func() string
+	listAMDGPUsFn             func() ([]platform.AMDGPUInfo, error)
+	runAMDPackFn              func(string) (string, error)
+	listNvidiaGPUsFn          func() ([]platform.NvidiaGPU, error)
+	listNvidiaGPUStatusesFn   func() ([]platform.NvidiaGPUStatus, error)
+	resetNvidiaGPUFn          func(int) (string, error)
 }

-func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
+func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
 	return f.runNvidiaFn(baseDir)
 }

-func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ int, _ []int) (string, error) {
+// RunNvidiaAcceptancePackWithOptions routes to runNvidiaFn with baseDir only;
+// the context, the int option, the GPU indices and the log callback are
+// ignored. runNvidiaFn must be set, since calling a nil func panics.
+func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int, _ func(string)) (string, error) {
+	return f.runNvidiaFn(baseDir)
+}
+
+// RunNvidiaBenchmark calls runNvidiaBenchmarkFn with baseDir and opts when that
+// hook is set; otherwise it falls back to runNvidiaFn(baseDir), which must then
+// be non-nil. The context and log callback are ignored.
+func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
+	if f.runNvidiaBenchmarkFn != nil {
+		return f.runNvidiaBenchmarkFn(baseDir, opts)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
+// RunNvidiaTargetedStressValidatePack calls runNvidiaTargetedStressFn with
+// baseDir, durationSec and gpuIndices when that hook is set; otherwise it falls
+// back to runNvidiaFn(baseDir), which must then be non-nil. The context and log
+// callback are ignored.
+func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNvidiaTargetedStressFn != nil {
+		return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
+// RunNvidiaOfficialComputePack calls runNvidiaComputeFn with baseDir,
+// durationSec and gpuIndices when that hook is set; otherwise it falls back to
+// runNvidiaFn(baseDir), which must then be non-nil. The context, the stagger
+// argument and the log callback are ignored.
+func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
+	if f.runNvidiaComputeFn != nil {
+		return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
+// RunNvidiaTargetedPowerPack calls runNvidiaPowerFn with baseDir, durationSec
+// and gpuIndices when that hook is set; otherwise it falls back to
+// runNvidiaFn(baseDir), which must then be non-nil. The context and log
+// callback are ignored.
+func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNvidiaPowerFn != nil {
+		return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
+// RunNvidiaPulseTestPack calls runNvidiaPulseFn with baseDir, durationSec and
+// gpuIndices when that hook is set; otherwise it falls back to
+// runNvidiaFn(baseDir), which must then be non-nil. The context and log
+// callback are ignored.
+func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNvidiaPulseFn != nil {
+		return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
+// RunNvidiaBandwidthPack calls runNvidiaBandwidthFn with baseDir and gpuIndices
+// when that hook is set; otherwise it falls back to runNvidiaFn(baseDir), which
+// must then be non-nil. The context and log callback are ignored.
+func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNvidiaBandwidthFn != nil {
+		return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
+func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
+	if f.runNvidiaStressFn != nil {
+		return f.runNvidiaStressFn(baseDir, opts)
+	}
 	return f.runNvidiaFn(baseDir)
 }

@@ -134,15 +203,29 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return nil, nil
 }

-func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
+// ListNvidiaGPUStatuses returns the result of listNvidiaGPUStatusesFn when that
+// hook is set, and (nil, nil) otherwise.
+func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
+	if f.listNvidiaGPUStatusesFn != nil {
+		return f.listNvidiaGPUStatusesFn()
+	}
+	return nil, nil
+}
+
+// ResetNvidiaGPU returns the result of resetNvidiaGPUFn(index) when that hook
+// is set, and an empty output with a nil error otherwise.
+func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
+	if f.resetNvidiaGPUFn != nil {
+		return f.resetNvidiaGPUFn(index)
+	}
+	return "", nil
+}
+
+func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
 	return f.runMemoryFn(baseDir)
 }

-func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
+func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
 	return f.runStorageFn(baseDir)
 }

-func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
+func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
 	if f.runCPUFn != nil {
 		return f.runCPUFn(baseDir, durationSec)
 	}
@@ -163,17 +246,43 @@ func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
 	return nil, nil
 }

-func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
+func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
 	if f.runAMDPackFn != nil {
 		return f.runAMDPackFn(baseDir)
 	}
 	return "", nil
 }

+// RunAMDMemIntegrityPack is a stub: it ignores every argument and returns an
+// empty string with a nil error.
+func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
+	return "", nil
+}
+
+// RunAMDMemBandwidthPack is a stub: it ignores every argument and returns an
+// empty string with a nil error.
+func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
+	return "", nil
+}
+
+// RunAMDStressPack is a stub: it ignores every argument and returns an empty
+// string with a nil error.
+func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
+	return "", nil
+}
+// RunMemoryStressPack is a stub: it ignores every argument and returns an empty
+// string with a nil error.
+func (f fakeSAT) RunMemoryStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
+	return "", nil
+}
+// RunSATStressPack is a stub: it ignores every argument and returns an empty
+// string with a nil error.
+func (f fakeSAT) RunSATStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
+	return "", nil
+}
+
 func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
 	return "", nil
 }

+// RunPlatformStress is a stub: it ignores every argument and returns an empty
+// string with a nil error.
+func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
+	return "", nil
+}
+
+// RunNCCLTests is a stub: it ignores every argument and returns an empty string
+// with a nil error.
+func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
+	return "", nil
+}
+
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()

@@ -433,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
 }

 func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
-	t.Parallel()
-
 	tmp := t.TempDir()
 	oldExportDir := DefaultExportDir
 	DefaultExportDir = tmp
@@ -470,6 +577,39 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
 	}
 }

+// TestExportSupportBundleResultDoesNotPretendSuccessOnError checks that when
+// the export backend fails, ExportSupportBundleResult returns the error and a
+// body of exactly "Support bundle export failed." rather than a success
+// message.
+//
+// The test does not call t.Parallel: it swaps the package-level
+// DefaultExportDir for the duration of the test.
+func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
+	tmp := t.TempDir()
+	oldExportDir := DefaultExportDir
+	DefaultExportDir = tmp
+	t.Cleanup(func() { DefaultExportDir = oldExportDir })
+
+	// Seed the export dir with an audit JSON and log file before exporting.
+	if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
+		t.Fatalf("write bee-audit.json: %v", err)
+	}
+	if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
+		t.Fatalf("write bee-audit.log: %v", err)
+	}
+
+	// The fake export backend always fails with a mount error.
+	a := &App{
+		exports: fakeExports{
+			exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
+				return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
+			},
+		},
+	}
+
+	result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
+	if err == nil {
+		t.Fatal("expected export error")
+	}
+	if contains(result.Body, "exported to") {
+		t.Fatalf("body should not claim success:\n%s", result.Body)
+	}
+	if result.Body != "Support bundle export failed." {
+		t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
+	}
+}
+
 func TestRunNvidiaAcceptancePackResult(t *testing.T) {
 	t.Parallel()

@@ -499,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
 }

 func TestRunSATDefaultsToExportDir(t *testing.T) {
-	t.Parallel()
-
 	oldSATBaseDir := DefaultSATBaseDir
 	DefaultSATBaseDir = "/tmp/export/bee-sat"
 	t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
@@ -531,13 +669,13 @@ func TestRunSATDefaultsToExportDir(t *testing.T) {
 		},
 	}

-	if _, err := a.RunNvidiaAcceptancePack(""); err != nil {
+	if _, err := a.RunNvidiaAcceptancePack("", nil); err != nil {
 		t.Fatal(err)
 	}
-	if _, err := a.RunMemoryAcceptancePack(""); err != nil {
+	if _, err := a.RunMemoryAcceptancePack("", nil); err != nil {
 		t.Fatal(err)
 	}
-	if _, err := a.RunStorageAcceptancePack(""); err != nil {
+	if _, err := a.RunStorageAcceptancePack("", nil); err != nil {
 		t.Fatal(err)
 	}
 }
@@ -580,13 +718,50 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
 	}
 }

+// TestApplySATOverlayFiltersIgnoredLegacyDevices feeds ApplySATOverlay an audit
+// JSON holding a "Virtual HDisk0" storage entry and a "Co-processor" PCIe
+// device next to a real disk and a real GPU, and checks that the first two are
+// gone from the output while the real devices remain.
+//
+// The test does not call t.Parallel: it swaps the package-level
+// DefaultSATBaseDir for the duration of the test.
+func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
+	tmp := t.TempDir()
+	oldSATBaseDir := DefaultSATBaseDir
+	// Point at a directory that is never created in this test.
+	DefaultSATBaseDir = filepath.Join(tmp, "sat")
+	t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
+
+	raw := `{
+	  "collected_at": "2026-03-15T10:00:00Z",
+	  "hardware": {
+	    "board": {"serial_number": "SRV123"},
+	    "storage": [
+	      {"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
+	      {"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
+	    ],
+	    "pcie_devices": [
+	      {"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
+	      {"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
+	    ]
+	  }
+	}`
+
+	got, err := ApplySATOverlay([]byte(raw))
+	if err != nil {
+		t.Fatalf("ApplySATOverlay error: %v", err)
+	}
+	text := string(got)
+	if contains(text, "Virtual HDisk0") {
+		t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
+	}
+	// NOTE(review): this substring has a space after the colon, so it only
+	// matches output written in that style; with compact JSON the check would
+	// pass vacuously. Confirm the output format of ApplySATOverlay.
+	if contains(text, "\"device_class\": \"Co-processor\"") {
+		t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
+	}
+	if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
+		t.Fatalf("overlaid audit should keep real devices:\n%s", text)
+	}
+}
+
 func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	tmp := t.TempDir()
 	exportDir := filepath.Join(tmp, "export")
 	if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
 		t.Fatal(err)
 	}
-	if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
+	if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
 		t.Fatal(err)
 	}
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
@@ -618,6 +793,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {

 	tr := tar.NewReader(gzr)
 	var names []string
+	var auditJSON string
 	for {
 		hdr, err := tr.Next()
 		if errors.Is(err, io.EOF) {
@@ -627,6 +803,36 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 			t.Fatalf("read tar entry: %v", err)
 		}
 		names = append(names, hdr.Name)
+		if contains(hdr.Name, "/export/bee-audit.json") {
+			body, err := io.ReadAll(tr)
+			if err != nil {
+				t.Fatalf("read audit entry: %v", err)
+			}
+			auditJSON = string(body)
+		}
+	}
+
+	for _, want := range []string{
+		"/system/ip-link.txt",
+		"/system/ip-link-stats.txt",
+		"/system/kernel-aer-nvidia.txt",
+		"/system/lspci-nvidia-bridges-vv.txt",
+		"/system/pcie-aer-sysfs.txt",
+		"/system/ethtool-info.txt",
+		"/system/ethtool-link.txt",
+		"/system/ethtool-module.txt",
+		"/system/mstflint-query.txt",
+	} {
+		var found bool
+		for _, name := range names {
+			if contains(name, want) {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Fatalf("support bundle missing %s, names=%v", want, names)
+		}
 	}

 	var foundRaw bool
@@ -641,6 +847,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if !foundRaw {
 		t.Fatalf("support bundle missing raw SAT log, names=%v", names)
 	}
+	if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
+		t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
+	}
+	if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
+		t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
+	}
 }

 func TestMainBanner(t *testing.T) {
@@ -654,6 +866,10 @@ func TestMainBanner(t *testing.T) {
 	product := "PowerEdge R760"
 	cpuModel := "Intel Xeon Gold 6430"
 	memoryType := "DDR5"
+	memorySerialA := "DIMM-A"
+	memorySerialB := "DIMM-B"
+	storageSerialA := "DISK-A"
+	storageSerialB := "DISK-B"
 	gpuClass := "VideoController"
 	gpuModel := "NVIDIA H100"

@@ -669,12 +885,12 @@ func TestMainBanner(t *testing.T) {
 				{Model: &cpuModel},
 			},
 			Memory: []schema.HardwareMemory{
-				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
-				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
+				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
+				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
 			},
 			Storage: []schema.HardwareStorage{
-				{Present: &trueValue, SizeGB: intPtr(3840)},
-				{Present: &trueValue, SizeGB: intPtr(3840)},
+				{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
+				{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
 			},
 			PCIeDevices: []schema.HardwarePCIeDevice{
 				{DeviceClass: &gpuClass, Model: &gpuModel},
--- a/audit/internal/app/atomic_write.go
+++ b/audit/internal/app/atomic_write.go
@@ -0,0 +1,48 @@
+package app
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+func atomicWriteFile(path string, data []byte, perm os.FileMode) error { // write data to path+".tmp", then rename it over path
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { // make sure the parent directory exists
+		return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
+	}
+
+	tmpPath := path + ".tmp" // fixed sibling name; concurrent writers of the same path would share it
+	f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
+	if err != nil {
+		return fmt.Errorf("open temp %s: %w", tmpPath, err)
+	}
+
+	success := false
+	defer func() {
+		_ = f.Close() // on the success path f is already closed; the resulting error is deliberately dropped
+		if !success {
+			_ = os.Remove(tmpPath) // best-effort cleanup of the temp file on any failure
+		}
+	}()
+
+	if _, err := f.Write(data); err != nil {
+		return fmt.Errorf("write temp %s: %w", tmpPath, err)
+	}
+	if err := f.Sync(); err != nil { // flush the temp file before it is renamed into place
+		return fmt.Errorf("sync temp %s: %w", tmpPath, err)
+	}
+	if err := f.Close(); err != nil {
+		return fmt.Errorf("close temp %s: %w", tmpPath, err)
+	}
+	if err := os.Rename(tmpPath, path); err != nil {
+		return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
+	}
+
+	if dir, err := os.Open(filepath.Dir(path)); err == nil { // best-effort directory sync; failures are ignored
+		_ = dir.Sync()
+		_ = dir.Close()
+	}
+
+	success = true
+	return nil
+}
--- a/audit/internal/app/atomic_write_test.go
+++ b/audit/internal/app/atomic_write_test.go
@@ -0,0 +1,71 @@
+package app
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"bee/audit/internal/schema"
+)
+
+func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) { // an existing target is replaced and no ".tmp" sibling remains
+	path := filepath.Join(t.TempDir(), "bee-audit.json")
+	if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil { // seed the target so the write is a replacement
+		t.Fatalf("seed file: %v", err)
+	}
+
+	if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
+		t.Fatalf("atomicWriteFile: %v", err)
+	}
+
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read final: %v", err)
+	}
+	if string(raw) != "new\n" {
+		t.Fatalf("final content=%q want %q", string(raw), "new\n")
+	}
+	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) { // the temp file must have been renamed away
+		t.Fatalf("tmp file should be absent after success, err=%v", err)
+	}
+}
+
+func TestRunRuntimePreflightWritesAtomically(t *testing.T) { // RunRuntimePreflight must write valid JSON to the target and leave no ".tmp" file
+	path := filepath.Join(t.TempDir(), "runtime-health.json")
+	a := &App{
+		runtime: fakeRuntime{
+			collectFn: func(exportDir string) (schema.RuntimeHealth, error) { // stub collector returning a fixed healthy result
+				return schema.RuntimeHealth{
+					Status:      "OK",
+					ExportDir:   exportDir,
+					DriverReady: true,
+					CUDAReady:   true,
+				}, nil
+			},
+		},
+	}
+
+	got, err := a.RunRuntimePreflight("file:" + path)
+	if err != nil {
+		t.Fatalf("RunRuntimePreflight: %v", err)
+	}
+	if got != path { // the returned path is expected without the "file:" prefix
+		t.Fatalf("path=%q want %q", got, path)
+	}
+	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
+		t.Fatalf("tmp file should be absent after success, err=%v", err)
+	}
+
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read runtime file: %v", err)
+	}
+	var health schema.RuntimeHealth
+	if err := json.Unmarshal(raw, &health); err != nil { // the written file must decode as RuntimeHealth
+		t.Fatalf("json unmarshal: %v", err)
+	}
+	if health.Status != "OK" {
+		t.Fatalf("status=%q want OK", health.Status)
+	}
+}
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -0,0 +1,268 @@
+package app
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+)
+
+// ComponentStatusDB is a JSON-file-backed store of hardware component health records.
+// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
+// Record only ever raises a component's status: once it is Warning or Critical, later OK
+// observations are appended to its history but do not lower the stored status.
+type ComponentStatusDB struct {
+	path    string                            // JSON file rewritten after every Record
+	mu      sync.Mutex                        // guards records
+	records map[string]*ComponentStatusRecord // keyed by ComponentKey
+}
+
+// ComponentStatusRecord holds the current status and the full observation history of one hardware component.
+type ComponentStatusRecord struct {
+	ComponentKey  string                 `json:"component_key"`
+	Status        string                 `json:"status"` // "OK", "Warning", "Critical", "Unknown"
+	LastCheckedAt time.Time              `json:"last_checked_at"`
+	LastChangedAt time.Time              `json:"last_changed_at"`
+	ErrorSummary  string                 `json:"error_summary,omitempty"`
+	History       []ComponentStatusEntry `json:"history"`
+}
+
+// ComponentStatusEntry is one observation (time, status, source, optional detail) appended to a component's history.
+type ComponentStatusEntry struct {
+	At     time.Time `json:"at"`
+	Status string    `json:"status"`
+	Source string    `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
+	Detail string    `json:"detail,omitempty"`
+}
+
+// OpenComponentStatusDB loads the JSON status DB at path, creating its parent directory; a missing file yields an empty DB.
+func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
+	db := &ComponentStatusDB{
+		path:    path,
+		records: make(map[string]*ComponentStatusRecord),
+	}
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return nil, err
+	}
+	data, err := os.ReadFile(path)
+	if err != nil && !os.IsNotExist(err) { // a missing file is not an error: the DB starts empty
+		return nil, err
+	}
+	if len(data) > 0 {
+		var records []ComponentStatusRecord
+		if err := json.Unmarshal(data, &records); err == nil { // NOTE(review): unparsable content is silently ignored and overwritten by the next Record
+			for i := range records {
+				db.records[records[i].ComponentKey] = &records[i] // later duplicates of a key win
+			}
+		}
+	}
+	return db, nil
+}
+
+// Record appends one observation for the given component key and rewrites the DB file.
+// source is a short label like "sat:nvidia" or "watchdog:kmsg".
+// status is "OK", "Warning", "Critical", or "Unknown".
+// The stored status only changes when the new one is more severe, or when none is set yet.
+func (db *ComponentStatusDB) Record(key, source, status, detail string) {
+	if db == nil || strings.TrimSpace(key) == "" { // nil receiver and blank keys are no-ops
+		return
+	}
+	db.mu.Lock()
+	defer db.mu.Unlock()
+
+	now := time.Now().UTC()
+	rec, exists := db.records[key]
+	if !exists {
+		rec = &ComponentStatusRecord{ComponentKey: key}
+		db.records[key] = rec
+	}
+	rec.LastCheckedAt = now
+
+	entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
+	rec.History = append(rec.History, entry) // every observation is kept; nothing here trims the history
+
+	// Status merge: OK never downgrades Warning/Critical.
+	newSev := componentSeverity(status)
+	curSev := componentSeverity(rec.Status)
+	if newSev > curSev {
+		rec.Status = status
+		rec.LastChangedAt = now
+		rec.ErrorSummary = detail
+	} else if rec.Status == "" { // no status yet and the new one ranks lowest (e.g. "Unknown"): stored without an error summary
+		rec.Status = status
+		rec.LastChangedAt = now
+	}
+
+	_ = db.saveLocked() // persistence is best-effort: a write failure is not reported to the caller
+}
+
+// Get returns a copy of the current record for a component key; the bool is false for a nil DB or an unknown key.
+func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
+	if db == nil {
+		return ComponentStatusRecord{}, false
+	}
+	db.mu.Lock()
+	defer db.mu.Unlock()
+	r, ok := db.records[key]
+	if !ok {
+		return ComponentStatusRecord{}, false
+	}
+	return *r, true // shallow copy: History still shares its backing array with the stored record
+}
+
+// All returns a snapshot of all records in map-iteration (unspecified) order; it returns nil for a nil DB.
+func (db *ComponentStatusDB) All() []ComponentStatusRecord {
+	if db == nil {
+		return nil
+	}
+	db.mu.Lock()
+	defer db.mu.Unlock()
+	out := make([]ComponentStatusRecord, 0, len(db.records))
+	for _, r := range db.records {
+		out = append(out, *r) // struct copy; History slices are shared with the stored records
+	}
+	return out
+}
+
+func (db *ComponentStatusDB) saveLocked() error { // serialises every record to db.path; Record calls it with db.mu held
+	records := make([]ComponentStatusRecord, 0, len(db.records))
+	for _, r := range db.records {
+		records = append(records, *r)
+	}
+	data, err := json.MarshalIndent(records, "", "  ")
+	if err != nil {
+		return err
+	}
+	return atomicWriteFile(db.path, data, 0644) // temp file + rename, so an interrupted save cannot leave a truncated DB
+}
+
+// componentSeverity ranks a whitespace-trimmed status so that higher values win when merging.
+func componentSeverity(status string) int {
+	switch strings.TrimSpace(status) {
+	case "Critical":
+		return 3
+	case "Warning":
+		return 2
+	case "OK":
+		return 1
+	default: // "Unknown", "" and any unrecognised value rank lowest
+		return 0
+	}
+}
+
+// ApplySATResultToDB reads summary.txt from the run directory derived from archivePath
+// (the path minus ".tar.gz") and records component statuses in db for the given SAT target.
+// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
+func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
+	if db == nil || strings.TrimSpace(archivePath) == "" {
+		return
+	}
+	archivePath = extractArchivePath(archivePath)
+	if archivePath == "" {
+		return
+	}
+	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
+	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
+	if err != nil { // no readable summary: nothing is recorded
+		return
+	}
+	kv := parseSATKV(string(data))
+	overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
+	if overall == "" {
+		return
+	}
+
+	source := "sat:" + target
+	dbStatus := satStatusToDBStatus(overall) // note: FAILED is stored as "Warning" by this mapping
+
+	// Map SAT target to component keys.
+	switch target { // targets not listed below record nothing
+	case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
+		"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
+		"amd-stress", "amd-mem", "amd-bandwidth":
+		db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall) // keyed by SAT target, not by a device address
+	case "memory", "memory-stress", "sat-stress":
+		db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
+	case "cpu", "platform-stress":
+		db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
+	case "storage":
+		// Try to record per-device if available in summary.
+		recordedAny := false
+		for key, val := range kv { // map order is random, so the order of recorded observations may vary between runs
+			if !strings.HasSuffix(key, "_status") || key == "overall_status" {
+				continue
+			}
+			base := strings.TrimSuffix(key, "_status")
+			idx := strings.Index(base, "_") // the device name is the text before the first "_" in the key
+			if idx <= 0 {
+				continue
+			}
+			devName := base[:idx]
+			devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
+			db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
+			recordedAny = true
+		}
+		if !recordedAny { // no per-device keys found: fall back to one record for all storage
+			db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
+		}
+	}
+}
+
+func satStatusToDBStatus(overall string) string {
+	// Only the two definite outcomes are translated; the comparison is exact,
+	// so the value must already be upper-cased and trimmed.
+	if overall == "OK" {
+		return "OK"
+	}
+	if overall == "FAILED" {
+		return "Warning"
+	}
+	// PARTIAL, UNSUPPORTED and every other value map to "Unknown".
+	return "Unknown"
+}
+
+// ExtractArchivePath is the exported form of extractArchivePath: given either a bare path or
+// a message such as "Archive written to /path/foo.tar.gz", it returns the .tar.gz path.
+func ExtractArchivePath(s string) string {
+	return extractArchivePath(s)
+}
+
+// ReadSATOverallStatus returns the upper-cased overall_status value from summary.txt in the
+// run directory derived from archivePath (the path with its ".tar.gz" suffix removed).
+// Returns "" if archivePath is blank or the file cannot be read.
+func ReadSATOverallStatus(archivePath string) string {
+	if strings.TrimSpace(archivePath) == "" {
+		return ""
+	}
+	runDir := strings.TrimSuffix(archivePath, ".tar.gz") // archivePath is used as given; it is not passed through extractArchivePath
+	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
+	if err != nil {
+		return ""
+	}
+	kv := parseSATKV(string(data))
+	return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
+}
+
+func extractArchivePath(s string) string { // returns the last whitespace-separated field of s when s ends in ".tar.gz"
+	s = strings.TrimSpace(s)
+	if strings.HasSuffix(s, ".tar.gz") {
+		parts := strings.Fields(s) // NOTE(review): a path that itself contains spaces is cut at its last space
+		if len(parts) > 0 {
+			return parts[len(parts)-1]
+		}
+	}
+	return s // anything else is returned trimmed but otherwise unchanged
+}
+
+func parseSATKV(raw string) map[string]string {
+	pairs := map[string]string{}
+	for _, rawLine := range strings.Split(raw, "\n") {
+		// Split each trimmed line at its first "="; lines without one are skipped.
+		if name, value, found := strings.Cut(strings.TrimSpace(rawLine), "="); found {
+			pairs[strings.TrimSpace(name)] = strings.TrimSpace(value)
+		}
+	}
+	return pairs
+}
--- a/audit/internal/app/panel.go
+++ b/audit/internal/app/panel.go
@@ -1,387 +0,0 @@
-package app
-
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"path/filepath"
-	"sort"
-	"strings"
-
-	"bee/audit/internal/schema"
-)
-
-// ComponentRow is one line in the hardware panel.
-type ComponentRow struct {
-	Key    string // "CPU", "MEM", "GPU", "DISK", "PSU"
-	Status string // "PASS", "FAIL", "CANCEL", "N/A"
-	Detail string // compact one-liner
-}
-
-// HardwarePanelData holds everything the TUI right panel needs.
-type HardwarePanelData struct {
-	Header []string
-	Rows   []ComponentRow
-}
-
-// LoadHardwarePanel reads the latest audit JSON and SAT summaries.
-// Returns empty panel if no audit data exists yet.
-func (a *App) LoadHardwarePanel() HardwarePanelData {
-	raw, err := os.ReadFile(DefaultAuditJSONPath)
-	if err != nil {
-		return HardwarePanelData{Header: []string{"No audit data — run audit first."}}
-	}
-	var snap schema.HardwareIngestRequest
-	if err := json.Unmarshal(raw, &snap); err != nil {
-		return HardwarePanelData{Header: []string{"Audit data unreadable."}}
-	}
-
-	statuses := satStatuses()
-
-	var header []string
-	if sys := formatSystemLine(snap.Hardware.Board); sys != "" {
-		header = append(header, sys)
-	}
-	for _, fw := range snap.Hardware.Firmware {
-		if fw.DeviceName == "BIOS" && fw.Version != "" {
-			header = append(header, "BIOS: "+fw.Version)
-		}
-		if fw.DeviceName == "BMC" && fw.Version != "" {
-			header = append(header, "BMC:  "+fw.Version)
-		}
-	}
-	if ip := formatIPLine(a.network.ListInterfaces); ip != "" {
-		header = append(header, ip)
-	}
-
-	var rows []ComponentRow
-
-	if cpu := formatCPULine(snap.Hardware.CPUs); cpu != "" {
-		rows = append(rows, ComponentRow{
-			Key:    "CPU",
-			Status: statuses["cpu"],
-			Detail: strings.TrimPrefix(cpu, "CPU: "),
-		})
-	}
-	if mem := formatMemoryLine(snap.Hardware.Memory); mem != "" {
-		rows = append(rows, ComponentRow{
-			Key:    "MEM",
-			Status: statuses["memory"],
-			Detail: strings.TrimPrefix(mem, "Memory: "),
-		})
-	}
-	if gpu := formatGPULine(snap.Hardware.PCIeDevices); gpu != "" {
-		rows = append(rows, ComponentRow{
-			Key:    "GPU",
-			Status: statuses["gpu"],
-			Detail: strings.TrimPrefix(gpu, "GPU: "),
-		})
-	}
-	if disk := formatStorageLine(snap.Hardware.Storage); disk != "" {
-		rows = append(rows, ComponentRow{
-			Key:    "DISK",
-			Status: statuses["storage"],
-			Detail: strings.TrimPrefix(disk, "Storage: "),
-		})
-	}
-	if psu := formatPSULine(snap.Hardware.PowerSupplies); psu != "" {
-		rows = append(rows, ComponentRow{
-			Key:    "PSU",
-			Status: "N/A",
-			Detail: psu,
-		})
-	}
-
-	return HardwarePanelData{Header: header, Rows: rows}
-}
-
-// ComponentDetailResult returns detail text for a component shown in the panel.
-func (a *App) ComponentDetailResult(key string) ActionResult {
-	switch key {
-	case "CPU":
-		return a.cpuDetailResult(false)
-	case "MEM":
-		return a.satDetailResult("memory", "memory-", "MEM detail")
-	case "GPU":
-		// Prefer whichever GPU SAT was run most recently.
-		nv, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-nvidia-*/summary.txt"))
-		am, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-amd-*/summary.txt"))
-		sort.Strings(nv)
-		sort.Strings(am)
-		latestNV := ""
-		if len(nv) > 0 {
-			latestNV = nv[len(nv)-1]
-		}
-		latestAM := ""
-		if len(am) > 0 {
-			latestAM = am[len(am)-1]
-		}
-		if latestAM > latestNV {
-			return a.satDetailResult("gpu", "gpu-amd-", "GPU detail")
-		}
-		return a.satDetailResult("gpu", "gpu-nvidia-", "GPU detail")
-	case "DISK":
-		return a.satDetailResult("storage", "storage-", "DISK detail")
-	case "PSU":
-		return a.psuDetailResult()
-	default:
-		return ActionResult{Title: key, Body: "No detail available."}
-	}
-}
-
-func (a *App) cpuDetailResult(satOnly bool) ActionResult {
-	var b strings.Builder
-
-	// Show latest SAT summary if available.
-	satResult := a.satDetailResult("cpu", "cpu-", "CPU SAT")
-	if satResult.Body != "No test results found. Run a test first." {
-		fmt.Fprintln(&b, "=== Last SAT ===")
-		fmt.Fprintln(&b, satResult.Body)
-		fmt.Fprintln(&b)
-	}
-
-	if satOnly {
-		body := strings.TrimSpace(b.String())
-		if body == "" {
-			body = "No CPU SAT results found. Run a test first."
-		}
-		return ActionResult{Title: "CPU SAT", Body: body}
-	}
-
-	raw, err := os.ReadFile(DefaultAuditJSONPath)
-	if err != nil {
-		return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
-	}
-	var snap schema.HardwareIngestRequest
-	if err := json.Unmarshal(raw, &snap); err != nil {
-		return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
-	}
-	if len(snap.Hardware.CPUs) == 0 {
-		return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
-	}
-	fmt.Fprintln(&b, "=== Audit ===")
-	for i, cpu := range snap.Hardware.CPUs {
-		fmt.Fprintf(&b, "CPU %d\n", i)
-		if cpu.Model != nil {
-			fmt.Fprintf(&b, "  Model:    %s\n", *cpu.Model)
-		}
-		if cpu.Manufacturer != nil {
-			fmt.Fprintf(&b, "  Vendor:   %s\n", *cpu.Manufacturer)
-		}
-		if cpu.Cores != nil {
-			fmt.Fprintf(&b, "  Cores:    %d\n", *cpu.Cores)
-		}
-		if cpu.Threads != nil {
-			fmt.Fprintf(&b, "  Threads:  %d\n", *cpu.Threads)
-		}
-		if cpu.MaxFrequencyMHz != nil {
-			fmt.Fprintf(&b, "  Max freq: %d MHz\n", *cpu.MaxFrequencyMHz)
-		}
-		if cpu.TemperatureC != nil {
-			fmt.Fprintf(&b, "  Temp:     %.1f°C\n", *cpu.TemperatureC)
-		}
-		if cpu.Throttled != nil {
-			fmt.Fprintf(&b, "  Throttled: %v\n", *cpu.Throttled)
-		}
-		if cpu.CorrectableErrorCount != nil && *cpu.CorrectableErrorCount > 0 {
-			fmt.Fprintf(&b, "  ECC correctable:   %d\n", *cpu.CorrectableErrorCount)
-		}
-		if cpu.UncorrectableErrorCount != nil && *cpu.UncorrectableErrorCount > 0 {
-			fmt.Fprintf(&b, "  ECC uncorrectable: %d\n", *cpu.UncorrectableErrorCount)
-		}
-		if i < len(snap.Hardware.CPUs)-1 {
-			fmt.Fprintln(&b)
-		}
-	}
-	return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
-}
-
-func (a *App) satDetailResult(statusKey, prefix, title string) ActionResult {
-	matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, prefix+"*/summary.txt"))
-	if err != nil || len(matches) == 0 {
-		return ActionResult{Title: title, Body: "No test results found. Run a test first."}
-	}
-	sort.Strings(matches)
-	raw, err := os.ReadFile(matches[len(matches)-1])
-	if err != nil {
-		return ActionResult{Title: title, Body: "Could not read test results."}
-	}
-	return ActionResult{Title: title, Body: formatSATDetail(strings.TrimSpace(string(raw)))}
-}
-
-// formatSATDetail converts raw summary.txt key=value content to a human-readable per-step display.
-func formatSATDetail(raw string) string {
-	var b strings.Builder
-	kv := parseKeyValueSummary(raw)
-
-	if t, ok := kv["run_at_utc"]; ok {
-		fmt.Fprintf(&b, "Run: %s\n\n", t)
-	}
-
-	// Collect step names in order they appear in the file
-	lines := strings.Split(raw, "\n")
-	var stepKeys []string
-	seenStep := map[string]bool{}
-	for _, line := range lines {
-		if idx := strings.Index(line, "_status="); idx >= 0 {
-			key := line[:idx]
-			if !seenStep[key] && key != "overall" {
-				seenStep[key] = true
-				stepKeys = append(stepKeys, key)
-			}
-		}
-	}
-
-	for _, key := range stepKeys {
-		status := kv[key+"_status"]
-		display := cleanSummaryKey(key)
-		switch status {
-		case "OK":
-			fmt.Fprintf(&b, "PASS  %s\n", display)
-		case "FAILED":
-			fmt.Fprintf(&b, "FAIL  %s\n", display)
-		case "UNSUPPORTED":
-			fmt.Fprintf(&b, "SKIP  %s\n", display)
-		default:
-			fmt.Fprintf(&b, "?     %s\n", display)
-		}
-	}
-
-	if overall, ok := kv["overall_status"]; ok {
-		ok2 := kv["job_ok"]
-		failed := kv["job_failed"]
-		fmt.Fprintf(&b, "\nOverall: %s  (ok=%s  failed=%s)", overall, ok2, failed)
-	}
-
-	return strings.TrimSpace(b.String())
-}
-
-// cleanSummaryKey strips the leading numeric prefix from a SAT step key.
-// "1-lscpu" → "lscpu", "3-stress-ng" → "stress-ng"
-func cleanSummaryKey(key string) string {
-	idx := strings.Index(key, "-")
-	if idx <= 0 {
-		return key
-	}
-	prefix := key[:idx]
-	for _, c := range prefix {
-		if c < '0' || c > '9' {
-			return key
-		}
-	}
-	return key[idx+1:]
-}
-
-func (a *App) psuDetailResult() ActionResult {
-	raw, err := os.ReadFile(DefaultAuditJSONPath)
-	if err != nil {
-		return ActionResult{Title: "PSU", Body: "No audit data."}
-	}
-	var snap schema.HardwareIngestRequest
-	if err := json.Unmarshal(raw, &snap); err != nil {
-		return ActionResult{Title: "PSU", Body: "Audit data unreadable."}
-	}
-	if len(snap.Hardware.PowerSupplies) == 0 {
-		return ActionResult{Title: "PSU", Body: "No PSU data in last audit."}
-	}
-	var b strings.Builder
-	for i, psu := range snap.Hardware.PowerSupplies {
-		fmt.Fprintf(&b, "PSU %d\n", i)
-		if psu.Model != nil {
-			fmt.Fprintf(&b, "  Model:   %s\n", *psu.Model)
-		}
-		if psu.Vendor != nil {
-			fmt.Fprintf(&b, "  Vendor:  %s\n", *psu.Vendor)
-		}
-		if psu.WattageW != nil {
-			fmt.Fprintf(&b, "  Rated:   %d W\n", *psu.WattageW)
-		}
-		if psu.InputPowerW != nil {
-			fmt.Fprintf(&b, "  Input:   %.1f W\n", *psu.InputPowerW)
-		}
-		if psu.OutputPowerW != nil {
-			fmt.Fprintf(&b, "  Output:  %.1f W\n", *psu.OutputPowerW)
-		}
-		if psu.TemperatureC != nil {
-			fmt.Fprintf(&b, "  Temp:    %.1f°C\n", *psu.TemperatureC)
-		}
-		if i < len(snap.Hardware.PowerSupplies)-1 {
-			fmt.Fprintln(&b)
-		}
-	}
-	return ActionResult{Title: "PSU", Body: strings.TrimSpace(b.String())}
-}
-
-// satStatuses reads the latest summary.txt for each SAT type and returns
-// a map of component key ("gpu","memory","storage") → status ("PASS","FAIL","CANCEL","N/A").
-func satStatuses() map[string]string {
-	result := map[string]string{
-		"gpu":     "N/A",
-		"memory":  "N/A",
-		"storage": "N/A",
-		"cpu":     "N/A",
-	}
-	patterns := []struct {
-		key    string
-		prefix string
-	}{
-		{"gpu", "gpu-nvidia-"},
-		{"gpu", "gpu-amd-"},
-		{"memory", "memory-"},
-		{"storage", "storage-"},
-		{"cpu", "cpu-"},
-	}
-	for _, item := range patterns {
-		matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
-		if err != nil || len(matches) == 0 {
-			continue
-		}
-		sort.Strings(matches)
-		raw, err := os.ReadFile(matches[len(matches)-1])
-		if err != nil {
-			continue
-		}
-		values := parseKeyValueSummary(string(raw))
-		switch strings.ToUpper(strings.TrimSpace(values["overall_status"])) {
-		case "OK":
-			result[item.key] = "PASS"
-		case "FAILED":
-			result[item.key] = "FAIL"
-		case "CANCELED", "CANCELLED":
-			result[item.key] = "CANCEL"
-		}
-	}
-	return result
-}
-
-func formatPSULine(psus []schema.HardwarePowerSupply) string {
-	var present []schema.HardwarePowerSupply
-	for _, psu := range psus {
-		if psu.Present != nil && !*psu.Present {
-			continue
-		}
-		present = append(present, psu)
-	}
-	if len(present) == 0 {
-		return ""
-	}
-	firstW := 0
-	if present[0].WattageW != nil {
-		firstW = *present[0].WattageW
-	}
-	allSame := firstW > 0
-	for _, p := range present[1:] {
-		w := 0
-		if p.WattageW != nil {
-			w = *p.WattageW
-		}
-		if w != firstW {
-			allSame = false
-			break
-		}
-	}
-	if allSame && firstW > 0 {
-		return fmt.Sprintf("%dx %dW", len(present), firstW)
-	}
-	return fmt.Sprintf("%d PSU", len(present))
-}
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -3,13 +3,14 @@ package app
 import (
 	"os"
 	"path/filepath"
 	"sort"
+	"strconv"
 	"strings"

 	"bee/audit/internal/schema"
 )

-func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
+func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
 	if snap == nil || strings.TrimSpace(baseDir) == "" {
 		return
 	}
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
 	}
 	if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
 		applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
+		applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
 	}
 	if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
 		applyMemorySAT(snap.Memory, summary)
@@ -28,6 +30,102 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
 	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
 		applyStorageSAT(snap.Storage, summary)
 	}
+	// Apply unified component status DB — overlaid last so it can only upgrade severity.
+	applyComponentStatusDB(snap, db)
+}
+
+type nvidiaPerGPUStatus struct { // one GPU's result as parsed from a gpu-*-status.txt file
+	runStatus string // upper-cased "run_status" value
+	reason    string // trimmed "reason" value; may be empty
+}
+
+func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) { // overlays per-GPU results from the latest gpu-nvidia-* run onto devs
+	statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
+	if !ok {
+		return
+	}
+	for i := range devs {
+		if devs[i].Telemetry == nil {
+			continue
+		}
+		rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"] // devices without this telemetry key are left untouched
+		if !ok {
+			continue
+		}
+		idx, ok := telemetryInt(rawIdx)
+		if !ok {
+			continue
+		}
+		st, ok := statusByIndex[idx] // match the device to its status file by GPU index
+		if !ok {
+			continue
+		}
+		status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
+		if !ok {
+			continue
+		}
+		mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
+	}
+}
+
+func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) { // returns per-index statuses, the run's run_at_utc, and whether any were found
+	matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
+	if err != nil || len(matches) == 0 {
+		return nil, "", false
+	}
+	sort.Strings(matches)
+	runDir := matches[len(matches)-1] // "latest" means the lexicographically last match
+	summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
+	if err != nil {
+		return nil, "", false
+	}
+	summaryKV := parseKeyValueSummary(string(summaryRaw))
+	runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
+	files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
+	if err != nil || len(files) == 0 {
+		return nil, "", false
+	}
+	out := make(map[int]nvidiaPerGPUStatus, len(files))
+	for _, file := range files {
+		raw, err := os.ReadFile(file)
+		if err != nil { // unreadable status files are skipped
+			continue
+		}
+		kv := parseKeyValueSummary(string(raw))
+		idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"])) // the index comes from the file content, not from the file name
+		if err != nil {
+			continue
+		}
+		out[idx] = nvidiaPerGPUStatus{
+			runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
+			reason:    strings.TrimSpace(kv["reason"]),
+		}
+	}
+	if len(out) == 0 {
+		return nil, "", false
+	}
+	return out, runAtUTC, true
+}
+
+func telemetryInt(v any) (int, bool) { // converts a telemetry value of one of several dynamic types to int
+	switch value := v.(type) {
+	case int:
+		return value, true
+	case int32:
+		return int(value), true
+	case int64:
+		return int(value), true
+	case float64:
+		return int(value), true // fractional part is truncated; presumably covers JSON-decoded numbers — TODO confirm
+	case string:
+		n, err := strconv.Atoi(strings.TrimSpace(value))
+		if err != nil {
+			return 0, false
+		}
+		return n, true
+	default:
+		return 0, false // any other dynamic type is rejected
+	}
 }

 type satSummary struct {
@@ -141,9 +239,11 @@ func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
 func satKeyStatus(rawStatus, label string) (string, string, bool) {
 	switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
 	case "OK":
-		return "OK", label + " passed", true
+		// No error description on success — error_description is for problems only.
+		return "OK", "", true
 	case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
-		return "Warning", label + " incomplete", true
+		// Tool couldn't run or test was incomplete — we can't assert hardware health.
+		return "Unknown", "", true
 	case "FAILED":
 		return "Critical", label + " failed", true
 	default:
@@ -172,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
 	}
 }

+func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) { // merges a SAT result; an equal-severity result with a description replaces the current one
+	if component == nil || satStatus == "" {
+		return
+	}
+	current := strings.TrimSpace(ptrString(component.Status))
+	newSeverity := statusSeverity(satStatus)
+	currentSeverity := statusSeverity(current)
+	if current == "" || current == "Unknown" || newSeverity > currentSeverity { // unset, Unknown, or less severe current status: defer to mergeComponentStatus
+		mergeComponentStatus(component, changedAt, satStatus, description)
+		return
+	}
+	if newSeverity == currentSeverity && strings.TrimSpace(description) != "" { // same severity: a non-empty description overrides the existing one
+		component.Status = appStringPtr(satStatus)
+		component.ErrorDescription = appStringPtr(description)
+		if strings.TrimSpace(changedAt) != "" { // timestamp and history entry are only written when a timestamp is known
+			component.StatusChangedAt = appStringPtr(changedAt)
+			component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
+				Status:    satStatus,
+				ChangedAt: changedAt,
+				Details:   appStringPtr(description),
+			})
+		}
+	}
+}
+
 func statusSeverity(status string) int {
 	switch strings.TrimSpace(status) {
 	case "Critical":
@@ -180,6 +305,8 @@ func statusSeverity(status string) int {
 		return 2
 	case "OK":
 		return 1
+	case "Unknown":
+		return 1 // same as OK — does not override OK from another source
 	default:
 		return 0
 	}
@@ -202,6 +329,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
 	}
 }

+func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) { // merges every DB record into the snapshot components selected by its key prefix
+	if snap == nil || db == nil {
+		return
+	}
+	for _, rec := range db.All() {
+		key := rec.ComponentKey
+		status := dbStatusToSATStatus(rec.Status)
+		if status == "" {
+			continue
+		}
+		detail := rec.ErrorSummary
+		ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
+
+		switch {
+		case strings.HasPrefix(key, "pcie:"):
+			bdf := strings.TrimPrefix(key, "pcie:")
+			bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
+			// Skip keys with no usable address; "pcie:gpu:nvidia" leaves "nvidia", which is kept but matches no BDF below.
+			if sanitizeBDFForLookup(bdf) == "" {
+				break
+			}
+			normalized := sanitizeBDFForLookup(bdf)
+			for i := range snap.PCIeDevices {
+				if snap.PCIeDevices[i].BDF == nil {
+					continue
+				}
+				if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
+					mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
+				}
+			}
+		case strings.HasPrefix(key, "storage:"):
+			devName := strings.TrimPrefix(key, "storage:")
+			if devName == "all" {
+				for i := range snap.Storage {
+					mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
+				}
+			} else {
+				for i := range snap.Storage {
+					linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string) // missing or non-string telemetry yields ""
+					if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
+						mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
+					}
+				}
+			}
+		case strings.HasPrefix(key, "memory:"): // any memory:* key is applied to every memory entry
+			for i := range snap.Memory {
+				mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
+			}
+		case strings.HasPrefix(key, "cpu:"): // any cpu:* key is applied to every CPU
+			for i := range snap.CPUs {
+				mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
+			}
+		}
+	}
+}
+
+// dbStatusToSATStatus validates a ComponentStatusDB status string for mergeComponentStatus:
+// "OK", "Warning", "Critical" and "Unknown" are returned whitespace-trimmed, anything else as "".
+func dbStatusToSATStatus(s string) string {
+	switch s = strings.TrimSpace(s); s { // return the trimmed value, so " OK " cannot leak through with its padding
+	case "OK", "Warning", "Critical", "Unknown":
+		return s
+	default:
+		return ""
+	}
+}
+
+// sanitizeBDFForLookup lower-cases and trims a PCIe BDF for comparison, prefixing "0000:" when it has
+// exactly one ":" ("c8:00.0" → "0000:c8:00.0"); "", "gpu" and values containing blanks yield "".
+func sanitizeBDFForLookup(bdf string) string {
+	bdf = strings.ToLower(strings.TrimSpace(bdf))
+	if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") {
+		return ""
+	}
+	if strings.Count(bdf, ":") == 1 { // short form without a PCI domain
+		bdf = "0000:" + bdf
+	}
+	return bdf // other inputs (e.g. a name with no ":") are returned lower-cased as-is
+}
+
 func ptrString(v *string) string {
 	if v == nil {
 		return ""
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
 	usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
 	snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}

-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)

 	if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
 		t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
@@ -53,9 +53,57 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
 		}},
 	}

-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)

 	if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
 		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
 	}
 }
+
+// TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile feeds a failed run
+// whose only per-GPU status file is for index 1 and checks the second device.
+func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
+	root := t.TempDir()
+	run := filepath.Join(root, "gpu-nvidia-20260407-162123")
+	if err := os.MkdirAll(run, 0755); err != nil {
+		t.Fatal(err)
+	}
+	fixtures := []struct{ name, body string }{
+		{"summary.txt", "run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"},
+		{"gpu-1-status.txt", "gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"},
+	}
+	for _, f := range fixtures {
+		if err := os.WriteFile(filepath.Join(run, f.name), []byte(f.body), 0644); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Two NVIDIA video controllers; the slice position doubles as the driver
+	// index stored under "nvidia_gpu_index".
+	class, vendor := "VideoController", "NVIDIA Corporation"
+	bdfs := []string{"0000:4b:00.0", "0000:4f:00.0"}
+	var snap schema.HardwareSnapshot
+	for idx := range bdfs {
+		snap.PCIeDevices = append(snap.PCIeDevices, schema.HardwarePCIeDevice{
+			DeviceClass:  &class,
+			Manufacturer: &vendor,
+			BDF:          &bdfs[idx],
+			Telemetry:    map[string]any{"nvidia_gpu_index": idx},
+		})
+	}
+
+	applyLatestSATStatuses(&snap, root, nil)
+
+	gpu1 := &snap.PCIeDevices[1]
+	if gpu1.Status == nil || *gpu1.Status != "Critical" {
+		t.Fatalf("gpu1 status=%v want Critical", gpu1.Status)
+	}
+	// The description must carry the reason taken from the per-GPU status file.
+	if gpu1.ErrorDescription == nil || *gpu1.ErrorDescription != "GPU requires reset failed" {
+		got := "<nil>"
+		if gpu1.ErrorDescription != nil {
+			got = *gpu1.ErrorDescription
+		}
+		t.Fatalf("gpu1 error=%q want per-gpu reason", got)
+	}
+}
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
 	"bee-network.service",
 	"bee-nvidia.service",
 	"bee-preflight.service",
+	"bee-selfheal.service",
+	"bee-selfheal.timer",
 	"bee-sshsetup.service",
 }

@@ -27,15 +29,176 @@ var supportBundleCommands = []struct {
 	cmd  []string
 }{
 	{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
+	{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
 	{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
 	{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
+	{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
 	{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
+	{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
+	{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
 	{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
-	{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
+	{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
+	{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
+if command -v dmesg >/dev/null 2>&1; then
+  dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
+else
+  echo "dmesg not found"
+fi
+`}},
+	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
+	{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
+if ! command -v lspci >/dev/null 2>&1; then
+  echo "lspci not found"
+  exit 0
+fi
+found=0
+	for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
+  found=1
+  echo "=== GPU $gpu ==="
+  lspci -s "$gpu" -vv 2>&1 || true
+  bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
+  if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
+    echo
+    echo "=== UPSTREAM $bridge for $gpu ==="
+    lspci -s "$bridge" -vv 2>&1 || true
+  fi
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no NVIDIA PCI devices found"
+fi
+`}},
+	{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
+for d in /sys/bus/pci/devices/*/; do
+  vendor=$(cat "$d/vendor" 2>/dev/null)
+	  [ "$vendor" = "0x10de" ] || continue
+	  class=$(cat "$d/class" 2>/dev/null)
+	  case "$class" in
+	    0x030000|0x030200) ;;
+	    *) continue ;;
+	  esac
+	  dev=$(basename "$d")
+  echo "=== $dev ==="
+  for f in current_link_speed current_link_width max_link_speed max_link_width; do
+    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
+  done
+done
+`}},
+	{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
+found=0
+for dev in /sys/bus/pci/devices/*; do
+  [ -e "$dev" ] || continue
+  bdf=$(basename "$dev")
+  block=""
+  for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
+    if [ -r "$dev/$f" ]; then
+      if [ -z "$block" ]; then
+        block=1
+        found=1
+        echo "=== $bdf ==="
+      fi
+      printf "  %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
+    fi
+  done
+  if [ -n "$block" ]; then
+    echo
+  fi
+done
+if [ "$found" -eq 0 ]; then
+  echo "no PCIe AER sysfs counters found"
+fi
+`}},
+	{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
+if ! command -v ethtool >/dev/null 2>&1; then
+  echo "ethtool not found"
+  exit 0
+fi
+found=0
+for path in /sys/class/net/*; do
+  [ -e "$path" ] || continue
+  iface=$(basename "$path")
+  [ "$iface" = "lo" ] && continue
+  found=1
+  echo "=== $iface ==="
+  ethtool -i "$iface" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no interfaces found"
+fi
+`}},
+	{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
+if ! command -v ethtool >/dev/null 2>&1; then
+  echo "ethtool not found"
+  exit 0
+fi
+found=0
+for path in /sys/class/net/*; do
+  [ -e "$path" ] || continue
+  iface=$(basename "$path")
+  [ "$iface" = "lo" ] && continue
+  found=1
+  echo "=== $iface ==="
+  ethtool "$iface" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no interfaces found"
+fi
+`}},
+	{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
+if ! command -v ethtool >/dev/null 2>&1; then
+  echo "ethtool not found"
+  exit 0
+fi
+found=0
+for path in /sys/class/net/*; do
+  [ -e "$path" ] || continue
+  iface=$(basename "$path")
+  [ "$iface" = "lo" ] && continue
+  found=1
+  echo "=== $iface ==="
+  ethtool -m "$iface" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no interfaces found"
+fi
+`}},
+	{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
+if ! command -v mstflint >/dev/null 2>&1; then
+  echo "mstflint not found"
+  exit 0
+fi
+found=0
+for path in /sys/bus/pci/devices/*; do
+  [ -e "$path/vendor" ] || continue
+  vendor=$(cat "$path/vendor" 2>/dev/null)
+  [ "$vendor" = "0x15b3" ] || continue
+  bdf=$(basename "$path")
+  found=1
+  echo "=== $bdf ==="
+  mstflint -d "$bdf" q 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no Mellanox/NVIDIA networking devices found"
+fi
+`}},
 }

+var supportBundleOptionalFiles = []struct { // files BuildSupportBundle copies via copyOptionalFile, ignoring errors
+	name string // destination path, joined onto the staging root
+	src  string // source path handed to copyOptionalFile
+}{
+	{name: "system/kern.log", src: "/var/log/kern.log"},
+	{name: "system/syslog.txt", src: "/var/log/syslog"},
+}
+
+const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz" // glob for the "<date> (BEE-SP v<ver>) ... .tar.gz" names built in BuildSupportBundle
+
 func BuildSupportBundle(exportDir string) (string, error) {
 	exportDir = strings.TrimSpace(exportDir)
 	if exportDir == "" {
@@ -48,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}

-	host := sanitizeFilename(hostnameOr("unknown"))
-	ts := time.Now().UTC().Format("20060102-150405")
-	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
+	now := time.Now().UTC()
+	date := now.Format("2006-01-02")
+	tod := now.Format("150405")
+	ver := bundleVersion()
+	model := serverModelForBundle()
+	sn := serverSerialForBundle()
+
+	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
 	if err := os.MkdirAll(stageRoot, 0755); err != nil {
 		return "", err
 	}
@@ -75,45 +243,79 @@ func BuildSupportBundle(exportDir string) (string, error) {
 			return "", err
 		}
 	}
+	for _, item := range supportBundleOptionalFiles {
+		_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
+	}
 	if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
 		return "", err
 	}

-	archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
+	archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
+	archivePath := filepath.Join(os.TempDir(), archiveName)
 	if err := createSupportTarGz(archivePath, stageRoot); err != nil {
 		return "", err
 	}
 	return archivePath, nil
 }

+// LatestSupportBundlePath returns the most recently modified support bundle
+// archive in os.TempDir(), or os.ErrNotExist when there is none.
+func LatestSupportBundlePath() (string, error) { return latestSupportBundlePath(os.TempDir()) }
+
 func cleanupOldSupportBundles(dir string) error {
-	matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
+	matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
 	if err != nil {
 		return err
 	}
-	type entry struct {
-		path string
-		mod  time.Time
+	entries := supportBundleEntries(matches)
+	for path, mod := range entries {
+		if time.Since(mod) > 24*time.Hour {
+			_ = os.Remove(path)
+			delete(entries, path)
+		}
 	}
-	list := make([]entry, 0, len(matches))
+	ordered := orderSupportBundles(entries)
+	if len(ordered) > 3 {
+		for _, old := range ordered[3:] {
+			_ = os.Remove(old)
+		}
+	}
+	return nil
+}
+
+// latestSupportBundlePath returns the most recently modified bundle archive in dir, or os.ErrNotExist.
+func latestSupportBundlePath(dir string) (string, error) {
+	found, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
+	if err != nil {
+		return "", err
+	}
+	if byAge := orderSupportBundles(supportBundleEntries(found)); len(byAge) > 0 {
+		return byAge[0], nil
+	}
+	return "", os.ErrNotExist
+}
+
+func supportBundleEntries(matches []string) map[string]time.Time {
+	entries := make(map[string]time.Time, len(matches))
 	for _, match := range matches {
 		info, err := os.Stat(match)
 		if err != nil {
 			continue
 		}
-		if time.Since(info.ModTime()) > 24*time.Hour {
-			_ = os.Remove(match)
-			continue
-		}
-		list = append(list, entry{path: match, mod: info.ModTime()})
+		entries[match] = info.ModTime()
 	}
-	sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
-	if len(list) > 3 {
-		for _, old := range list[3:] {
-			_ = os.Remove(old.path)
-		}
+	return entries
+}
+
+func orderSupportBundles(entries map[string]time.Time) []string {
+	ordered := make([]string, 0, len(entries))
+	for path := range entries {
+		ordered = append(ordered, path)
 	}
-	return nil
+	sort.Slice(ordered, func(i, j int) bool {
+		return entries[ordered[i]].After(entries[ordered[j]])
+	})
+	return ordered
 }

 func writeJournalDump(dst string) error {
@@ -152,6 +354,24 @@ func writeCommandOutput(dst string, cmd []string) error {
 	return os.WriteFile(dst, raw, 0644)
 }

+func copyOptionalFile(src, dst string) error { // copies src to dst, creating dst's parent directory
+	in, err := os.Open(src) // opened first, so a missing src leaves nothing behind at dst
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
+		return err
+	}
+	out, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	defer out.Close() // NOTE(review): the Close error of the written file is dropped here
+	_, err = io.Copy(out, in)
+	return err
+}
+
 func writeManifest(dst, exportDir, stageRoot string) error {
 	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
 		return err
@@ -188,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
 	return os.WriteFile(dst, []byte(body.String()), 0644)
 }

+// bundleVersion is buildVersion() minus one leading "v" and then one leading
+// "V"; an empty or "unknown" remainder is reported as "0.0".
+func bundleVersion() string {
+	ver := strings.TrimPrefix(strings.TrimPrefix(buildVersion(), "v"), "V")
+	if ver == "" || ver == "unknown" {
+		return "0.0"
+	}
+	return ver
+}
+
+// parseDMIField returns the value of the first "field: value" line in raw
+// (dmidecode output), or "unknown" when the field is absent or empty. Path
+// separators become "_" because the result is embedded in the archive file
+// name, where a "/" would point the archive into a directory that does not
+// exist.
+func parseDMIField(raw, field string) string {
+	for _, line := range strings.Split(raw, "\n") {
+		// Lines are trimmed first because dmidecode indents its fields.
+		key, val, ok := strings.Cut(strings.TrimSpace(line), ": ")
+		if !ok || strings.TrimSpace(key) != field {
+			continue
+		}
+		if val = strings.TrimSpace(val); val == "" {
+			return "unknown"
+		}
+		return strings.NewReplacer("/", "_", `\`, "_").Replace(val)
+	}
+	return "unknown"
+}
+
+// dmiSystemField runs `dmidecode -t 1` and returns field via parseDMIField,
+// or "unknown" when the command cannot be run or exits with an error.
+func dmiSystemField(field string) string {
+	raw, err := exec.Command("dmidecode", "-t", "1").Output()
+	if err != nil {
+		return "unknown"
+	}
+	return parseDMIField(string(raw), field)
+}
+
+// serverModelForBundle returns the DMI "Product Name" for the bundle file
+// name, with spaces and path separators replaced by "_", or "unknown" when it
+// cannot be determined.
+func serverModelForBundle() string {
+	return strings.ReplaceAll(dmiSystemField("Product Name"), " ", "_")
+}
+
+// serverSerialForBundle returns the DMI "Serial Number" for the bundle file
+// name, with path separators replaced by "_", or "unknown" when it cannot be
+// determined.
+func serverSerialForBundle() string {
+	return dmiSystemField("Serial Number")
+}
+
 func buildVersion() string {
 	raw, err := exec.Command("bee", "version").CombinedOutput()
 	if err != nil {
@@ -215,7 +489,7 @@ func copyDirContents(srcDir, dstDir string) error {
 }

 func copyExportDirForSupportBundle(srcDir, dstDir string) error {
-	return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
+	if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
 		cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
 		if cleanRel == "" {
 			return true
@@ -227,7 +501,25 @@ func copyExportDirForSupportBundle(srcDir, dstDir string) error {
 			return false
 		}
 		return true
-	})
+	}); err != nil {
+		return err
+	}
+	return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
+}
+
+func normalizeSupportBundleAuditJSON(path string) error { // rewrites the file at path with ApplySATOverlay's output
+	data, err := os.ReadFile(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil // no audit JSON was copied: nothing to rewrite
+		}
+		return err
+	}
+	normalized, err := ApplySATOverlay(data)
+	if err != nil {
+		return nil // overlay failure is not reported; the copied file is left untouched
+	}
+	return os.WriteFile(path, normalized, 0644)
 }

 func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
--- a/audit/internal/collector/finalize.go
+++ b/audit/internal/collector/finalize.go
@@ -1,10 +1,18 @@
 package collector

-import "bee/audit/internal/schema"
+import (
+	"bee/audit/internal/schema"
+	"strings"
+)
+
+func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) { // exported wrapper: forwards both arguments to finalizeSnapshot
+	finalizeSnapshot(snap, collectedAt)
+}

 func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
 	snap.Memory = filterMemory(snap.Memory)
 	snap.Storage = filterStorage(snap.Storage)
+	snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
 	snap.PowerSupplies = filterPSUs(snap.PowerSupplies)

 	setComponentStatusMetadata(snap, collectedAt)
@@ -33,11 +41,25 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
 		if disk.SerialNumber == nil || *disk.SerialNumber == "" {
 			continue
 		}
+		if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
+			continue
+		}
 		out = append(out, disk)
 	}
 	return out
 }

+// filterPCIe drops devices whose DeviceClass contains "co-processor" in any letter case.
+func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
+	kept := make([]schema.HardwarePCIeDevice, 0, len(devs))
+	for i := range devs {
+		if c := devs[i].DeviceClass; c == nil || !strings.Contains(strings.ToLower(strings.TrimSpace(*c)), "co-processor") {
+			kept = append(kept, devs[i])
+		}
+	}
+	return kept
+}
+
 func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
 	out := make([]schema.HardwarePowerSupply, 0, len(psus))
 	for _, psu := range psus {
--- a/audit/internal/collector/finalize_test.go
+++ b/audit/internal/collector/finalize_test.go
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
 	present := true
 	status := statusOK
 	serial := "SN-1"
+	virtualModel := "Virtual HDisk1"
+	realModel := "PASCARI"
+	coProcessorClass := "Co-processor"
+	gpuClass := "VideoController"

 	snap := schema.HardwareSnapshot{
 		Memory: []schema.HardwareMemory{
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
 			{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 		},
 		Storage: []schema.HardwareStorage{
+			{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
+			{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 		},
+		PCIeDevices: []schema.HardwarePCIeDevice{
+			{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
+			{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
+		},
 		PowerSupplies: []schema.HardwarePowerSupply{
 			{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
@@ -31,9 +41,12 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
 	if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
 		t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
 	}
-	if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
+	if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
 		t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
 	}
+	if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
+		t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
+	}
 	if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
 		t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
 	}
--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -2,18 +2,21 @@ package collector

 import (
 	"bee/audit/internal/schema"
+	"context"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"time"
 )

 const mellanoxVendorID = 0x15b3
+const nicProbeTimeout = 2 * time.Second

 var (
 	mstflintQuery = func(bdf string) (string, error) {
-		out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
 		if err != nil {
 			return "", err
 		}
@@ -21,7 +24,7 @@ var (
 	}

 	ethtoolInfoQuery = func(iface string) (string, error) {
-		out, err := exec.Command("ethtool", "-i", iface).Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
 		if err != nil {
 			return "", err
 		}
@@ -29,6 +32,14 @@ var (
 	}

 	netIfacesByBDF = listNetIfacesByBDF
+	readNetCarrierFile = func(iface string) (string, error) {
+		path := filepath.Join("/sys/class/net", iface, "carrier")
+		raw, err := os.ReadFile(path)
+		if err != nil {
+			return "", err
+		}
+		return strings.TrimSpace(string(raw)), nil
+	}
 )

 // enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
@@ -162,3 +173,9 @@ func listNetIfacesByBDF(bdf string) []string {
 	}
 	return ifaces
 }
+
+func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) { // runs name with args and returns its stdout
+	ctx, cancel := context.WithTimeout(context.Background(), timeout) // ctx expires once timeout has elapsed
+	defer cancel() // release the context's timer as soon as Output returns
+	return exec.CommandContext(ctx, name, args...).Output() // the command is bound to ctx, which bounds the probe
+}
--- a/audit/internal/collector/nic_telemetry.go
+++ b/audit/internal/collector/nic_telemetry.go
@@ -12,7 +12,7 @@ import (

 var (
 	ethtoolModuleQuery = func(iface string) (string, error) {
-		out, err := raidToolQuery("ethtool", "-m", iface)
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
 		if err != nil {
 			return "", err
 		}
@@ -113,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
 		}
 		key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
 		val := strings.TrimSpace(trimmed[idx+1:])
+		if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
+			continue
+		}

 		switch {
+		case key == "identifier":
+			s := parseSFPIdentifier(val)
+			dev.SFPIdentifier = &s
+			t := true
+			dev.SFPPresent = &t
+			changed = true
+		case key == "connector":
+			s := parseSFPConnector(val)
+			dev.SFPConnector = &s
+			changed = true
+		case key == "vendor name":
+			s := strings.TrimSpace(val)
+			dev.SFPVendor = &s
+			changed = true
+		case key == "vendor pn":
+			s := strings.TrimSpace(val)
+			dev.SFPPartNumber = &s
+			changed = true
+		case key == "vendor sn":
+			s := strings.TrimSpace(val)
+			dev.SFPSerialNumber = &s
+			changed = true
+		case strings.Contains(key, "laser wavelength"):
+			if f, ok := firstFloat(val); ok {
+				dev.SFPWavelengthNM = &f
+				changed = true
+			}
 		case strings.Contains(key, "module temperature"):
 			if f, ok := firstFloat(val); ok {
 				dev.SFPTemperatureC = &f
@@ -145,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
 	return changed
 }

+// parseSFPIdentifier reduces an ethtool identifier value such as "0x03 (SFP)"
+// to its parenthesised label ("SFP"); values without one come back unchanged.
+func parseSFPIdentifier(val string) string {
+	label := extractParens(val)
+	if label == "" {
+		return val
+	}
+	return label
+}
+
+// parseSFPConnector reduces an ethtool connector value such as "0x07 (LC)" to
+// its parenthesised label ("LC"); values without one come back unchanged.
+func parseSFPConnector(val string) string {
+	label := extractParens(val)
+	if label == "" {
+		return val
+	}
+	return label
+}
+
+var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
+
+// extractParens returns the trimmed text of the first non-empty "(...)" in s, or "".
+func extractParens(s string) string {
+	if m := parenRe.FindStringSubmatch(s); len(m) >= 2 {
+		return strings.TrimSpace(m[1])
+	}
+	return ""
+}
+
 func parseSFPDOM(raw string) map[string]any {
 	dev := schema.HardwarePCIeDevice{}
 	if !injectSFPDOMTelemetry(&dev, raw) {
 		return map[string]any{}
 	}
 	out := map[string]any{}
+	if dev.SFPPresent != nil {
+		out["sfp_present"] = *dev.SFPPresent
+	}
+	if dev.SFPIdentifier != nil {
+		out["sfp_identifier"] = *dev.SFPIdentifier
+	}
+	if dev.SFPConnector != nil {
+		out["sfp_connector"] = *dev.SFPConnector
+	}
+	if dev.SFPVendor != nil {
+		out["sfp_vendor"] = *dev.SFPVendor
+	}
+	if dev.SFPPartNumber != nil {
+		out["sfp_part_number"] = *dev.SFPPartNumber
+	}
+	if dev.SFPSerialNumber != nil {
+		out["sfp_serial_number"] = *dev.SFPSerialNumber
+	}
+	if dev.SFPWavelengthNM != nil {
+		out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
+	}
 	if dev.SFPTemperatureC != nil {
 		out["sfp_temperature_c"] = *dev.SFPTemperatureC
 	}
--- a/audit/internal/collector/nic_telemetry_test.go
+++ b/audit/internal/collector/nic_telemetry_test.go
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	origReadMAC := readNetAddressFile
 	origEth := ethtoolInfoQuery
 	origModule := ethtoolModuleQuery
+	origCarrier := readNetCarrierFile
 	t.Cleanup(func() {
 		queryPCILSPCIDetail = origDetail
 		readPCIVPDFile = origVPD
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		readNetAddressFile = origReadMAC
 		ethtoolInfoQuery = origEth
 		ethtoolModuleQuery = origModule
+		readNetCarrierFile = origCarrier
 	})

 	queryPCILSPCIDetail = func(bdf string) (string, error) {
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		}
 		return "aa:bb:cc:dd:ee:ff", nil
 	}
+	readNetCarrierFile = func(string) (string, error) { return "1", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
 	ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }

@@ -101,6 +104,39 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	}
 }

+func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
+	origIfaces := netIfacesByBDF
+	origReadMAC := readNetAddressFile
+	origEth := ethtoolInfoQuery
+	origModule := ethtoolModuleQuery
+	origCarrier := readNetCarrierFile
+	t.Cleanup(func() {
+		netIfacesByBDF = origIfaces
+		readNetAddressFile = origReadMAC
+		ethtoolInfoQuery = origEth
+		ethtoolModuleQuery = origModule
+		readNetCarrierFile = origCarrier
+	})
+
+	netIfacesByBDF = func(string) []string { return []string{"eth0"} }
+	readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
+	readNetCarrierFile = func(string) (string, error) { return "0", nil }
+	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
+	// The test name promises a skip: fail if the optics probe still runs.
+	ethtoolModuleQuery = func(iface string) (string, error) {
+		t.Errorf("ethtool -m queried for %s although carrier is down", iface)
+		return "", fmt.Errorf("no module")
+	}
+
+	class, bdf := "EthernetController", "0000:18:00.0"
+	devs := []schema.HardwarePCIeDevice{{DeviceClass: &class, BDF: &bdf}}
+
+	out := enrichPCIeWithNICTelemetry(devs)
+	if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
+		t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
+	}
+}
+
 func TestDBMValue(t *testing.T) {
 	tests := []struct {
 		in   string
--- a/audit/internal/collector/nvidia.go
+++ b/audit/internal/collector/nvidia.go
@@ -13,14 +13,20 @@ import (
 const nvidiaVendorID = 0x10de

 type nvidiaGPUInfo struct {
-	BDF            string
-	Serial         string
-	VBIOS          string
-	TemperatureC   *float64
-	PowerW         *float64
-	ECCUncorrected *int64
-	ECCCorrected   *int64
-	HWSlowdown     *bool
+	Index              int
+	BDF                string
+	Name               string
+	Serial             string
+	VBIOS              string
+	TemperatureC       *float64
+	PowerW             *float64
+	ECCUncorrected     *int64
+	ECCCorrected       *int64
+	HWSlowdown         *bool
+	PCIeLinkGenCurrent *int
+	PCIeLinkGenMax     *int
+	PCIeLinkWidthCur   *int
+	PCIeLinkWidthMax   *int
 }

 // enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
@@ -68,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 			continue
 		}

+		if v := strings.TrimSpace(info.Name); v != "" {
+			devs[i].Model = &v
+		}
 		if v := strings.TrimSpace(info.Serial); v != "" {
 			devs[i].SerialNumber = &v
 		}
@@ -94,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
 	out, err := exec.Command(
 		"nvidia-smi",
-		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
+		"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
 		"--format=csv,noheader,nounits",
 	).Output()
 	if err != nil {
@@ -118,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		if len(rec) == 0 {
 			continue
 		}
-		if len(rec) < 9 {
-			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
+		if len(rec) < 14 {
+			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
 		}

 		bdf := normalizePCIeBDF(rec[1])
@@ -128,14 +137,20 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		}

 		info := nvidiaGPUInfo{
-			BDF:            bdf,
-			Serial:         strings.TrimSpace(rec[2]),
-			VBIOS:          strings.TrimSpace(rec[3]),
-			TemperatureC:   parseMaybeFloat(rec[4]),
-			PowerW:         parseMaybeFloat(rec[5]),
-			ECCUncorrected: parseMaybeInt64(rec[6]),
-			ECCCorrected:   parseMaybeInt64(rec[7]),
-			HWSlowdown:     parseMaybeBool(rec[8]),
+			Index:              parseRequiredInt(rec[0]),
+			BDF:                bdf,
+			Name:               strings.TrimSpace(rec[2]),
+			Serial:             strings.TrimSpace(rec[3]),
+			VBIOS:              strings.TrimSpace(rec[4]),
+			TemperatureC:       parseMaybeFloat(rec[5]),
+			PowerW:             parseMaybeFloat(rec[6]),
+			ECCUncorrected:     parseMaybeInt64(rec[7]),
+			ECCCorrected:       parseMaybeInt64(rec[8]),
+			HWSlowdown:         parseMaybeBool(rec[9]),
+			PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
+			PCIeLinkGenMax:     parseMaybeInt(rec[11]),
+			PCIeLinkWidthCur:   parseMaybeInt(rec[12]),
+			PCIeLinkWidthMax:   parseMaybeInt(rec[13]),
 		}
 		result[bdf] = info
 	}
@@ -167,6 +182,30 @@ func parseMaybeInt64(v string) *int64 {
 	return &n
 }

+// parseMaybeInt parses an optional integer column of nvidia-smi CSV output.
+// Blank fields and placeholders such as "N/A" or "[Not Supported]" are not
+// special-cased: strconv.Atoi rejects them, so they yield nil like any other
+// non-numeric text.
+func parseMaybeInt(v string) *int {
+	n, err := strconv.Atoi(strings.TrimSpace(v))
+	if err != nil {
+		return nil
+	}
+	return &n
+}
+
+// parseRequiredInt parses v as a trimmed decimal int; unparsable input maps to 0.
+func parseRequiredInt(v string) int {
+	if n, err := strconv.Atoi(strings.TrimSpace(v)); err == nil {
+		return n
+	}
+	return 0
+}
+
+// pcieLinkGenLabel renders a PCIe generation number as the label "Gen<n>",
+// for example 4 becomes "Gen4".
+func pcieLinkGenLabel(gen int) string { return "Gen" + strconv.Itoa(gen) }
+
 func parseMaybeBool(v string) *bool {
 	v = strings.TrimSpace(strings.ToLower(v))
 	switch v {
@@ -216,6 +255,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
 }

 func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
+	if dev.Telemetry == nil {
+		dev.Telemetry = map[string]any{}
+	}
+	dev.Telemetry["nvidia_gpu_index"] = info.Index
 	if info.TemperatureC != nil {
 		dev.TemperatureC = info.TemperatureC
 	}
@@ -231,4 +274,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
 	if info.HWSlowdown != nil {
 		dev.HWSlowdown = info.HWSlowdown
 	}
+	// Override PCIe link speed/width with nvidia-smi driver values.
+	// sysfs current_link_speed reflects the instantaneous physical link state and
+	// can show Gen1 when the GPU is idle due to ASPM power management. The driver
+	// knows the negotiated speed regardless of the current power state.
+	if info.PCIeLinkGenCurrent != nil {
+		s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
+		dev.LinkSpeed = &s
+	}
+	if info.PCIeLinkGenMax != nil {
+		s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
+		dev.MaxLinkSpeed = &s
+	}
+	if info.PCIeLinkWidthCur != nil {
+		dev.LinkWidth = info.PCIeLinkWidthCur
+	}
+	if info.PCIeLinkWidthMax != nil {
+		dev.MaxLinkWidth = info.PCIeLinkWidthMax
+	}
 }
--- a/audit/internal/collector/nvidia_test.go
+++ b/audit/internal/collector/nvidia_test.go
@@ -6,7 +6,7 @@ import (
 )

 func TestParseNVIDIASMIQuery(t *testing.T) {
-	raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
+	raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
 	byBDF, err := parseNVIDIASMIQuery(raw)
 	if err != nil {
 		t.Fatalf("parse failed: %v", err)
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
 	if !ok {
 		t.Fatalf("gpu by normalized bdf not found")
 	}
+	if gpu.Name != "NVIDIA H100 80GB HBM3" {
+		t.Fatalf("name: got %q", gpu.Name)
+	}
 	if gpu.Serial != "GPU-SERIAL-1" {
 		t.Fatalf("serial: got %q", gpu.Serial)
 	}
@@ -28,6 +31,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
 	if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
 		t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
 	}
+	if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
+		t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
+	}
+	if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
+		t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
+	}
 }

 func TestNormalizePCIeBDF(t *testing.T) {
@@ -80,6 +89,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
 	if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
 		t.Fatalf("firmware: got %v", out[0].Firmware)
 	}
+	if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
+		t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
+	}
 	if out[0].Status == nil || *out[0].Status != statusWarning {
 		t.Fatalf("status: got %v", out[0].Status)
 	}
--- a/audit/internal/collector/pcie.go
+++ b/audit/internal/collector/pcie.go
@@ -2,6 +2,7 @@ package collector

 import (
 	"bee/audit/internal/schema"
+	"fmt"
 	"log/slog"
 	"os/exec"
 	"strconv"
@@ -59,6 +60,7 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
 		"host bridge",
 		"isa bridge",
 		"pci bridge",
+		"co-processor",
 		"performance counter",
 		"performance counters",
 		"ram memory",
@@ -78,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
 		}
 	}

+	// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
+	// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
+	if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
+		bmcPatterns := []string{
+			"management system chip",
+			"management controller",
+			"ibmc",
+			"idrac",
+			"ilo vga",
+			"aspeed",
+			"matrox",
+		}
+		for _, bad := range bmcPatterns {
+			if strings.Contains(d, bad) {
+				return false
+			}
+		}
+	}
+
 	if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
 		internalAMDPatterns := []string{
 			"dummy function",
@@ -152,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {

 	// SVendor/SDevice available but not in schema — skip

+	// Warn if PCIe link is running below its maximum negotiated speed.
+	applyPCIeLinkSpeedWarning(&dev)
+
 	return dev
 }

@@ -221,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
 	return value, true
 }

+// applyPCIeLinkSpeedWarning flags a device whose PCIe link runs slower than it could.
+//
+// When dev.LinkSpeed and dev.MaxLinkSpeed are both set, both are recognised by
+// pcieLinkSpeedRank, and the current rank is lower than the maximum rank, the
+// device status is replaced with statusWarning and ErrorDescription is set to a
+// message naming both speeds. In every other case dev is left untouched.
+//
+// A rank of 0 means the speed string was not recognised. Such values are
+// skipped explicitly: without the guard an unrecognised current speed would
+// rank below any known maximum and raise a spurious "degraded" warning.
+func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
+	if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
+		return
+	}
+	current := pcieLinkSpeedRank(*dev.LinkSpeed)
+	maximum := pcieLinkSpeedRank(*dev.MaxLinkSpeed)
+	if current == 0 || maximum == 0 {
+		// At least one side is unknown, so the comparison cannot be trusted.
+		return
+	}
+	if current >= maximum {
+		return
+	}
+	warn := statusWarning
+	dev.Status = &warn
+	desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
+	dev.ErrorDescription = &desc
+}
+
+// pcieLinkSpeedRank maps a normalized generation label ("Gen1" through "Gen6")
+// to its number, e.g. "Gen4" → 4. The match is exact and case-sensitive; any
+// other input yields 0, which stands for "generation not recognised".
+func pcieLinkSpeedRank(gen string) int {
+	const prefix = "Gen"
+	// Accept exactly the prefix followed by one character.
+	if len(gen) != len(prefix)+1 || gen[:len(prefix)] != prefix {
+		return 0
+	}
+	if digit := gen[len(prefix)]; digit >= '1' && digit <= '6' {
+		return int(digit - '0')
+	}
+	return 0
+}
+
 func normalizePCILinkSpeed(raw string) string {
 	raw = strings.TrimSpace(strings.ToLower(raw))
 	switch {
--- a/audit/internal/collector/pcie_filter_test.go
+++ b/audit/internal/collector/pcie_filter_test.go
@@ -1,6 +1,7 @@
 package collector

 import (
+	"bee/audit/internal/schema"
 	"encoding/json"
 	"strings"
 	"testing"
@@ -19,6 +20,7 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
 		{name: "audio", class: "Audio device", want: false},
 		{name: "host bridge", class: "Host bridge", want: false},
 		{name: "pci bridge", class: "PCI bridge", want: false},
+		{name: "co-processor", class: "Co-processor", want: false},
 		{name: "smbus", class: "SMBus", want: false},
 		{name: "perf", class: "Performance counters", want: false},
 		{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
@@ -28,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
 		{name: "raid", class: "RAID bus controller", want: true},
 		{name: "nvme", class: "Non-Volatile memory controller", want: true},
 		{name: "vga", class: "VGA compatible controller", want: true},
+		{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
+		{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
 		{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
 	}

@@ -76,6 +80,20 @@ func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
 	}
 }

+// TestParseLspci_filtersCoProcessors feeds parseLspci one Co-processor record
+// and one VGA record and expects only the VGA device (model "H100") to remain.
+func TestParseLspci_filtersCoProcessors(t *testing.T) {
+	records := []string{
+		"Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n",
+		"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n",
+	}
+
+	got := parseLspci(strings.Join(records, ""))
+	if n := len(got); n != 1 {
+		t.Fatalf("expected 1 remaining device, got %d", n)
+	}
+	if first := got[0]; first.Model == nil || *first.Model != "H100" {
+		t.Fatalf("unexpected remaining device: %+v", first)
+	}
+}
+
 func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
 	input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"

@@ -124,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
 		}
 	}
 }
+
+// TestApplyPCIeLinkSpeedWarning checks that applyPCIeLinkSpeedWarning raises a
+// warning, with a description naming the current speed, only when the current
+// link speed ranks below the maximum. When the speeds match or either one is
+// missing, the status must stay as it was and no description may be set.
+func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
+	speed := func(label string) *string { return &label }
+
+	cases := []struct {
+		name     string
+		current  *string
+		maximum  *string
+		degraded bool
+		mention  string // substring the ErrorDescription must contain when degraded
+	}{
+		{name: "degraded Gen1 vs Gen5", current: speed("Gen1"), maximum: speed("Gen5"), degraded: true, mention: "Gen1"},
+		{name: "at max Gen5", current: speed("Gen5"), maximum: speed("Gen5")},
+		{name: "degraded Gen4 vs Gen5", current: speed("Gen4"), maximum: speed("Gen5"), degraded: true, mention: "Gen4"},
+		{name: "missing current speed — no warning", maximum: speed("Gen5")},
+		{name: "missing max speed — no warning", current: speed("Gen1")},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Every device starts out healthy so a warning is attributable to the call.
+			initial := statusOK
+			dev := schema.HardwarePCIeDevice{}
+			dev.Status = &initial
+			dev.LinkSpeed = tc.current
+			dev.MaxLinkSpeed = tc.maximum
+
+			applyPCIeLinkSpeedWarning(&dev)
+
+			warned := dev.Status != nil && *dev.Status == statusWarning
+			if warned != tc.degraded {
+				t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tc.degraded, warned, dev.Status)
+			}
+
+			switch {
+			case !tc.degraded:
+				if dev.ErrorDescription != nil {
+					t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
+				}
+			case dev.ErrorDescription == nil:
+				t.Fatal("expected ErrorDescription to be set")
+			case !strings.Contains(*dev.ErrorDescription, tc.mention):
+				t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tc.mention)
+			}
+		})
+	}
+}
--- a/audit/internal/collector/storage.go
+++ b/audit/internal/collector/storage.go
@@ -77,11 +77,28 @@ func discoverStorageDevices() []lsblkDevice {
 		if dev.Type != "disk" {
 			continue
 		}
+		if isVirtualBMCDisk(dev) {
+			slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
+			continue
+		}
 		disks = append(disks, dev)
 	}
 	return disks
 }

+// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
+// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
+// Such devices are described as having zero reported size, a generic fake
+// serial, and a model name that starts with "Virtual HDisk"; of these, only the
+// model name is actually checked here, by delegating to isVirtualHDiskModel.
+func isVirtualBMCDisk(dev lsblkDevice) bool {
+	return isVirtualHDiskModel(dev.Model)
+}
+
+// isVirtualHDiskModel reports whether model, ignoring case and surrounding
+// whitespace, begins with "virtual hdisk".
+func isVirtualHDiskModel(model string) bool {
+	const prefix = "virtual hdisk"
+	normalized := strings.ToLower(strings.TrimSpace(model))
+	return strings.HasPrefix(normalized, prefix)
+}
+
 func lsblkDevices() []lsblkDevice {
 	out, err := exec.Command("lsblk", "-J", "-d",
 		"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -0,0 +1,360 @@
+package platform
+
+import (
+	"fmt"
+	"strings"
+	"time"
+)
+
+// renderBenchmarkReport returns the Markdown benchmark report for result.
+// It forwards to renderBenchmarkReportWithCharts without any extra processing.
+func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
+	return renderBenchmarkReportWithCharts(result)
+}
+
+// renderBenchmarkReportWithCharts builds the Markdown report for one benchmark run.
+//
+// Sections are written in a fixed order: header, Executive Summary (findings),
+// Warnings, Methodology, Scorecard, Per-GPU Details, Interconnect (NCCL),
+// Server Power (IPMI) and Raw Files. Sections backed by optional data
+// (findings, warnings, interconnect, server power) are left out when that data
+// is absent, and optional scorecard values that are not positive are shown as "-".
+// Despite its name, the function emits no charts; it only produces text and tables.
+//
+// The "Generated" timestamp is converted to UTC before formatting because the
+// layout appends a literal "UTC" suffix rather than the time's own zone.
+func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
+	var b strings.Builder
+
+	// ── Header ────────────────────────────────────────────────────────────────
+	b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
+
+	// System identity block
+	if result.ServerModel != "" {
+		fmt.Fprintf(&b, "**Server:** %s  \n", result.ServerModel)
+	}
+	if result.Hostname != "" {
+		fmt.Fprintf(&b, "**Host:** %s  \n", result.Hostname)
+	}
+	// GPU models summary: identical models are collapsed into "N× model",
+	// keeping the order in which each model first appears.
+	if len(result.GPUs) > 0 {
+		modelCount := make(map[string]int)
+		var modelOrder []string
+		for _, g := range result.GPUs {
+			m := strings.TrimSpace(g.Name)
+			if m == "" {
+				m = "Unknown GPU"
+			}
+			if modelCount[m] == 0 {
+				modelOrder = append(modelOrder, m)
+			}
+			modelCount[m]++
+		}
+		var parts []string
+		for _, m := range modelOrder {
+			if modelCount[m] == 1 {
+				parts = append(parts, m)
+			} else {
+				parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
+			}
+		}
+		fmt.Fprintf(&b, "**GPU(s):** %s  \n", strings.Join(parts, ", "))
+	}
+	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "**App version:** %s  \n", result.BenchmarkVersion)
+	// The layout hard-codes "UTC", so normalise the zone first to keep the label truthful.
+	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
+	if result.RampStep > 0 && result.RampTotal > 0 {
+		fmt.Fprintf(&b, "**Ramp-up step:** %d of %d  \n", result.RampStep, result.RampTotal)
+		if result.RampRunID != "" {
+			fmt.Fprintf(&b, "**Ramp-up run ID:** %s  \n", result.RampRunID)
+		}
+	} else if result.ParallelGPUs {
+		fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously)  \n")
+	}
+	if result.ScalabilityScore > 0 {
+		fmt.Fprintf(&b, "**Scalability score:** %.1f%%  \n", result.ScalabilityScore)
+	}
+	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
+	b.WriteString("\n")
+
+	// ── Executive Summary ─────────────────────────────────────────────────────
+	if len(result.Findings) > 0 {
+		b.WriteString("## Executive Summary\n\n")
+		for _, finding := range result.Findings {
+			fmt.Fprintf(&b, "- %s\n", finding)
+		}
+		b.WriteString("\n")
+	}
+
+	if len(result.Warnings) > 0 {
+		b.WriteString("## Warnings\n\n")
+		for _, warning := range result.Warnings {
+			fmt.Fprintf(&b, "- %s\n", warning)
+		}
+		b.WriteString("\n")
+	}
+
+	// ── Methodology ───────────────────────────────────────────────────────────
+	b.WriteString("## Methodology\n\n")
+	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
+	b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
+	b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
+	b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
+	b.WriteString("**Compute score** is derived from two phases:\n\n")
+	b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
+	b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
+	b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
+	b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
+	b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
+	b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
+	b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
+	b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
+	b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
+	b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
+
+	// ── Scorecard table ───────────────────────────────────────────────────────
+	b.WriteString("## Scorecard\n\n")
+	b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
+	b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
+	for _, gpu := range result.GPUs {
+		name := strings.TrimSpace(gpu.Name)
+		if name == "" {
+			name = "Unknown GPU"
+		}
+		// Optional scores fall back to "-" when they are not positive.
+		interconnect := "-"
+		if gpu.Scores.InterconnectScore > 0 {
+			interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
+		}
+		topsPerSM := "-"
+		if gpu.Scores.TOPSPerSMPerGHz > 0 {
+			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
+		}
+		synthetic := "-"
+		if gpu.Scores.SyntheticScore > 0 {
+			synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
+		}
+		mixed := "-"
+		if gpu.Scores.MixedScore > 0 {
+			mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
+		}
+		mixedEff := "-"
+		if gpu.Scores.MixedEfficiency > 0 {
+			mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
+		}
+		fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
+			gpu.Index, name,
+			gpu.Status,
+			gpu.Scores.CompositeScore,
+			gpu.Scores.ComputeScore,
+			synthetic,
+			mixed,
+			mixedEff,
+			topsPerSM,
+			gpu.Scores.PowerSustainScore,
+			gpu.Scores.ThermalSustainScore,
+			gpu.Scores.StabilityScore,
+			interconnect,
+		)
+	}
+	b.WriteString("\n")
+
+	// ── Per GPU detail ────────────────────────────────────────────────────────
+	b.WriteString("## Per-GPU Details\n\n")
+	for _, gpu := range result.GPUs {
+		name := strings.TrimSpace(gpu.Name)
+		if name == "" {
+			name = "Unknown GPU"
+		}
+		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
+
+		// Identity
+		if gpu.BusID != "" {
+			fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
+		}
+		if gpu.VBIOS != "" {
+			fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
+		}
+		if gpu.ComputeCapability != "" {
+			fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
+		}
+		if gpu.MultiprocessorCount > 0 {
+			fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
+		}
+		if gpu.PowerLimitW > 0 {
+			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
+		}
+		if gpu.LockedGraphicsClockMHz > 0 {
+			fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
+		}
+		b.WriteString("\n")
+
+		// Steady-state telemetry
+		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
+		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
+		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
+		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
+		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
+		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
+		b.WriteString("\n")
+
+		// Per-precision stability phases.
+		if len(gpu.PrecisionSteady) > 0 {
+			b.WriteString("**Per-precision stability:**\n\n")
+			b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n")
+			for _, p := range gpu.PrecisionSteady {
+				eccCorr := "—"
+				eccUncorr := "—"
+				if !p.ECC.IsZero() {
+					eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
+					eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
+				}
+				fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
+					p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
+					eccCorr, eccUncorr)
+			}
+			b.WriteString("\n")
+		} else {
+			// Legacy: show combined-window variance.
+			fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
+				gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
+		}
+
+		// ECC summary
+		if !gpu.ECC.IsZero() {
+			fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
+				gpu.ECC.Corrected, gpu.ECC.Uncorrected)
+		}
+
+		// Throttle: the line is omitted entirely when no counter is set.
+		throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
+		if throttle != "none" {
+			fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
+		}
+
+		// Precision results
+		if len(gpu.PrecisionResults) > 0 {
+			b.WriteString("**Precision results:**\n\n")
+			b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
+			for _, p := range gpu.PrecisionResults {
+				if p.Supported {
+					weightStr := fmt.Sprintf("×%.3g", p.Weight)
+					fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
+						p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
+				} else {
+					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
+				}
+			}
+			b.WriteString("\n")
+		}
+
+		// Degradation / Notes
+		if len(gpu.DegradationReasons) > 0 {
+			fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
+		}
+		if len(gpu.Notes) > 0 {
+			b.WriteString("**Notes:**\n\n")
+			for _, note := range gpu.Notes {
+				fmt.Fprintf(&b, "- %s\n", note)
+			}
+			b.WriteString("\n")
+		}
+	}
+
+	// ── Interconnect ──────────────────────────────────────────────────────────
+	if result.Interconnect != nil {
+		b.WriteString("## Interconnect (NCCL)\n\n")
+		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
+		if result.Interconnect.Supported {
+			b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
+			fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
+			fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString("\n")
+		}
+		for _, note := range result.Interconnect.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		if len(result.Interconnect.Notes) > 0 {
+			b.WriteString("\n")
+		}
+	}
+
+	// ── Server Power (IPMI) ───────────────────────────────────────────────────
+	if sp := result.ServerPower; sp != nil {
+		b.WriteString("## Server Power (IPMI)\n\n")
+		if !sp.Available {
+			b.WriteString("IPMI power measurement unavailable.\n\n")
+		} else {
+			b.WriteString("| | Value |\n|---|---|\n")
+			fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
+			fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
+			fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
+			fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
+			if sp.ReportingRatio > 0 {
+				fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
+			}
+			b.WriteString("\n")
+		}
+		for _, note := range sp.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		if len(sp.Notes) > 0 {
+			b.WriteString("\n")
+		}
+	}
+
+	// ── Raw files ─────────────────────────────────────────────────────────────
+	b.WriteString("## Raw Files\n\n")
+	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
+	b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
+	if result.Interconnect != nil {
+		b.WriteString("- `nccl-all-reduce.log`\n")
+	}
+	return b.String()
+}
+
+// formatThrottleLine summarises the non-zero throttle counters in t as a single
+// line of "label=value" items separated by two spaces, or "none" when every
+// counter is zero.
+//
+// Counter values are divided by 1e6 to obtain seconds. With a positive
+// steadyDurationSec each item is shown as a percentage of that window followed
+// by whole seconds; otherwise the raw duration is shown, in milliseconds when
+// it is below one second and in seconds otherwise.
+func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
+	// labels and micros are parallel arrays; their order is the output order.
+	labels := [...]string{"sw_power", "sw_thermal", "sync_boost", "hw_thermal", "hw_power_brake"}
+	micros := [...]uint64{
+		t.SWPowerCapUS,
+		t.SWThermalSlowdownUS,
+		t.SyncBoostUS,
+		t.HWThermalSlowdownUS,
+		t.HWPowerBrakeSlowdownUS,
+	}
+
+	var items []string
+	for i, us := range micros {
+		if us == 0 {
+			continue
+		}
+		seconds := float64(us) / 1e6
+		switch {
+		case steadyDurationSec > 0:
+			share := seconds / steadyDurationSec * 100
+			items = append(items, fmt.Sprintf("%s=%.1f%% (%.0fs)", labels[i], share, seconds))
+		case seconds < 1:
+			items = append(items, fmt.Sprintf("%s=%.0fms", labels[i], seconds*1000))
+		default:
+			items = append(items, fmt.Sprintf("%s=%.1fs", labels[i], seconds))
+		}
+	}
+	if len(items) > 0 {
+		return strings.Join(items, "  ")
+	}
+	return "none"
+}
+
+// renderBenchmarkSummary renders result as plain "key=value" lines, one per
+// line: run timestamp, profile, overall status, GPU count, normalization
+// status, the status and composite score of every GPU, the best composite
+// score, and — when interconnect data is present — its status and maximum bus
+// bandwidth. With no GPUs the best composite score is reported as 0.00.
+//
+// The timestamp is written under the key run_at_utc, so it is converted to UTC
+// before being formatted as RFC 3339.
+func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
+	var b strings.Builder
+	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
+	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
+	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
+	fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
+	var best float64
+	for i, gpu := range result.GPUs {
+		fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
+		fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
+		// Seed best from the first GPU so negative scores are not masked by the zero value.
+		if i == 0 || gpu.Scores.CompositeScore > best {
+			best = gpu.Scores.CompositeScore
+		}
+	}
+	fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
+		fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
+	}
+	return b.String()
+}
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -0,0 +1,235 @@
+package platform
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestResolveBenchmarkProfile pins the phase durations that
+// resolveBenchmarkProfile returns for the empty (default), "stability" and
+// "overnight" profile names.
+func TestResolveBenchmarkProfile(t *testing.T) {
+	t.Parallel()
+
+	standard := benchmarkProfileSpec{
+		Name:        NvidiaBenchmarkProfileStandard,
+		BaselineSec: 15,
+		WarmupSec:   120,
+		SteadySec:   480,
+		NCCLSec:     180,
+		CooldownSec: 120,
+	}
+	stability := benchmarkProfileSpec{
+		Name:        NvidiaBenchmarkProfileStability,
+		BaselineSec: 30,
+		WarmupSec:   300,
+		SteadySec:   3600,
+		NCCLSec:     300,
+		CooldownSec: 300,
+	}
+	overnight := benchmarkProfileSpec{
+		Name:        NvidiaBenchmarkProfileOvernight,
+		BaselineSec: 60,
+		WarmupSec:   600,
+		SteadySec:   27000,
+		NCCLSec:     600,
+		CooldownSec: 300,
+	}
+
+	tests := []struct {
+		name    string
+		profile string
+		want    benchmarkProfileSpec
+	}{
+		{name: "default", profile: "", want: standard},
+		{name: "stability", profile: "stability", want: stability},
+		{name: "overnight", profile: "overnight", want: overnight},
+	}
+
+	for _, tt := range tests {
+		tt := tt
+		t.Run(tt.name, func(t *testing.T) {
+			if got := resolveBenchmarkProfile(tt.profile); got != tt.want {
+				t.Fatalf("profile=%q got %+v want %+v", tt.profile, got, tt.want)
+			}
+		})
+	}
+}
+
+// TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice verifies that
+// normalisation keeps the "stability" profile and does not switch RunNCCL on
+// when the caller left it false.
+func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
+	t.Parallel()
+
+	input := NvidiaBenchmarkOptions{Profile: "stability", RunNCCL: false}
+	got := normalizeNvidiaBenchmarkOptionsForBenchmark(input)
+
+	if want := NvidiaBenchmarkProfileStability; got.Profile != want {
+		t.Fatalf("profile=%q want %q", got.Profile, want)
+	}
+	if got.RunNCCL {
+		t.Fatalf("RunNCCL should stay false when explicitly disabled")
+	}
+}
+
+// TestParseBenchmarkBurnLog parses a small single-GPU burn log and checks the
+// backend, compute capability, number of profiles, that the first profile has
+// a positive TOPS figure, and that the second profile is categorised as "fp8".
+func TestParseBenchmarkBurnLog(t *testing.T) {
+	t.Parallel()
+
+	logLines := []string{
+		"loader=bee-gpu-burn",
+		"[gpu 0] device=NVIDIA H100",
+		"[gpu 0] compute_capability=9.0",
+		"[gpu 0] backend=cublasLt",
+		"[gpu 0] duration_s=10",
+		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
+		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
+		"[gpu 0] fp16_tensor_iterations=200",
+		"[gpu 0] fp8_e4m3_iterations=50",
+		"[gpu 0] status=OK",
+	}
+	parsed := parseBenchmarkBurnLog(strings.Join(logLines, "\n"))
+
+	if parsed.Backend != "cublasLt" {
+		t.Fatalf("backend=%q want cublasLt", parsed.Backend)
+	}
+	if parsed.ComputeCapability != "9.0" {
+		t.Fatalf("compute capability=%q want 9.0", parsed.ComputeCapability)
+	}
+	if n := len(parsed.Profiles); n != 2 {
+		t.Fatalf("profiles=%d want 2", n)
+	}
+	if tops := parsed.Profiles[0].TeraOpsPerSec; tops <= 0 {
+		t.Fatalf("profile[0] teraops=%f want >0", tops)
+	}
+	if category := parsed.Profiles[1].Category; category != "fp8" {
+		t.Fatalf("profile[1] category=%q want fp8", category)
+	}
+}
+
+// TestRenderBenchmarkReportIncludesFindingsAndScores renders a one-GPU result
+// and checks that the report contains the Executive Summary heading, the
+// finding text, the composite score, and the precision name with its TOPS value.
+func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
+	t.Parallel()
+
+	gpu := BenchmarkGPUResult{
+		Index:  0,
+		Name:   "NVIDIA H100",
+		Status: "OK",
+		Steady: BenchmarkTelemetrySummary{
+			AvgPowerW:           680,
+			AvgTempC:            79,
+			AvgGraphicsClockMHz: 1725,
+			P95PowerW:           700,
+			P95TempC:            82,
+			P95GraphicsClockMHz: 1800,
+		},
+		Scores: BenchmarkScorecard{
+			ComputeScore:        1200,
+			PowerSustainScore:   96,
+			ThermalSustainScore: 88,
+			StabilityScore:      92,
+			CompositeScore:      1176,
+		},
+		PrecisionResults: []BenchmarkPrecisionResult{
+			{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
+		},
+		Throttle:           BenchmarkThrottleCounters{SWPowerCapUS: 1000000},
+		DegradationReasons: []string{"power_capped"},
+	}
+	input := NvidiaBenchmarkResult{
+		BenchmarkVersion:   benchmarkVersion,
+		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
+		OverallStatus:      "PARTIAL",
+		SelectedGPUIndices: []int{0},
+		Normalization:      BenchmarkNormalization{Status: "partial"},
+		Findings:           []string{"GPU 0 spent measurable time under SW power cap."},
+		GPUs:               []BenchmarkGPUResult{gpu},
+	}
+
+	report := renderBenchmarkReport(input)
+
+	expected := []string{
+		"Executive Summary",
+		"GPU 0 spent measurable time under SW power cap.",
+		"1176.00",
+		"fp16_tensor",
+		"700.00",
+	}
+	for _, fragment := range expected {
+		if !strings.Contains(report, fragment) {
+			t.Fatalf("report missing %q\n%s", fragment, report)
+		}
+	}
+}
+
+// TestRenderBenchmarkReportListsUnifiedArtifacts renders a result without any
+// GPUs and checks that the report still names the gpu-metrics.csv,
+// gpu-metrics.html and gpu-burn.log artifacts.
+func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
+	t.Parallel()
+
+	input := NvidiaBenchmarkResult{
+		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
+		OverallStatus:      "OK",
+		SelectedGPUIndices: []int{0},
+		Normalization:      BenchmarkNormalization{Status: "full"},
+	}
+	report := renderBenchmarkReport(input)
+
+	artifacts := []string{"gpu-metrics.csv", "gpu-metrics.html", "gpu-burn.log"}
+	for _, artifact := range artifacts {
+		if !strings.Contains(report, artifact) {
+			t.Fatalf("report missing %q\n%s", artifact, report)
+		}
+	}
+}
+
+// TestEnrichGPUInfoWithMaxClocks runs enrichGPUInfoWithMaxClocks over a text
+// dump describing two GPUs and expects both map entries to end up with the
+// "Max Clocks" Graphics (2430 MHz) and Memory (12481 MHz) values.
+// The first GPU also has a plain "Clocks" section with a different Graphics
+// value (2422 MHz), so the test fails if the wrong section is read.
+// NOTE(review): the fixture looks like `nvidia-smi -q` output and the GPUs are
+// presumably matched through BusID — confirm against enrichGPUInfoWithMaxClocks.
+func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
+	t.Parallel()
+
+	nvsmiQ := []byte(`
+GPU 00000000:4E:00.0
+    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
+    Clocks
+        Graphics                          : 2422 MHz
+        Memory                            : 12481 MHz
+    Max Clocks
+        Graphics                          : 2430 MHz
+        SM                                : 2430 MHz
+        Memory                            : 12481 MHz
+        Video                             : 2107 MHz
+
+GPU 00000000:4F:00.0
+    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
+    Max Clocks
+        Graphics                          : 2430 MHz
+        Memory                            : 12481 MHz
+`)
+
+	// Both entries start with zero max clocks so the function has to fill them in.
+	infoByIndex := map[int]benchmarkGPUInfo{
+		0: {Index: 0, BusID: "00000000:4E:00.0"},
+		1: {Index: 1, BusID: "00000000:4F:00.0"},
+	}
+
+	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+
+	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
+		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
+	}
+	if infoByIndex[0].MaxMemoryClockMHz != 12481 {
+		t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
+	}
+	if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
+		t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
+	}
+	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
+		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
+	}
+}
+
+// TestEnrichGPUInfoWithMaxClocksSkipsPopulated checks that
+// enrichGPUInfoWithMaxClocks leaves an already populated MaxGraphicsClockMHz
+// alone: the fixture advertises 9999 MHz, but the pre-set 2430 MHz must survive.
+// Only the graphics clock is asserted; the memory clock is not checked here.
+func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
+	t.Parallel()
+
+	nvsmiQ := []byte(`
+GPU 00000000:4E:00.0
+    Max Clocks
+        Graphics                          : 9999 MHz
+        Memory                            : 9999 MHz
+`)
+	// Already populated — must not be overwritten.
+	infoByIndex := map[int]benchmarkGPUInfo{
+		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
+	}
+
+	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+
+	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
+		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
+	}
+}
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -0,0 +1,235 @@
+package platform
+
+import "time"
+
+// BenchmarkHostConfig holds static CPU and memory configuration captured at
+// benchmark start. Useful for correlating results across runs on different hardware.
+type BenchmarkHostConfig struct {
+	CPUModel    string  `json:"cpu_model,omitempty"`
+	CPUSockets  int     `json:"cpu_sockets,omitempty"`
+	CPUCores    int     `json:"cpu_cores,omitempty"`
+	CPUThreads  int     `json:"cpu_threads,omitempty"`
+	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
+}
+
+// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
+// steady-state phase. High or unstable CPU load during a GPU benchmark may
+// indicate a competing workload or a CPU-bound driver bottleneck.
+type BenchmarkCPULoad struct {
+	AvgPct  float64 `json:"avg_pct"`
+	MaxPct  float64 `json:"max_pct"`
+	P95Pct  float64 `json:"p95_pct"`
+	Samples int     `json:"samples"`
+	// Status is "ok", "high", or "unstable".
+	Status string `json:"status"`
+	Note   string `json:"note,omitempty"`
+}
+
+const (
+	NvidiaBenchmarkProfileStandard  = "standard"
+	NvidiaBenchmarkProfileStability = "stability"
+	NvidiaBenchmarkProfileOvernight = "overnight"
+)
+
+type NvidiaBenchmarkOptions struct {
+	Profile           string
+	SizeMB            int
+	GPUIndices        []int
+	ExcludeGPUIndices []int
+	RunNCCL           bool
+	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
+	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
+	RampTotal         int    // total number of ramp-up steps in this run
+	RampRunID         string // shared identifier across all steps of the same ramp-up run
+}
+
+type NvidiaBenchmarkResult struct {
+	BenchmarkVersion   string                       `json:"benchmark_version"`
+	GeneratedAt        time.Time                    `json:"generated_at"`
+	Hostname           string                       `json:"hostname,omitempty"`
+	ServerModel        string                       `json:"server_model,omitempty"`
+	BenchmarkProfile   string                       `json:"benchmark_profile"`
+	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
+	RampStep           int                          `json:"ramp_step,omitempty"`
+	RampTotal          int                          `json:"ramp_total,omitempty"`
+	RampRunID          string                       `json:"ramp_run_id,omitempty"`
+	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
+	OverallStatus      string                       `json:"overall_status"`
+	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
+	Findings           []string                     `json:"findings,omitempty"`
+	Warnings           []string                     `json:"warnings,omitempty"`
+	Normalization      BenchmarkNormalization       `json:"normalization"`
+	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
+	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
+	GPUs               []BenchmarkGPUResult         `json:"gpus"`
+	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
+}
+
+type BenchmarkNormalization struct {
+	Status string                      `json:"status"`
+	Notes  []string                    `json:"notes,omitempty"`
+	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
+}
+
+type BenchmarkNormalizationGPU struct {
+	Index                 int      `json:"index"`
+	PersistenceMode       string   `json:"persistence_mode,omitempty"`
+	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
+	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
+	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
+	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
+	Notes                 []string `json:"notes,omitempty"`
+}
+
+type BenchmarkGPUResult struct {
+	Index               int     `json:"index"`
+	UUID                string  `json:"uuid,omitempty"`
+	Name                string  `json:"name,omitempty"`
+	BusID               string  `json:"bus_id,omitempty"`
+	VBIOS               string  `json:"vbios,omitempty"`
+	ComputeCapability   string  `json:"compute_capability,omitempty"`
+	Backend             string  `json:"backend,omitempty"`
+	Status              string  `json:"status"`
+	PowerLimitW         float64 `json:"power_limit_w,omitempty"`
+	MultiprocessorCount int     `json:"multiprocessor_count,omitempty"`
+	DefaultPowerLimitW  float64 `json:"default_power_limit_w,omitempty"`
+	// CalibratedPeakPowerW is the p95 power measured during a short
+	// dcgmi targeted_power calibration run before the main benchmark.
+	// Used as the reference denominator for PowerSustainScore instead of
+	// the hardware default limit, which bee-gpu-burn cannot reach.
+	CalibratedPeakPowerW   float64                         `json:"calibrated_peak_power_w,omitempty"`
+	MaxGraphicsClockMHz    float64                         `json:"max_graphics_clock_mhz,omitempty"`
+	BaseGraphicsClockMHz   float64                         `json:"base_graphics_clock_mhz,omitempty"`
+	MaxMemoryClockMHz      float64                         `json:"max_memory_clock_mhz,omitempty"`
+	LockedGraphicsClockMHz float64                         `json:"locked_graphics_clock_mhz,omitempty"`
+	LockedMemoryClockMHz   float64                         `json:"locked_memory_clock_mhz,omitempty"`
+	Baseline               BenchmarkTelemetrySummary       `json:"baseline"`
+	Steady                 BenchmarkTelemetrySummary       `json:"steady"`
+	PrecisionSteady        []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
+	Cooldown               BenchmarkTelemetrySummary       `json:"cooldown"`
+	Throttle               BenchmarkThrottleCounters       `json:"throttle_counters"`
+	// ECC error delta over the full benchmark (all phases combined). NOTE(review): omitempty never omits a struct value in encoding/json.
+	ECC                BenchmarkECCCounters       `json:"ecc,omitempty"`
+	PrecisionResults   []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
+	Scores             BenchmarkScorecard         `json:"scores"`
+	DegradationReasons []string                   `json:"degradation_reasons,omitempty"`
+	Notes              []string                   `json:"notes,omitempty"`
+}
+
+type BenchmarkTelemetrySummary struct {
+	DurationSec         float64 `json:"duration_sec"`
+	Samples             int     `json:"samples"`
+	AvgTempC            float64 `json:"avg_temp_c"`
+	P95TempC            float64 `json:"p95_temp_c"`
+	AvgPowerW           float64 `json:"avg_power_w"`
+	P95PowerW           float64 `json:"p95_power_w"`
+	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
+	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
+	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
+	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
+	AvgUsagePct         float64 `json:"avg_usage_pct"`
+	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
+	ClockCVPct          float64 `json:"clock_cv_pct"`
+	PowerCVPct          float64 `json:"power_cv_pct"`
+	TempCVPct           float64 `json:"temp_cv_pct"`
+	ClockDriftPct       float64 `json:"clock_drift_pct"`
+}
+
+type BenchmarkThrottleCounters struct {
+	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
+	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
+	SyncBoostUS            uint64 `json:"sync_boost_us"`
+	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
+	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
+}
+
+// BenchmarkECCCounters holds a pair of ECC error counts: either a point-in-time
+// sample or a delta between two samples. Corrected = single-bit errors fixed by
+// ECC (DRAM degradation); Uncorrected = double-bit errors that could not be corrected
+// (serious fault). Both are volatile (since last driver reset), not persistent.
+type BenchmarkECCCounters struct {
+	Corrected   uint64 `json:"corrected"`
+	Uncorrected uint64 `json:"uncorrected"`
+}
+
+func (e BenchmarkECCCounters) Total() uint64 { return e.Corrected + e.Uncorrected }
+func (e BenchmarkECCCounters) IsZero() bool  { return e.Corrected == 0 && e.Uncorrected == 0 }
+
+type BenchmarkPrecisionResult struct {
+	Name          string  `json:"name"`
+	Category      string  `json:"category"`
+	Supported     bool    `json:"supported"`
+	Lanes         int     `json:"lanes,omitempty"`
+	M             uint64  `json:"m,omitempty"`
+	N             uint64  `json:"n,omitempty"`
+	K             uint64  `json:"k,omitempty"`
+	Iterations    uint64  `json:"iterations,omitempty"`
+	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
+	// Weight is the fp32-equivalence factor for this precision category.
+	// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
+	// WeightedTeraOpsPerSec = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
+	Weight                float64 `json:"weight,omitempty"`
+	WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
+	Notes                 string  `json:"notes,omitempty"`
+}
+
+type BenchmarkScorecard struct {
+	ComputeScore float64 `json:"compute_score"`
+	// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
+	// steady phases (each precision ran alone, full GPU dedicated).
+	SyntheticScore float64 `json:"synthetic_score,omitempty"`
+	// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
+	// (all precisions competing simultaneously — closer to real workloads).
+	MixedScore float64 `json:"mixed_score,omitempty"`
+	// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
+	// sustains throughput under concurrent mixed-precision load.
+	MixedEfficiency     float64 `json:"mixed_efficiency,omitempty"`
+	PowerSustainScore   float64 `json:"power_sustain_score"`
+	ThermalSustainScore float64 `json:"thermal_sustain_score"`
+	StabilityScore      float64 `json:"stability_score"`
+	InterconnectScore   float64 `json:"interconnect_score"`
+	CompositeScore      float64 `json:"composite_score"`
+	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
+	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
+}
+
+// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
+// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
+// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
+// over-reporting its power consumption.
+type BenchmarkServerPower struct {
+	Available       bool     `json:"available"`
+	IdleW           float64  `json:"idle_w,omitempty"`
+	LoadedW         float64  `json:"loaded_w,omitempty"`
+	DeltaW          float64  `json:"delta_w,omitempty"`
+	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
+	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
+	Notes           []string `json:"notes,omitempty"`
+}
+
+// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
+// during a dedicated steady window in which only that one precision runs. Because
+// only one kernel type runs at a time the PowerCVPct here is a genuine stability signal.
+type BenchmarkPrecisionSteadyPhase struct {
+	Precision             string                    `json:"precision"` // e.g. "fp8", "fp16", "fp32"
+	Steady                BenchmarkTelemetrySummary `json:"steady"`
+	TeraOpsPerSec         float64                   `json:"teraops_per_sec,omitempty"`
+	WeightedTeraOpsPerSec float64                   `json:"weighted_teraops_per_sec,omitempty"`
+	// ECC errors accumulated during this precision phase only. Non-zero corrected =
+	// stress-induced DRAM errors for this kernel type; any uncorrected = serious fault.
+	// NOTE(review): encoding/json's omitempty never omits a struct value, so "ecc" is always emitted.
+	ECC BenchmarkECCCounters `json:"ecc,omitempty"`
+}
+
+type BenchmarkInterconnectResult struct {
+	Status             string   `json:"status"`
+	Attempted          bool     `json:"attempted"`
+	Supported          bool     `json:"supported"`
+	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
+	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
+	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
+	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
+	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
+	Notes              []string `json:"notes,omitempty"`
+}
--- a/audit/internal/platform/error_patterns.go
+++ b/audit/internal/platform/error_patterns.go
@@ -0,0 +1,139 @@
+package platform
+
+import "regexp"
+
+// ErrorPattern describes a kernel log pattern that indicates a hardware error.
+// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
+type ErrorPattern struct {
+	// Name is a short machine-readable label for logging and deduplication.
+	Name string
+	// Re is the compiled regular expression matched against a single kmsg line.
+	Re *regexp.Regexp
+	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
+	Category string
+	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
+	Severity string
+	// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
+	// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
+	BDFGroup int
+	// DevGroup is the capture group index (1-based) that contains a device name
+	// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
+	DevGroup int
+}
+
+// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
+// To add a pattern, append an ErrorPattern; use \b when one keyword is a suffix of another (corrected/uncorrected).
+var HardwareErrorPatterns = []ErrorPattern{
+	// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
+	{
+		Name:     "nvidia-rminitadapter",
+		Re:       mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "nvidia-msi-fail",
+		Re:       mustPat(`(?i)NVRM:.*Failed to enable MSI`),
+		Category: "gpu",
+		Severity: "warning",
+	},
+	{
+		Name:     "nvidia-aer",
+		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "nvidia-xid", // NOTE(review): stock Xid lines read "Xid (PCI:0000:c8:00): ..." with no ".fn" suffix; confirm this regex can match them
+		Re:       mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+
+	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
+	{
+		Name:     "pcie-aer",
+		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "pcie-uncorrectable",
+		Re:       mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "pcie-link-down",
+		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+
+	// ── Storage ─────────────────────────────────────────────────────────────────
+	{
+		Name:     "blk-io-error",
+		Re:       mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
+		Category: "storage",
+		Severity: "warning",
+		DevGroup: 1,
+	},
+	{
+		Name:     "nvme-timeout",
+		Re:       mustPat(`(?i)nvme\s+(\w+):.*timeout`),
+		Category: "storage",
+		Severity: "warning",
+		DevGroup: 1,
+	},
+	{
+		Name:     "scsi-failed",
+		Re:       mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
+		Category: "storage",
+		Severity: "warning",
+	},
+	{
+		Name:     "nvme-reset",
+		Re:       mustPat(`(?i)nvme\s+(\w+):.*reset`),
+		Category: "storage",
+		Severity: "warning",
+		DevGroup: 1,
+	},
+
+	// ── Machine Check Exceptions ────────────────────────────────────────────────
+	{
+		Name:     "mce-hardware-error",
+		Re:       mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
+		Category: "mce",
+		Severity: "warning",
+	},
+	{
+		Name:     "mce-corrected",
+		Re:       mustPat(`(?i)mce:.*\b[Cc]orrected`),
+		Category: "mce",
+		Severity: "warning",
+	},
+
+	// ── Memory ─────────────────────────────────────────────────────────────────
+	{
+		Name:     "edac-ue",
+		Re:       mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
+		Category: "memory",
+		Severity: "warning",
+	},
+	{
+		Name:     "edac-ce",
+		Re:       mustPat(`(?i)EDAC.*\b[Cc]orrectable`),
+		Category: "memory",
+		Severity: "warning",
+	},
+}
+
+func mustPat(s string) *regexp.Regexp { // short alias used by the pattern table; regexp.MustCompile panics on an invalid pattern
+	return regexp.MustCompile(s)
+}
--- a/audit/internal/platform/export.go
+++ b/audit/internal/platform/export.go
@@ -11,8 +11,48 @@ import (

 var exportExecCommand = exec.Command

+func formatMountTargetError(target RemovableTarget, raw string, err error) error {
+	output := strings.TrimSpace(raw) // text printed by the failed mount command
+	exfatTarget := strings.ToLower(strings.TrimSpace(target.FSType)) == "exfat"
+	switch {
+	case exfatTarget && strings.Contains(strings.ToLower(output), "unknown filesystem type 'exfat'"):
+		return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
+	case output == "":
+		return err // nothing was printed, so the raw error is passed through unchanged
+	}
+	return fmt.Errorf("%s: %w", output, err)
+}
+
+// removableTargetReadOnly reports whether an lsblk row describes a target that
+// cannot receive an export: either lsblk flags it RO="1", or its filesystem
+// type (iso9660, squashfs) is read-only by design.
+func removableTargetReadOnly(fields map[string]string) bool {
+	if fields["RO"] == "1" {
+		return true
+	}
+	// FSTYPE is compared case-insensitively and with surrounding spaces ignored.
+	fsType := strings.ToLower(strings.TrimSpace(fields["FSTYPE"]))
+	return fsType == "iso9660" || fsType == "squashfs"
+}
+
+// ensureWritableMountpoint proves the mounted filesystem accepts writes by creating
+// and then deleting a throwaway ".bee-write-test-*" file inside mountpoint.
+func ensureWritableMountpoint(mountpoint string) error {
+	probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
+	if err != nil {
+		return fmt.Errorf("target filesystem is not writable: %w", err)
+	}
+	probePath := probe.Name()
+	if err := probe.Close(); err != nil {
+		_ = os.Remove(probePath) // best-effort cleanup; the close error is the one reported
+		return err
+	}
+	// A failed delete is returned as-is; success yields nil.
+	return os.Remove(probePath)
+}
+
 func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
-	raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
+	raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
 	if err != nil {
 		return nil, err
 	}
@@ -36,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
 				}
 			}
 		}
-		if !removable || fields["FSTYPE"] == "" {
+		if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
 			continue
 		}

@@ -72,7 +112,7 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
 		}
 		if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
 			_ = os.Remove(mountpoint)
-			return string(raw), err
+			return "", formatMountTargetError(target, string(raw), err)
 		}
 		mountedHere = true
 		mounted = true
@@ -95,6 +135,10 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
 		}
 	}()

+	if err := ensureWritableMountpoint(mountpoint); err != nil {
+		return "", err
+	}
+
 	filename := filepath.Base(src)
 	dst = filepath.Join(mountpoint, filename)
 	data, err := os.ReadFile(src)
--- a/audit/internal/platform/export_test.go
+++ b/audit/internal/platform/export_test.go
@@ -4,12 +4,11 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"testing"
 )

 func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
-	t.Parallel()
-
 	tmp := t.TempDir()
 	src := filepath.Join(tmp, "bundle.tar.gz")
 	mountpoint := filepath.Join(tmp, "mnt")
@@ -54,3 +53,60 @@ func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
 		t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
 	}
 }
+
+func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
+	if os.Geteuid() == 0 {
+		t.Skip("root ignores directory permission bits, so a 0555 mountpoint stays writable")
+	}
+	tmp := t.TempDir()
+	src := filepath.Join(tmp, "bundle.tar.gz")
+	mountpoint := filepath.Join(tmp, "mnt")
+	if err := os.MkdirAll(mountpoint, 0755); err != nil {
+		t.Fatalf("mkdir mountpoint: %v", err)
+	}
+	if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
+		t.Fatalf("write src: %v", err)
+	}
+	if err := os.Chmod(mountpoint, 0555); err != nil {
+		t.Fatalf("chmod mountpoint: %v", err)
+	}
+
+	oldExec := exportExecCommand
+	exportExecCommand = func(name string, args ...string) *exec.Cmd {
+		return exec.Command("sh", "-c", "exit 0")
+	}
+	t.Cleanup(func() { exportExecCommand = oldExec })
+
+	s := &System{}
+	_, err := s.ExportFileToTarget(src, RemovableTarget{Device: "/dev/sdb1", Mountpoint: mountpoint})
+	if err == nil {
+		t.Fatal("expected error for non-writable mountpoint")
+	}
+	if !strings.Contains(err.Error(), "target filesystem is not writable") {
+		t.Fatalf("err=%q want writable message", err)
+	}
+}
+
+func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
+	oldExec := exportExecCommand
+	lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
+NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
+	exportExecCommand = func(name string, args ...string) *exec.Cmd { // fake lsblk: prints the canned rows passed in via LSBLK_OUT
+		cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
+		cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
+		return cmd
+	}
+	t.Cleanup(func() { exportExecCommand = oldExec })
+
+	s := &System{}
+	targets, err := s.ListRemovableTargets()
+	if err != nil {
+		t.Fatalf("ListRemovableTargets error: %v", err)
+	}
+	if len(targets) != 1 { // only the writable vfat row should be listed
+		t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
+	}
+	if got := targets[0].Device; got != "/dev/sdb1" {
+		t.Fatalf("device=%q want /dev/sdb1", got)
+	}
+}
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -13,18 +13,21 @@ import (

 // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
 type GPUMetricRow struct {
-	ElapsedSec float64
-	GPUIndex   int
-	TempC      float64
-	UsagePct   float64
-	PowerW     float64
-	ClockMHz   float64
+	Stage       string  `json:"stage,omitempty"`
+	ElapsedSec  float64 `json:"elapsed_sec"`
+	GPUIndex    int     `json:"index"`
+	TempC       float64 `json:"temp_c"`
+	UsagePct    float64 `json:"usage_pct"`
+	MemUsagePct float64 `json:"mem_usage_pct"`
+	PowerW      float64 `json:"power_w"`
+	ClockMHz    float64 `json:"clock_mhz"`
+	MemClockMHz float64 `json:"mem_clock_mhz"`
 }

 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
 func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 	args := []string{
-		"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics",
+		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
 		"--format=csv,noheader,nounits",
 	}
 	if len(gpuIndices) > 0 {
@@ -45,16 +48,18 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			continue
 		}
 		parts := strings.Split(line, ", ")
-		if len(parts) < 5 {
+		if len(parts) < 7 {
 			continue
 		}
 		idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
 		rows = append(rows, GPUMetricRow{
-			GPUIndex: idx,
-			TempC:    parseGPUFloat(parts[1]),
-			UsagePct: parseGPUFloat(parts[2]),
-			PowerW:   parseGPUFloat(parts[3]),
-			ClockMHz: parseGPUFloat(parts[4]),
+			GPUIndex:    idx,
+			TempC:       parseGPUFloat(parts[1]),
+			UsagePct:    parseGPUFloat(parts[2]),
+			MemUsagePct: parseGPUFloat(parts[3]),
+			PowerW:      parseGPUFloat(parts[4]),
+			ClockMHz:    parseGPUFloat(parts[5]),
+			MemClockMHz: parseGPUFloat(parts[6]),
 		})
 	}
 	return rows, nil
@@ -69,17 +74,88 @@ func parseGPUFloat(s string) float64 {
 	return v
 }

+// SampleGPUMetrics is the exported entry point for sampleGPUMetrics: one nvidia-smi run, current metrics for each GPU.
+func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
+	return sampleGPUMetrics(gpuIndices)
+}
+
+// sampleAMDGPUMetrics runs rocm-smi in CSV mode once and returns one row per data line (temp, usage, memory use, power).
+func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
+	out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
+	if err != nil {
+		return nil, err
+	}
+	lines := strings.Split(strings.TrimSpace(string(out)), "\n")
+	if len(lines) < 2 {
+		return nil, fmt.Errorf("rocm-smi: insufficient output")
+	}
+
+	// Locate columns by header substring: colIdx returns the first header containing any keyword, or -1.
+	headers := strings.Split(lines[0], ",")
+	colIdx := func(keywords ...string) int {
+		for i, h := range headers {
+			hl := strings.ToLower(strings.TrimSpace(h))
+			for _, kw := range keywords {
+				if strings.Contains(hl, kw) {
+					return i
+				}
+			}
+		}
+		return -1
+	}
+	idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
+	idxUse := colIdx("gpu use (%)")
+	idxMem := colIdx("vram%", "memory allocated")
+	idxPow := colIdx("average graphics package power", "power (w)")
+
+	var rows []GPUMetricRow
+	for _, line := range lines[1:] {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		parts := strings.Split(line, ",")
+		idx := len(rows) // GPU index is the row's position among non-empty lines, not a value read from the CSV
+		row := GPUMetricRow{GPUIndex: idx}
+		get := func(i int) float64 {
+			if i < 0 || i >= len(parts) {
+				return 0 // column not found in the header, or this row is too short
+			}
+			v := strings.TrimSpace(parts[i])
+			if strings.EqualFold(v, "n/a") {
+				return 0
+			}
+			return parseGPUFloat(v)
+		}
+		row.TempC = get(idxTemp)
+		row.UsagePct = get(idxUse)
+		row.MemUsagePct = get(idxMem)
+		row.PowerW = get(idxPow)
+		rows = append(rows, row)
+	}
+	if len(rows) == 0 {
+		return nil, fmt.Errorf("rocm-smi: no GPU rows parsed")
+	}
+	return rows, nil
+}
+
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
+	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
 	for _, r := range rows {
-		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
-			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
+		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
+			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }

+type gpuMetricStageSpan struct {
+	Name  string
+	Start float64
+	End   float64
+}
+
 // WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
 func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
 	// Group by GPU index preserving order.
@@ -94,9 +170,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
 		gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
 	}

+	stageSpans := buildGPUMetricStageSpans(rows)
+	stageColorByName := make(map[string]string, len(stageSpans))
+	for i, span := range stageSpans {
+		stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
+	}
+
+	var legend strings.Builder
+	if len(stageSpans) > 0 {
+		legend.WriteString(`<div class="stage-legend">`)
+		for _, span := range stageSpans {
+			fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
+				stageColorByName[span.Name], gpuHTMLEscape(span.Name))
+		}
+		legend.WriteString(`</div>`)
+	}
+
 	var svgs strings.Builder
 	for _, gpuIdx := range order {
-		svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
+		svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
 		svgs.WriteString("\n")
 	}

@@ -106,21 +198,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
 <meta charset="utf-8">
 <title>GPU Stress Test Metrics</title>
 <style>
-body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
-h1 { text-align: center; color: #333; margin: 0 0 8px; }
-p  { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
+:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
+*{box-sizing:border-box}
+body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
+.page{padding:24px}
+.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
+.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
+.card-body{padding:16px}
+h1{font-size:22px;margin:0 0 6px}
+p{color:var(--muted);font-size:13px;margin:0 0 16px}
+.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
+.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
+.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
+.chart-block{margin-top:16px}
 </style>
 </head><body>
+<div class="page">
+<div class="card">
+<div class="card-head">GPU Stress Test Metrics</div>
+<div class="card-body">
 <h1>GPU Stress Test Metrics</h1>
 <p>Generated %s</p>
 %s
-</body></html>`, ts, svgs.String())
+<div class="chart-block">%s</div>
+</div>
+</div>
+</div>
+</body></html>`, ts, legend.String(), svgs.String())

 	return os.WriteFile(path, []byte(html), 0644)
 }

 // drawGPUChartSVG generates a self-contained SVG chart for one GPU.
-func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
+func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
 	// Layout
 	const W, H = 960, 520
 	const plotX1 = 120 // usage axis / chart left border
@@ -130,7 +240,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	const PW = plotX2 - plotX1
 	const PH = plotY2 - plotY1
 	// Outer axes
-	const tempAxisX = 60  // temp axis line
+	const tempAxisX = 60   // temp axis line
 	const clockAxisX = 900 // clock axis line

 	colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
@@ -215,6 +325,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	}
 	b.WriteString("</g>\n")

+	// Stage backgrounds
+	for _, span := range stageSpans {
+		x1 := xv(span.Start)
+		x2 := xv(span.End)
+		if x2 < x1 {
+			x1, x2 = x2, x1
+		}
+		if x2-x1 < 1 {
+			x2 = x1 + 1
+		}
+		color := stageColorByName[span.Name]
+		fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
+			x1, plotY1, x2-x1, PH, color)
+		fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
+			x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
+	}
+
 	// Chart border
 	fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
 		` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
@@ -313,224 +440,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	return b.String()
 }

-const (
-	ansiRed    = "\033[31m"
-	ansiBlue   = "\033[34m"
-	ansiGreen  = "\033[32m"
-	ansiYellow = "\033[33m"
-	ansiReset  = "\033[0m"
-)
-
-const (
-	termChartWidth  = 70
-	termChartHeight = 12
-)
-
-// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
-// Suitable for display in the TUI screenOutput.
-func RenderGPUTerminalChart(rows []GPUMetricRow) string {
-	seen := make(map[int]bool)
-	var order []int
-	gpuMap := make(map[int][]GPUMetricRow)
-	for _, r := range rows {
-		if !seen[r.GPUIndex] {
-			seen[r.GPUIndex] = true
-			order = append(order, r.GPUIndex)
-		}
-		gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
-	}
-
-	type seriesDef struct {
-		caption string
-		color   string
-		fn      func(GPUMetricRow) float64
-	}
-	defs := []seriesDef{
-		{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
-		{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
-		{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
-		{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
-	}
-
-	var b strings.Builder
-	for _, gpuIdx := range order {
-		gr := gpuMap[gpuIdx]
-		if len(gr) == 0 {
-			continue
-		}
-		tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
-		fmt.Fprintf(&b, "GPU %d — Stress Test Metrics  (%.0f seconds)\n\n", gpuIdx, tMax)
-		for _, d := range defs {
-			b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
-				termChartHeight, termChartWidth))
-			b.WriteRune('\n')
-		}
-	}
-
-	return strings.TrimRight(b.String(), "\n")
-}
-
-// renderLineChart draws a single time-series line chart using box-drawing characters.
-// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
-func renderLineChart(vals []float64, color, caption string, height, width int) string {
-	if len(vals) == 0 {
-		return caption + "\n"
-	}
-
-	mn, mx := gpuMinMax(vals)
-	if mn == mx {
-		mx = mn + 1
-	}
-
-	// Use the smaller of width or len(vals) to avoid stretching sparse data.
-	w := width
-	if len(vals) < w {
-		w = len(vals)
-	}
-	data := gpuDownsample(vals, w)
-
-	// row[i] = display row index: 0 = top = max value, height = bottom = min value.
-	row := make([]int, w)
-	for i, v := range data {
-		r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
-		if r < 0 {
-			r = 0
-		}
-		if r > height {
-			r = height
-		}
-		row[i] = r
-	}
-
-	// Fill the character grid.
-	grid := make([][]rune, height+1)
-	for i := range grid {
-		grid[i] = make([]rune, w)
-		for j := range grid[i] {
-			grid[i][j] = ' '
-		}
-	}
-	for x := 0; x < w; x++ {
-		r := row[x]
-		if x == 0 {
-			grid[r][0] = '─'
-			continue
-		}
-		p := row[x-1]
-		switch {
-		case r == p:
-			grid[r][x] = '─'
-		case r < p: // value went up (row index decreased toward top)
-			grid[r][x] = '╭'
-			grid[p][x] = '╯'
-			for y := r + 1; y < p; y++ {
-				grid[y][x] = '│'
-			}
-		default: // r > p, value went down
-			grid[p][x] = '╮'
-			grid[r][x] = '╰'
-			for y := p + 1; y < r; y++ {
-				grid[y][x] = '│'
-			}
-		}
-	}
-
-	// Y axis tick labels.
-	ticks := gpuNiceTicks(mn, mx, height/2)
-	tickAtRow := make(map[int]string)
-	labelWidth := 4
-	for _, t := range ticks {
-		r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
-		if r < 0 || r > height {
-			continue
-		}
-		s := gpuFormatTick(t)
-		tickAtRow[r] = s
-		if len(s) > labelWidth {
-			labelWidth = len(s)
-		}
-	}
-
-	var b strings.Builder
-	for r := 0; r <= height; r++ {
-		label := tickAtRow[r]
-		fmt.Fprintf(&b, "%*s", labelWidth, label)
-		switch {
-		case label != "":
-			b.WriteRune('┤')
-		case r == height:
-			b.WriteRune('┼')
-		default:
-			b.WriteRune('│')
-		}
-		b.WriteString(color)
-		b.WriteString(string(grid[r]))
-		b.WriteString(ansiReset)
-		b.WriteRune('\n')
-	}
-
-	// Bottom axis.
-	b.WriteString(strings.Repeat(" ", labelWidth))
-	b.WriteRune('└')
-	b.WriteString(strings.Repeat("─", w))
-	b.WriteRune('\n')
-
-	// Caption centered under the chart.
-	if caption != "" {
-		total := labelWidth + 1 + w
-		if pad := (total - len(caption)) / 2; pad > 0 {
-			b.WriteString(strings.Repeat(" ", pad))
-		}
-		b.WriteString(caption)
-		b.WriteRune('\n')
-	}
-
-	return b.String()
-}
-
-func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
-	v := make([]float64, len(rows))
-	for i, r := range rows {
-		v[i] = fn(r)
-	}
-	return v
-}
-
-// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
-func gpuDownsample(vals []float64, w int) []float64 {
-	n := len(vals)
-	if n == 0 {
-		return make([]float64, w)
-	}
-	result := make([]float64, w)
-	if n >= w {
-		counts := make([]int, w)
-		for i, v := range vals {
-			bucket := i * w / n
-			if bucket >= w {
-				bucket = w - 1
-			}
-			result[bucket] += v
-			counts[bucket]++
-		}
-		for i := range result {
-			if counts[i] > 0 {
-				result[i] /= float64(counts[i])
-			}
-		}
-	} else {
-		// Nearest-neighbour upsample.
-		for i := range result {
-			src := i * (n - 1) / (w - 1)
-			if src >= n {
-				src = n - 1
-			}
-			result[i] = vals[src]
-		}
-	}
-	return result
-}
-
 func gpuMinMax(vals []float64) (float64, float64) {
 	if len(vals) == 0 {
 		return 0, 1
@@ -575,3 +484,46 @@ func gpuFormatTick(v float64) string {
 	}
 	return strconv.FormatFloat(v, 'f', 1, 64)
 }
+
+// gpuMetricStagePalette is the ordered list of hex colors offered for
+// benchmark stage markers.
+// NOTE(review): presumably consumed when building the stage-name → color map
+// handed to drawGPUChartSVG; the assignment code is not visible here — confirm
+// how the palette is indexed once there are more stages than colors.
+var gpuMetricStagePalette = []string{
+	"#d95c5c",
+	"#2185d0",
+	"#21ba45",
+	"#f2c037",
+	"#6435c9",
+	"#00b5ad",
+	"#a5673f",
+}
+
+// buildGPUMetricStageSpans collapses consecutive rows that share a stage name
+// into spans covering the elapsed-time range of that run of rows. Rows with a
+// blank stage are grouped under "run". Every span is at least one second wide
+// so that single-sample stages remain visible.
+func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
+	var result []gpuMetricStageSpan
+	for idx := range rows {
+		stage := strings.TrimSpace(rows[idx].Stage)
+		if stage == "" {
+			stage = "run"
+		}
+		elapsed := rows[idx].ElapsedSec
+		if last := len(result) - 1; last >= 0 && result[last].Name == stage {
+			// Same stage as the previous row: extend the open span.
+			result[last].End = elapsed
+		} else {
+			result = append(result, gpuMetricStageSpan{Name: stage, Start: elapsed, End: elapsed})
+		}
+	}
+	for idx := range result {
+		span := &result[idx]
+		if span.End <= span.Start {
+			span.End = span.Start + 1
+		}
+	}
+	return result
+}
+
+// gpuHTMLReplacer maps the five HTML-significant characters to their entities.
+var gpuHTMLReplacer = strings.NewReplacer(
+	"&", "&amp;", "<", "&lt;", ">", "&gt;", `"`, "&quot;", "'", "&#39;",
+)
+
+// gpuHTMLEscape returns s with &, <, >, double and single quotes replaced by
+// HTML entities so it can be embedded in generated HTML/SVG markup.
+func gpuHTMLEscape(s string) string {
+	escaped := gpuHTMLReplacer.Replace(s)
+	return escaped
+}
--- a/audit/internal/platform/gpu_metrics_test.go
+++ b/audit/internal/platform/gpu_metrics_test.go
@@ -0,0 +1,65 @@
+package platform
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+// TestWriteGPUMetricsCSVIncludesStageColumn writes a single row and checks
+// that both the header and the data line carry the stage column first.
+func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
+	t.Parallel()
+
+	csvPath := filepath.Join(t.TempDir(), "gpu-metrics.csv")
+	sample := GPUMetricRow{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200}
+	if err := WriteGPUMetricsCSV(csvPath, []GPUMetricRow{sample}); err != nil {
+		t.Fatalf("WriteGPUMetricsCSV: %v", err)
+	}
+	raw, err := os.ReadFile(csvPath)
+	if err != nil {
+		t.Fatalf("ReadFile: %v", err)
+	}
+	text := string(raw)
+	wanted := []string{
+		"stage,elapsed_sec,gpu_index",
+		`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
+	}
+	for _, needle := range wanted {
+		if strings.Contains(text, needle) {
+			continue
+		}
+		t.Fatalf("csv missing %q\n%s", needle, text)
+	}
+}
+
+// TestWriteGPUMetricsHTMLShowsStageLegendAndLabels renders two stages and
+// checks that the legend container, both stage names and the page title all
+// appear in the generated HTML.
+func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
+	t.Parallel()
+
+	htmlPath := filepath.Join(t.TempDir(), "gpu-metrics.html")
+	samples := []GPUMetricRow{
+		{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
+		{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
+		{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
+		{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
+	}
+	if err := WriteGPUMetricsHTML(htmlPath, samples); err != nil {
+		t.Fatalf("WriteGPUMetricsHTML: %v", err)
+	}
+	raw, err := os.ReadFile(htmlPath)
+	if err != nil {
+		t.Fatalf("ReadFile: %v", err)
+	}
+	text := string(raw)
+	expected := [...]string{"stage-legend", "baseline", "steady-fp16", "GPU Stress Test Metrics"}
+	for _, needle := range expected {
+		if strings.Contains(text, needle) {
+			continue
+		}
+		t.Fatalf("html missing %q\n%s", needle, text)
+	}
+}
--- a/audit/internal/platform/install.go
+++ b/audit/internal/platform/install.go
@@ -0,0 +1,269 @@
+package platform
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+)
+
+// InstallDisk describes a candidate disk for installation, as assembled by
+// ListInstallDisks from lsblk output. Size is the display string reported by
+// lsblk, while SizeBytes is the numeric size used for the minimum-size check
+// in DiskWarnings (0 when it could not be determined).
+type InstallDisk struct {
+	Device       string // e.g. /dev/sda
+	Model        string
+	Size         string   // human-readable, e.g. "500G"
+	SizeBytes    int64    // raw byte count from lsblk
+	MountedParts []string // partition mount points currently active
+}
+
+// squashfsPath is the live image's root squashfs on the boot medium; its size
+// drives MinInstallBytes and the low-RAM check in DiskWarnings.
+const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
+
+// ListInstallDisks returns block devices suitable for installation.
+// Excludes the current live boot medium but includes USB drives.
+//
+// lsblk is queried in KEY="value" pair mode (-P). Splitting the plain column
+// output on whitespace and counting fields from the end silently dropped any
+// disk whose TRAN or MODEL column was empty, because the remaining fields
+// shifted position; pair mode keeps every column addressable by name and
+// tolerates model names that contain spaces.
+//
+// Returns an error only when lsblk itself cannot be run; per-disk size and
+// mount lookups are best-effort.
+func (s *System) ListInstallDisks() ([]InstallDisk, error) {
+	out, err := exec.Command("lsblk", "-dn", "-P", "-o", "NAME,MODEL,SIZE,TYPE").Output()
+	if err != nil {
+		return nil, fmt.Errorf("lsblk: %w", err)
+	}
+
+	bootDev := findLiveBootDevice()
+
+	var disks []InstallDisk
+	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
+		cols := parseLsblkPairs(line)
+		name := cols["NAME"]
+		if name == "" || cols["TYPE"] != "disk" {
+			continue
+		}
+
+		device := "/dev/" + name
+		if device == bootDev {
+			// Never offer the medium we are currently running from.
+			continue
+		}
+
+		disks = append(disks, InstallDisk{
+			Device:       device,
+			Model:        strings.TrimSpace(cols["MODEL"]),
+			Size:         cols["SIZE"],
+			SizeBytes:    diskSizeBytes(device),
+			MountedParts: mountedParts(device),
+		})
+	}
+	return disks, nil
+}
+
+// parseLsblkPairs parses one line of `lsblk -P` output (KEY="value" pairs
+// separated by spaces) into a map. Parsing stops at the first malformed pair;
+// values are returned verbatim, without decoding any \xNN escapes lsblk may
+// emit for special characters.
+func parseLsblkPairs(line string) map[string]string {
+	cols := make(map[string]string)
+	rest := strings.TrimSpace(line)
+	for rest != "" {
+		key, afterKey, ok := strings.Cut(rest, `="`)
+		if !ok {
+			break
+		}
+		val, tail, ok := strings.Cut(afterKey, `"`)
+		if !ok {
+			break
+		}
+		cols[strings.TrimSpace(key)] = val
+		rest = strings.TrimSpace(tail)
+	}
+	return cols
+}
+
+// diskSizeBytes asks lsblk for the size of device in bytes. It returns 0 when
+// lsblk fails; a value that does not parse is passed through as whatever
+// strconv.ParseInt produced.
+func diskSizeBytes(device string) int64 {
+	raw, err := exec.Command("lsblk", "-bdn", "-o", "SIZE", device).Output()
+	if err != nil {
+		return 0
+	}
+	trimmed := strings.TrimSpace(string(raw))
+	size, _ := strconv.ParseInt(trimmed, 10, 64)
+	return size
+}
+
+// mountedParts returns a list of "<part> at <mountpoint>" strings for any
+// mounted partitions on the given device. Swap areas are not reported.
+//
+// lsblk runs in list mode with full device paths (-l -p). The default tree
+// output prefixes nested children (LVM, LUKS, RAID members) with "│ └─"
+// drawing characters that split into extra fields and were misread as mount
+// points, and prepending "/dev/" to a bare name is wrong for device-mapper
+// nodes. Returns nil when lsblk fails.
+func mountedParts(device string) []string {
+	out, err := exec.Command("lsblk", "-nlp", "-o", "NAME,MOUNTPOINT", device).Output()
+	if err != nil {
+		return nil
+	}
+	var result []string
+	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
+		fields := strings.Fields(line)
+		if len(fields) < 2 {
+			// Name only: this node is not mounted.
+			continue
+		}
+		// Re-join so a mount point containing spaces is not truncated.
+		mp := strings.Join(fields[1:], " ")
+		if mp == "[SWAP]" {
+			continue
+		}
+		result = append(result, fields[0]+" at "+mp)
+	}
+	return result
+}
+
+// findLiveBootDevice returns the whole-disk device backing /run/live/medium,
+// or "" when nothing is mounted there. When the parent disk cannot be
+// resolved the mount source itself is returned.
+func findLiveBootDevice() string {
+	srcOut, err := exec.Command("findmnt", "-n", "-o", "SOURCE", "/run/live/medium").Output()
+	if err != nil {
+		return ""
+	}
+	source := strings.TrimSpace(string(srcOut))
+	if source == "" {
+		return ""
+	}
+	// PKNAME is the parent kernel device name,
+	// e.g. /dev/sdb1 → sdb, /dev/nvme0n1p1 → nvme0n1.
+	parentOut, err := exec.Command("lsblk", "-no", "PKNAME", source).Output()
+	if err != nil {
+		return source
+	}
+	if parent := strings.TrimSpace(string(parentOut)); parent != "" {
+		return "/dev/" + parent
+	}
+	return source
+}
+
+// mountSource returns findmnt's SOURCE column for target, or "" when findmnt
+// fails (for example because target is not a mount point).
+func mountSource(target string) string {
+	raw, err := exec.Command("findmnt", "-n", "-o", "SOURCE", target).Output()
+	if err == nil {
+		return strings.TrimSpace(string(raw))
+	}
+	return ""
+}
+
+// mountFSType returns findmnt's FSTYPE column for target, or "" when findmnt
+// fails (for example because target is not a mount point).
+func mountFSType(target string) string {
+	raw, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", target).Output()
+	if err == nil {
+		return strings.TrimSpace(string(raw))
+	}
+	return ""
+}
+
+// blockDeviceType returns lsblk's TYPE column for device (e.g. "disk", "rom"),
+// or "" for a blank device or when lsblk fails.
+func blockDeviceType(device string) string {
+	if len(strings.TrimSpace(device)) == 0 {
+		return ""
+	}
+	raw, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
+	if err == nil {
+		return strings.TrimSpace(string(raw))
+	}
+	return ""
+}
+
+// blockDeviceTransport returns lsblk's TRAN column for device (e.g. "usb"),
+// or "" for a blank device or when lsblk fails.
+func blockDeviceTransport(device string) string {
+	if len(strings.TrimSpace(device)) == 0 {
+		return ""
+	}
+	raw, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
+	if err == nil {
+		return strings.TrimSpace(string(raw))
+	}
+	return ""
+}
+
+// inferLiveBootKind classifies the live boot medium as "ram", "cdrom", "usb",
+// "disk" or "unknown". Checks run in priority order: a tmpfs filesystem wins,
+// then a "rom" device type, then a "usb" transport, then the source path
+// prefix (/dev/sr* is an optical drive, any other /dev/ node is a disk).
+// Inputs are trimmed; filesystem, device type and transport compare
+// case-insensitively.
+func inferLiveBootKind(fsType, source, deviceType, transport string) string {
+	src := strings.TrimSpace(source)
+	if strings.EqualFold(strings.TrimSpace(fsType), "tmpfs") {
+		return "ram"
+	}
+	if strings.EqualFold(strings.TrimSpace(deviceType), "rom") {
+		return "cdrom"
+	}
+	if strings.EqualFold(strings.TrimSpace(transport), "usb") {
+		return "usb"
+	}
+	if strings.HasPrefix(src, "/dev/sr") {
+		return "cdrom"
+	}
+	if strings.HasPrefix(src, "/dev/") {
+		return "disk"
+	}
+	return "unknown"
+}
+
+// MinInstallBytes returns the minimum recommended disk size for installation:
+// one and a half times the squashfs size, leaving room for the extracted
+// filesystem and bootloader. It returns 0 when the squashfs cannot be stat'ed
+// (non-live environment).
+func MinInstallBytes() int64 {
+	info, err := os.Stat(squashfsPath)
+	if err == nil {
+		return info.Size() * 3 / 2
+	}
+	return 0
+}
+
+// toramActive reports whether the kernel command line contains "toram",
+// i.e. the live system was booted with the toram option. An unreadable
+// /proc/cmdline counts as false.
+func toramActive() bool {
+	cmdline, err := os.ReadFile("/proc/cmdline")
+	return err == nil && strings.Contains(string(cmdline), "toram")
+}
+
+// freeMemBytes returns the MemAvailable value from /proc/meminfo in bytes, or
+// 0 when the file cannot be read or carries no usable MemAvailable line.
+func freeMemBytes() int64 {
+	raw, err := os.ReadFile("/proc/meminfo")
+	if err != nil {
+		return 0
+	}
+	for _, entry := range strings.Split(string(raw), "\n") {
+		if !strings.HasPrefix(entry, "MemAvailable:") {
+			continue
+		}
+		parts := strings.Fields(entry)
+		if len(parts) < 2 {
+			continue
+		}
+		kb, _ := strconv.ParseInt(parts[1], 10, 64)
+		return kb * 1024 // kB → bytes
+	}
+	return 0
+}
+
+// DiskWarnings returns advisory warning strings for a disk candidate:
+// mounted partitions, a disk smaller than MinInstallBytes, and — when booted
+// with toram — available RAM below twice the squashfs size.
+func DiskWarnings(d InstallDisk) []string {
+	var warnings []string
+	if len(d.MountedParts) > 0 {
+		warnings = append(warnings, "has mounted partitions: "+strings.Join(d.MountedParts, ", "))
+	}
+	if need := MinInstallBytes(); need > 0 && d.SizeBytes > 0 && d.SizeBytes < need {
+		warnings = append(warnings, fmt.Sprintf("disk may be too small (need ≥ %s, have %s)",
+			humanBytes(need), humanBytes(d.SizeBytes)))
+	}
+	if !toramActive() {
+		return warnings
+	}
+	info, err := os.Stat(squashfsPath)
+	if err != nil {
+		return warnings
+	}
+	if avail := freeMemBytes(); avail > 0 && avail < info.Size()*2 {
+		warnings = append(warnings, "toram mode — low RAM, extraction may be slow or fail")
+	}
+	return warnings
+}
+
+// humanBytes formats a byte count using binary (1024-based) steps with one
+// decimal place, e.g. "1.5 KB". Values below 1024 are printed as plain bytes.
+func humanBytes(b int64) string {
+	const unit = 1024
+	if b < unit {
+		return fmt.Sprintf("%d B", b)
+	}
+	const suffixes = "KMGTPE"
+	divisor := int64(unit)
+	idx := 0
+	// Grow the divisor until the quotient fits below one unit.
+	for b/divisor >= unit {
+		divisor *= unit
+		idx++
+	}
+	return fmt.Sprintf("%.1f %cB", float64(b)/float64(divisor), suffixes[idx])
+}
+
+// InstallToDisk runs `bee-install <device> <logfile>` and waits for it to
+// finish. The log path is handed to bee-install as its second argument;
+// cancelling ctx terminates the command.
+func (s *System) InstallToDisk(ctx context.Context, device string, logFile string) error {
+	return exec.CommandContext(ctx, "bee-install", device, logFile).Run()
+}
+
+// InstallLogPath returns the default install log path for a given device.
+// Slashes and spaces in the device name become underscores, so "/dev/sda"
+// maps to "/tmp/bee-install_dev_sda.log".
+func InstallLogPath(device string) string {
+	sanitized := strings.ReplaceAll(device, "/", "_")
+	sanitized = strings.ReplaceAll(sanitized, " ", "_")
+	return fmt.Sprintf("/tmp/bee-install%s.log", sanitized)
+}
+
+// Label returns a display label for a disk in the form
+// "<device>  <size>  <model>", using "Unknown" when the model is empty.
+func (d InstallDisk) Label() string {
+	name := "Unknown"
+	if d.Model != "" {
+		name = d.Model
+	}
+	return fmt.Sprintf("%s  %s  %s", d.Device, d.Size, name)
+}
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -0,0 +1,309 @@
+package platform
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+)
+
+// IsLiveMediaInRAM reports whether the live system can be considered to run
+// from RAM: /run/live/medium is tmpfs, or there is no medium mount and the
+// kernel was booted with toram, or a squashfs copy exists in
+// /dev/shm/bee-live.
+func (s *System) IsLiveMediaInRAM() bool {
+	fsType := mountFSType("/run/live/medium")
+	if strings.EqualFold(fsType, "tmpfs") {
+		return true
+	}
+	if fsType == "" {
+		// No medium mount at all — fall back to toram kernel parameter.
+		return toramActive()
+	}
+	// RunInstallToRAM may have copied the squashfs to /dev/shm/bee-live while
+	// the bind mount over /run/live/medium failed (common for CD-ROM boots),
+	// leaving the original filesystem type visible. Accept the RAM copy.
+	ramCopies, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
+	return len(ramCopies) > 0
+}
+
+// LiveBootSource describes where the live medium is currently served from,
+// combining the /run/live/medium filesystem type and mount source with the
+// backing block device. When none of these can be determined the result is
+// "ram" if the kernel was booted with toram and "unknown" otherwise.
+func (s *System) LiveBootSource() LiveBootSource {
+	const medium = "/run/live/medium"
+	fsType := mountFSType(medium)
+	source := mountSource(medium)
+	device := findLiveBootDevice()
+
+	result := LiveBootSource{
+		InRAM:  strings.EqualFold(fsType, "tmpfs"),
+		Source: source,
+		Device: device,
+	}
+
+	nothingKnown := fsType == "" && source == "" && device == ""
+	switch {
+	case nothingKnown && toramActive():
+		result.InRAM = true
+		result.Kind = "ram"
+		result.Source = "tmpfs"
+		return result
+	case nothingKnown:
+		result.Kind = "unknown"
+		return result
+	}
+
+	result.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
+	if result.Kind == "" {
+		result.Kind = "unknown"
+	}
+	if result.InRAM && strings.TrimSpace(result.Source) == "" {
+		result.Source = "tmpfs"
+	}
+	return result
+}
+
+// RunInstallToRAM copies the live medium into /dev/shm/bee-live so that the
+// boot medium can be disconnected. It copies every squashfs under
+// /run/live/medium/live, tries to re-point the matching loop devices at the
+// copies, copies the remaining medium files, bind-mounts the copy over
+// /run/live/medium and finally verifies the outcome.
+//
+// Progress and warnings go to logFunc, which may be nil. Underlying errors
+// are wrapped with %w so callers can detect cancellation with
+// errors.Is(err, context.Canceled) even when it surfaces inside a file copy.
+func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
+	log := func(msg string) {
+		if logFunc != nil {
+			logFunc(msg)
+		}
+	}
+
+	if s.IsLiveMediaInRAM() {
+		log("Already running from RAM — installation media can be safely disconnected.")
+		return nil
+	}
+
+	squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
+	if err != nil || len(squashfsFiles) == 0 {
+		return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
+	}
+
+	// Refuse to start when the images plus headroom exceed available RAM.
+	free := freeMemBytes()
+	var needed int64
+	for _, sf := range squashfsFiles {
+		fi, err2 := os.Stat(sf)
+		if err2 != nil {
+			return fmt.Errorf("stat %s: %w", sf, err2)
+		}
+		needed += fi.Size()
+	}
+	const headroom = 256 * 1024 * 1024
+	if free > 0 && needed+headroom > free {
+		return fmt.Errorf("insufficient RAM: need %s, available %s",
+			humanBytes(needed+headroom), humanBytes(free))
+	}
+
+	dstDir := "/dev/shm/bee-live"
+	if err := os.MkdirAll(dstDir, 0755); err != nil {
+		return fmt.Errorf("create tmpfs dir: %w", err)
+	}
+
+	for _, sf := range squashfsFiles {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		base := filepath.Base(sf)
+		dst := filepath.Join(dstDir, base)
+		log(fmt.Sprintf("Copying %s to RAM...", base))
+		if err := copyFileLarge(ctx, sf, dst, log); err != nil {
+			return fmt.Errorf("copy %s: %w", base, err)
+		}
+		log(fmt.Sprintf("Copied %s.", base))
+
+		// Re-association is best-effort: failures are logged, not returned.
+		loopDev, err := findLoopForFile(sf)
+		if err != nil {
+			log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, err))
+			continue
+		}
+		if err := reassociateLoopDevice(loopDev, dst); err != nil {
+			log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, err))
+		} else {
+			log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
+		}
+	}
+
+	log("Copying remaining medium files...")
+	if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
+		log(fmt.Sprintf("Warning: partial copy: %v", err))
+	}
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+
+	mediumRebound := false
+	if err := bindMount(dstDir, "/run/live/medium"); err != nil {
+		log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
+	} else {
+		mediumRebound = true
+	}
+
+	log("Verifying live medium now served from RAM...")
+	status := s.LiveBootSource()
+	if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
+		return err
+	}
+	if status.InRAM {
+		log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
+	}
+	log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
+	return nil
+}
+
+// verifyInstallToRAMStatus decides whether an install-to-RAM run succeeded.
+// It returns nil when the medium reports as RAM-backed, or when at least one
+// squashfs copy exists in dstDir; otherwise it returns an error naming the
+// source the medium is still mounted from.
+//
+// log receives explanatory notes and may be nil.
+func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
+	if status.InRAM {
+		return nil
+	}
+	if log == nil {
+		// Callers (including tests) may pass nil; calling it would panic.
+		log = func(string) {}
+	}
+
+	// The live medium mount was not redirected to RAM. This is expected when
+	// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
+	// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
+	// because the CD-ROM mount is in use. Check whether files were at least
+	// copied to the tmpfs directory — that is sufficient for safe disconnection
+	// once the kernel has paged in all actively-used data.
+	files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
+	if len(files) > 0 {
+		if !mediumRebound {
+			log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
+			log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
+		}
+		return nil
+	}
+
+	return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
+}
+
+// describeLiveBootSource renders a short human-readable description of the
+// boot source: "RAM", "<label> (<device or source>)" for usb/cdrom/disk, or
+// just the device/source for any other kind. The device is preferred over the
+// mount source; "unknown source" is used when both are blank.
+func describeLiveBootSource(status LiveBootSource) string {
+	origin := strings.TrimSpace(status.Device)
+	if origin == "" {
+		origin = strings.TrimSpace(status.Source)
+	}
+	if origin == "" {
+		origin = "unknown source"
+	}
+
+	kind := strings.TrimSpace(status.Kind)
+	if kind == "ram" {
+		return "RAM"
+	}
+	labels := map[string]string{"usb": "USB", "cdrom": "CD-ROM", "disk": "disk"}
+	if label, ok := labels[kind]; ok {
+		return label + " (" + origin + ")"
+	}
+	return origin
+}
+
+// copyFileLarge copies src to dst in 4 MiB chunks, checking ctx between
+// chunks and reporting progress through logFunc (which may be nil).
+//
+// The data is written to "<dst>.partial" and renamed onto dst only after a
+// successful sync and close. A failed or cancelled copy therefore never leaves
+// a truncated file at dst, which matters because the mere presence of a
+// *.squashfs file in the RAM directory is treated as "copy complete"
+// elsewhere in this package. The temporary file is removed on any error.
+func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+	fi, err := in.Stat()
+	if err != nil {
+		return err
+	}
+
+	partial := dst + ".partial"
+	out, err := os.Create(partial)
+	if err != nil {
+		return err
+	}
+	err = copyWithProgress(ctx, in, out, fi.Size(), logFunc)
+	if err == nil {
+		err = out.Sync()
+	}
+	// Always close, but let an earlier error take precedence.
+	if cerr := out.Close(); err == nil {
+		err = cerr
+	}
+	if err == nil {
+		err = os.Rename(partial, dst)
+	}
+	if err != nil {
+		_ = os.Remove(partial) // best effort; the original error is what matters
+		return err
+	}
+	return nil
+}
+
+// copyWithProgress streams in to out until EOF, aborting when ctx is done and
+// logging a running byte count and percentage after every chunk when logFunc
+// is set and total is known.
+func copyWithProgress(ctx context.Context, in io.Reader, out io.Writer, total int64, logFunc func(string)) error {
+	var copied int64
+	buf := make([]byte, 4*1024*1024)
+	for {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		n, err := in.Read(buf)
+		if n > 0 {
+			if _, werr := out.Write(buf[:n]); werr != nil {
+				return werr
+			}
+			copied += int64(n)
+			if logFunc != nil && total > 0 {
+				pct := int(float64(copied) / float64(total) * 100)
+				logFunc(fmt.Sprintf("  %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
+			}
+		}
+		if err == io.EOF {
+			return nil
+		}
+		if err != nil {
+			return err
+		}
+	}
+}
+
+// cpDir mirrors the tree under src into dst. Directories are created with the
+// source mode, *.squashfs files are skipped, files already present at the
+// destination are left untouched, and unreadable entries are ignored. The
+// walk stops with the context error once ctx is done.
+func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
+	visit := func(path string, info os.FileInfo, walkErr error) error {
+		if cancelErr := ctx.Err(); cancelErr != nil {
+			return cancelErr
+		}
+		if walkErr != nil {
+			// Best effort: skip whatever cannot be read.
+			return nil
+		}
+		rel, _ := filepath.Rel(src, path)
+		target := filepath.Join(dst, rel)
+		switch {
+		case info.IsDir():
+			return os.MkdirAll(target, info.Mode())
+		case strings.HasSuffix(path, ".squashfs"):
+			return nil
+		}
+		if _, statErr := os.Stat(target); statErr == nil {
+			return nil
+		}
+		return copyFileLarge(ctx, path, target, nil)
+	}
+	return filepath.Walk(src, visit)
+}
+
+// findLoopForFile returns the name of the loop device whose backing file is
+// exactly backingFile, based on `losetup --list --json`. It returns an error
+// when losetup fails, its output cannot be decoded, or no device matches.
+func findLoopForFile(backingFile string) (string, error) {
+	raw, err := exec.Command("losetup", "--list", "--json").Output()
+	if err != nil {
+		return "", err
+	}
+	type loopEntry struct {
+		Name     string `json:"name"`
+		BackFile string `json:"back-file"`
+	}
+	var listing struct {
+		Loopdevices []loopEntry `json:"loopdevices"`
+	}
+	if err := json.Unmarshal(raw, &listing); err != nil {
+		return "", err
+	}
+	for i := range listing.Loopdevices {
+		if entry := listing.Loopdevices[i]; entry.BackFile == backingFile {
+			return entry.Name, nil
+		}
+	}
+	return "", fmt.Errorf("no loop device found for %s", backingFile)
+}
+
+// loopDeviceOffset returns the byte offset configured for the loop device as
+// reported by `losetup --json`, or -1 when it cannot be determined.
+func loopDeviceOffset(loopDev string) int64 {
+	raw, err := exec.Command("losetup", "--json", loopDev).Output()
+	if err != nil {
+		return -1
+	}
+	var parsed struct {
+		Loopdevices []struct {
+			Offset int64 `json:"offset"`
+		} `json:"loopdevices"`
+	}
+	if json.Unmarshal(raw, &parsed) != nil {
+		return -1
+	}
+	if len(parsed.Loopdevices) == 0 {
+		return -1
+	}
+	return parsed.Loopdevices[0].Offset
+}
+
+// reassociateLoopDevice points loopDev at newFile. Devices with a non-zero
+// offset are rejected up front with an explanatory error; otherwise
+// `losetup --replace` is tried first and the LOOP_CHANGE_FD ioctl serves as
+// the fallback.
+func reassociateLoopDevice(loopDev, newFile string) error {
+	// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
+	// typically set up with a non-zero offset (squashfs lives inside the ISO),
+	// so the ioctl returns EINVAL. Detect this early for a clear error message.
+	offset := loopDeviceOffset(loopDev)
+	if offset > 0 {
+		return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", offset)
+	}
+	replaceErr := exec.Command("losetup", "--replace", loopDev, newFile).Run()
+	if replaceErr != nil {
+		return loopChangeFD(loopDev, newFile)
+	}
+	return nil
+}
--- a/audit/internal/platform/install_to_ram_linux.go
+++ b/audit/internal/platform/install_to_ram_linux.go
@@ -0,0 +1,33 @@
+//go:build linux
+
+package platform
+
+import (
+	"os"
+	"syscall"
+)
+
+// ioctlLoopChangeFD is the LOOP_CHANGE_FD request number from <linux/loop.h>.
+// The loop ioctls are numbered sequentially from LOOP_SET_FD (0x4C00):
+// LOOP_CHANGE_FD is 0x4C06, whereas 0x4C08 is LOOP_SET_DIRECT_IO — issuing
+// that one with a file descriptor as argument merely toggles direct I/O and
+// reports success without replacing the backing file.
+const ioctlLoopChangeFD = 0x4C06
+
+// loopChangeFD swaps the backing file of loopDev for newFile using the
+// LOOP_CHANGE_FD ioctl. It returns the open error for either path, or the
+// errno reported by the ioctl.
+func loopChangeFD(loopDev, newFile string) error {
+	lf, err := os.OpenFile(loopDev, os.O_RDWR, 0)
+	if err != nil {
+		return err
+	}
+	defer lf.Close()
+	nf, err := os.OpenFile(newFile, os.O_RDONLY, 0)
+	if err != nil {
+		return err
+	}
+	defer nf.Close()
+	// The ioctl argument is the descriptor of the new backing file.
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, lf.Fd(), ioctlLoopChangeFD, nf.Fd())
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// bindMount bind-mounts src over dst through the mount(2) syscall rather than
+// the mount binary, which avoids depending on PATH lookups.
+func bindMount(src, dst string) error {
+	const fsType, data = "", ""
+	return syscall.Mount(src, dst, fsType, syscall.MS_BIND, data)
+}
--- a/audit/internal/platform/install_to_ram_other.go
+++ b/audit/internal/platform/install_to_ram_other.go
@@ -0,0 +1,13 @@
+//go:build !linux
+
+package platform
+
+import "errors"
+
+// loopChangeFD is the non-Linux stub; it always reports that the ioctl is
+// unavailable.
+func loopChangeFD(loopDev, newFile string) error {
+	unsupported := errors.New("LOOP_CHANGE_FD not available on this platform")
+	return unsupported
+}
+
+// bindMount is the non-Linux stub; it always reports that bind mounts are
+// unavailable.
+func bindMount(src, dst string) error {
+	unsupported := errors.New("bind mount not available on this platform")
+	return unsupported
+}
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -0,0 +1,60 @@
+package platform
+
+import "testing"
+
+// TestInferLiveBootKind covers one representative input per boot-medium kind.
+func TestInferLiveBootKind(t *testing.T) {
+	t.Parallel()
+
+	type scenario struct {
+		name       string
+		fsType     string
+		source     string
+		deviceType string
+		transport  string
+		want       string
+	}
+	scenarios := []scenario{
+		{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
+		{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
+		{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
+		{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
+		{name: "unknown", source: "overlay", want: "unknown"},
+	}
+	for _, sc := range scenarios {
+		sc := sc
+		t.Run(sc.name, func(t *testing.T) {
+			got := inferLiveBootKind(sc.fsType, sc.source, sc.deviceType, sc.transport)
+			if got == sc.want {
+				return
+			}
+			t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", sc.fsType, sc.source, sc.deviceType, sc.transport, got, sc.want)
+		})
+	}
+}
+
+// TestVerifyInstallToRAMStatus checks that a RAM-backed status passes and
+// that a USB-backed status with an empty destination directory fails with
+// the exact expected message.
+func TestVerifyInstallToRAMStatus(t *testing.T) {
+	t.Parallel()
+
+	dstDir := t.TempDir()
+
+	inRAM := LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}
+	if err := verifyInstallToRAMStatus(inRAM, dstDir, false, nil); err != nil {
+		t.Fatalf("expected success for RAM-backed status, got %v", err)
+	}
+
+	onUSB := LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}
+	err := verifyInstallToRAMStatus(onUSB, dstDir, false, nil)
+	if err == nil {
+		t.Fatal("expected verification failure when media is still on USB")
+	}
+	want := "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in " + dstDir
+	if got := err.Error(); got != want {
+		t.Fatalf("error=%q", got)
+	}
+}
+
+// TestDescribeLiveBootSource checks the RAM label and the fallback to the raw
+// source for an unknown kind.
+func TestDescribeLiveBootSource(t *testing.T) {
+	t.Parallel()
+
+	ram := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"})
+	if ram != "RAM" {
+		t.Fatalf("got %q want RAM", ram)
+	}
+	unknown := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"})
+	if unknown != "/run/live/medium" {
+		t.Fatalf("got %q want /run/live/medium", unknown)
+	}
+}
--- a/audit/internal/platform/kill_workers.go
+++ b/audit/internal/platform/kill_workers.go
@@ -0,0 +1,68 @@
+package platform
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"syscall"
+)
+
+// workerPatterns are substrings that identify bee test worker processes to be
+// killed by KillTestWorkers. They are matched against the first argument
+// (argv[0]) read from /proc/<pid>/cmdline, both as a full path and as its
+// basename.
+var workerPatterns = []string{
+	"bee-gpu-burn",
+	"stress-ng",
+	"stressapptest",
+	"memtester",
+	// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
+	// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
+	"nvvs",
+	"dcgmi",
+}
+
+// KilledProcess describes a process that was sent SIGKILL by KillTestWorkers.
+// Name is the basename of the process's argv[0].
+type KilledProcess struct {
+	PID  int    `json:"pid"`
+	Name string `json:"name"`
+}
+
+// KillTestWorkers scans /proc for running test worker processes and sends
+// SIGKILL to each one found. It returns the processes that were actually
+// signalled: when kill(2) fails (process already gone, insufficient
+// permission) the process is left out, so the result never claims a kill that
+// did not happen. The calling process itself is never signalled.
+//
+// Returns nil when /proc cannot be read.
+func KillTestWorkers() []KilledProcess {
+	entries, err := os.ReadDir("/proc")
+	if err != nil {
+		return nil
+	}
+
+	self := os.Getpid()
+	var killed []KilledProcess
+	for _, e := range entries {
+		if !e.IsDir() {
+			continue
+		}
+		pid, err := strconv.Atoi(e.Name())
+		if err != nil || pid == self {
+			// Not a PID directory, or our own process.
+			continue
+		}
+		cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
+		if err != nil {
+			continue
+		}
+		// /proc/*/cmdline uses NUL bytes as argument separators.
+		args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
+		exe := strings.TrimSpace(args[0])
+		base := exe
+		if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
+			base = exe[idx+1:]
+		}
+		for _, pat := range workerPatterns {
+			// base is a suffix of exe, so one check covers both.
+			if !strings.Contains(exe, pat) {
+				continue
+			}
+			if err := syscall.Kill(pid, syscall.SIGKILL); err == nil {
+				killed = append(killed, KilledProcess{PID: pid, Name: base})
+			}
+			break
+		}
+	}
+	return killed
+}
--- a/audit/internal/platform/live_metrics.go
+++ b/audit/internal/platform/live_metrics.go
@@ -0,0 +1,328 @@
+package platform
+
+import (
+	"bufio"
+	"encoding/json"
+	"os"
+	"os/exec"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// LiveMetricSample is a single point-in-time snapshot of server metrics
+// collected for the web UI metrics page.
+// It is produced by SampleLiveMetrics; sources that are unavailable leave the
+// corresponding field at its zero value.
+type LiveMetricSample struct {
+	// Timestamp is the UTC time at which sampling started.
+	Timestamp  time.Time      `json:"ts"`
+	// Fans holds the readings returned by sampleFanSpeeds.
+	Fans       []FanReading   `json:"fans"`
+	// Temps holds non-GPU temperature sensors (groups "cpu" and "ambient").
+	Temps      []TempReading  `json:"temps"`
+	// PowerW is the value returned by sampleSystemPower.
+	PowerW     float64        `json:"power_w"`
+	// CPULoadPct is the overall CPU utilisation in percent (0-100).
+	CPULoadPct float64        `json:"cpu_load_pct"`
+	// MemLoadPct is the share of MemTotal that is not MemAvailable, in percent.
+	MemLoadPct float64        `json:"mem_load_pct"`
+	// GPUs holds per-GPU metrics from the NVIDIA sampler or, failing that,
+	// the AMD sampler.
+	GPUs       []GPUMetricRow `json:"gpus"`
+}
+
+// TempReading is a named temperature sensor value.
+type TempReading struct {
+	// Name is the sensor label shown to the user.
+	Name    string  `json:"name"`
+	// Group is the sensor class assigned by classifyLiveTempGroup:
+	// "cpu", "gpu" or "ambient". It is omitted from JSON when empty.
+	Group   string  `json:"group,omitempty"`
+	// Celsius is the reading in degrees Celsius.
+	Celsius float64 `json:"celsius"`
+}
+
+// SampleLiveMetrics gathers one snapshot of GPU, fan, temperature, power,
+// CPU-load and memory-load metrics. Every source is optional: a source that
+// fails or reports nothing simply leaves its field at the zero value.
+func SampleLiveMetrics() LiveMetricSample {
+	s := LiveMetricSample{Timestamp: time.Now().UTC()}
+
+	// GPUs: NVIDIA takes precedence; AMD is consulted only when the NVIDIA
+	// sampler errors out or returns no rows.
+	nvidia, nvidiaErr := SampleGPUMetrics(nil)
+	switch {
+	case nvidiaErr == nil && len(nvidia) > 0:
+		s.GPUs = nvidia
+	default:
+		if amd, amdErr := sampleAMDGPUMetrics(); amdErr == nil && len(amd) > 0 {
+			s.GPUs = amd
+		}
+	}
+
+	// Fans: the error is dropped on purpose, a missing tool means "no fans".
+	s.Fans, _ = sampleFanSpeeds()
+
+	// Temperatures, with a synthetic "CPU Max" entry when no sensor was
+	// classified as a CPU sensor.
+	s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
+	if !hasTempGroup(s.Temps, "cpu") {
+		cpuMax := sampleCPUMaxTemp()
+		if cpuMax > 0 {
+			s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuMax})
+		}
+	}
+
+	s.PowerW = sampleSystemPower()   // system power
+	s.CPULoadPct = sampleCPULoadPct() // derived from /proc/stat
+	s.MemLoadPct = sampleMemLoadPct() // derived from /proc/meminfo
+	return s
+}
+
+// sampleCPULoadPct measures overall CPU utilisation by taking two
+// readCPUStat snapshots 200ms apart. It returns 0 when either snapshot is
+// unavailable (reported as a zero total).
+func sampleCPULoadPct() float64 {
+	const interval = 200 * time.Millisecond
+
+	prevTotal, prevIdle := readCPUStat()
+	if prevTotal != 0 {
+		time.Sleep(interval)
+		if curTotal, curIdle := readCPUStat(); curTotal != 0 {
+			return cpuLoadPctBetween(prevTotal, prevIdle, curTotal, curIdle)
+		}
+	}
+	return 0
+}
+
+// cpuLoadPctBetween converts two cumulative (total, idle) jiffy counters into
+// a busy percentage in the range [0, 100].
+//
+// The counters are unsigned, so the deltas are only computed after checking
+// that the counter advanced; a plain subtraction would wrap around to a huge
+// value. A total that did not advance yields 0, and an idle counter that went
+// backwards is treated as "no idle time elapsed".
+func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
+	if total <= prevTotal {
+		return 0
+	}
+	dt := float64(total - prevTotal)
+
+	var di float64
+	if idle > prevIdle {
+		di = float64(idle - prevIdle)
+	}
+
+	pct := (1 - di/dt) * 100
+	if pct < 0 {
+		return 0
+	}
+	if pct > 100 {
+		return 100
+	}
+	return pct
+}
+
+// readCPUStat reads the aggregate "cpu" line of /proc/stat and returns the
+// cumulative total and idle jiffies. Idle is the sum of the idle and iowait
+// columns. Both results are 0 when the file cannot be opened or has no
+// aggregate line.
+//
+// Only the first eight columns (user, nice, system, idle, iowait, irq,
+// softirq, steal) are summed. The trailing guest and guest_nice columns are
+// already accounted for inside user and nice, so adding them again would
+// inflate the total and understate the load on hosts running virtual machines.
+func readCPUStat() (total, idle uint64) {
+	f, err := os.Open("/proc/stat")
+	if err != nil {
+		return 0, 0
+	}
+	defer f.Close()
+
+	const accountedColumns = 8
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		line := sc.Text()
+		if !strings.HasPrefix(line, "cpu ") {
+			continue
+		}
+		fields := strings.Fields(line)[1:] // skip "cpu"
+		var vals [accountedColumns]uint64
+		for i := 0; i < len(fields) && i < accountedColumns; i++ {
+			// Unparseable columns count as 0.
+			vals[i], _ = strconv.ParseUint(fields[i], 10, 64)
+		}
+		// idle = idle + iowait
+		idle = vals[3] + vals[4]
+		for _, v := range vals {
+			total += v
+		}
+		return total, idle
+	}
+	return 0, 0
+}
+
+// sampleMemLoadPct returns the percentage of physical memory in use,
+// computed from /proc/meminfo as (MemTotal - available) / MemTotal.
+//
+// "available" is MemAvailable when the kernel reports it; otherwise it falls
+// back to MemFree + Buffers + Cached. The result is 0 when the file cannot be
+// read, MemTotal is missing, or the available figure is not below MemTotal
+// (which also prevents an unsigned underflow).
+func sampleMemLoadPct() float64 {
+	f, err := os.Open("/proc/meminfo")
+	if err != nil {
+		return 0
+	}
+	defer f.Close()
+
+	vals := map[string]uint64{}
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		fields := strings.Fields(sc.Text())
+		if len(fields) >= 2 {
+			// Unparseable values count as 0.
+			v, _ := strconv.ParseUint(fields[1], 10, 64)
+			vals[strings.TrimSuffix(fields[0], ":")] = v
+		}
+	}
+
+	total := vals["MemTotal"]
+	if total == 0 {
+		return 0
+	}
+	avail, ok := vals["MemAvailable"]
+	if !ok {
+		avail = vals["MemFree"] + vals["Buffers"] + vals["Cached"]
+	}
+	if avail >= total {
+		return 0
+	}
+	return float64(total-avail) / float64(total) * 100
+}
+
+// hasTempGroup reports whether at least one reading in temps belongs to group.
+func hasTempGroup(temps []TempReading, group string) bool {
+	for i := range temps {
+		if temps[i].Group != group {
+			continue
+		}
+		return true
+	}
+	return false
+}
+
+// sampleLiveTemperatureReadings returns the readings from `sensors -j` and
+// falls back to the IPMI reader when that yields nothing.
+func sampleLiveTemperatureReadings() []TempReading {
+	temps := sampleLiveTempsViaSensorsJSON()
+	if len(temps) == 0 {
+		return sampleLiveTempsViaIPMI()
+	}
+	return temps
+}
+
+// sampleLiveTempsViaSensorsJSON runs `sensors -j` and converts its JSON
+// document (chip -> feature -> attributes) into temperature readings.
+//
+// Chips and features are visited in sorted order so the result is stable.
+// Readings outside (0, 150] and sensors classified as "gpu" are dropped, and
+// each (group, label) pair is emitted once. It returns nil when the command
+// fails, prints nothing, or prints JSON of an unexpected shape.
+func sampleLiveTempsViaSensorsJSON() []TempReading {
+	raw, err := exec.Command("sensors", "-j").Output()
+	if err != nil || len(raw) == 0 {
+		return nil
+	}
+
+	var chips map[string]map[string]any
+	if json.Unmarshal(raw, &chips) != nil {
+		return nil
+	}
+
+	chipNames := make([]string, 0, len(chips))
+	for chipName := range chips {
+		chipNames = append(chipNames, chipName)
+	}
+	sort.Strings(chipNames)
+
+	readings := make([]TempReading, 0, len(chipNames))
+	emitted := map[string]struct{}{}
+	for _, chipName := range chipNames {
+		sensors := chips[chipName]
+
+		sensorNames := make([]string, 0, len(sensors))
+		for sensorName := range sensors {
+			sensorNames = append(sensorNames, sensorName)
+		}
+		sort.Strings(sensorNames)
+
+		for _, sensorName := range sensorNames {
+			// "Adapter" is chip metadata, not a sensor.
+			if strings.EqualFold(sensorName, "Adapter") {
+				continue
+			}
+			attrs, isObject := sensors[sensorName].(map[string]any)
+			if !isObject {
+				continue
+			}
+			celsius, found := firstTempInputValue(attrs)
+			if !found || celsius <= 0 || celsius > 150 {
+				continue
+			}
+			group := classifyLiveTempGroup(chipName, sensorName)
+			label := strings.TrimSpace(sensorName)
+			if group == "gpu" || label == "" {
+				continue
+			}
+			if group == "ambient" {
+				label = compactAmbientTempName(chipName, label)
+			}
+			dedupeKey := group + "\x00" + label
+			if _, dup := emitted[dedupeKey]; dup {
+				continue
+			}
+			emitted[dedupeKey] = struct{}{}
+			readings = append(readings, TempReading{Name: label, Group: group, Celsius: celsius})
+		}
+	}
+	return readings
+}
+
+// sampleLiveTempsViaIPMI runs `ipmitool sdr type Temperature` and converts
+// each row into a temperature reading.
+//
+// Readings outside (0, 150] and sensors classified as "gpu" are dropped, and
+// each (group, label) pair is emitted once. It returns nil when the command
+// fails or prints nothing.
+func sampleLiveTempsViaIPMI() []TempReading {
+	out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
+	if err != nil || len(out) == 0 {
+		return nil
+	}
+	var temps []TempReading
+	seen := map[string]struct{}{}
+	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
+		name, value, ok := parseIPMITempRow(line)
+		if !ok || value <= 0 || value > 150 {
+			continue
+		}
+		group := classifyLiveTempGroup("", name)
+		if group == "gpu" {
+			continue
+		}
+		label := name
+		if group == "ambient" {
+			label = compactAmbientTempName("", label)
+		}
+		key := group + "\x00" + label
+		if _, ok := seen[key]; ok {
+			continue
+		}
+		seen[key] = struct{}{}
+		temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
+	}
+	return temps
+}
+
+// parseIPMITempRow extracts the sensor name and reading from one
+// pipe-separated ipmitool row. It accepts both layouts ipmitool produces:
+//
+//	sdr type:  "Inlet Temp | 04h | ok | 7.1 | 25 degrees C"
+//	sensor:    "Inlet Temp | 25.000 | degrees C | ok | ..."
+//
+// The column that mentions "degrees" is located first; the number is then
+// taken from the start of that column or, failing that, from the column
+// before it. ok is false for rows without a numeric temperature.
+func parseIPMITempRow(line string) (name string, value float64, ok bool) {
+	parts := strings.Split(line, "|")
+	if len(parts) < 3 {
+		return "", 0, false
+	}
+	name = strings.TrimSpace(parts[0])
+	if name == "" {
+		return "", 0, false
+	}
+	for i := 1; i < len(parts); i++ {
+		field := strings.TrimSpace(parts[i])
+		if !strings.Contains(strings.ToLower(field), "degrees") {
+			continue
+		}
+		// Reading and unit in the same column ("25 degrees C").
+		if tokens := strings.Fields(field); len(tokens) > 1 {
+			if v, err := strconv.ParseFloat(tokens[0], 64); err == nil {
+				return name, v, true
+			}
+		}
+		// Unit-only column: the reading is in the preceding column, which
+		// must not be the name column.
+		if i > 1 {
+			if v, err := strconv.ParseFloat(strings.TrimSpace(parts[i-1]), 64); err == nil {
+				return name, v, true
+			}
+		}
+		return "", 0, false
+	}
+	return "", 0, false
+}
+
+// firstTempInputValue returns the value of the first attribute (in sorted key
+// order) whose lower-cased name contains "temp" and ends in "_input".
+// Numeric values are returned as-is; string values are used when they parse
+// as a float. The boolean is false when no such attribute yields a number.
+func firstTempInputValue(feature map[string]any) (float64, bool) {
+	candidates := make([]string, 0, len(feature))
+	for key := range feature {
+		lowered := strings.ToLower(key)
+		if strings.Contains(lowered, "temp") && strings.HasSuffix(lowered, "_input") {
+			candidates = append(candidates, key)
+		}
+	}
+	sort.Strings(candidates)
+
+	for _, key := range candidates {
+		raw := feature[key]
+		if num, isNum := raw.(float64); isNum {
+			return num, true
+		}
+		if text, isText := raw.(string); isText {
+			if parsed, err := strconv.ParseFloat(text, 64); err == nil {
+				return parsed, true
+			}
+		}
+	}
+	return 0, false
+}
+
+// classifyLiveTempGroup assigns a sensor to "gpu", "cpu" or "ambient" by
+// looking for known substrings in the lower-cased "chip name" text.
+// GPU markers are checked before CPU markers; anything else is "ambient".
+func classifyLiveTempGroup(chip, name string) string {
+	haystack := strings.ToLower(strings.TrimSpace(chip + " " + name))
+	containsAny := func(needles ...string) bool {
+		for _, needle := range needles {
+			if strings.Contains(haystack, needle) {
+				return true
+			}
+		}
+		return false
+	}
+
+	if containsAny("gpu", "amdgpu", "nvidia", "adeon") {
+		return "gpu"
+	}
+	if containsAny(
+		"coretemp",
+		"k10temp",
+		"zenpower",
+		"package id",
+		"x86_pkg_temp",
+		"tctl",
+		"tdie",
+		"tccd",
+		"cpu",
+		"peci",
+	) {
+		return "cpu"
+	}
+	return "ambient"
+}
+
+// compactAmbientTempName builds the display label for an ambient sensor.
+// The trimmed name is returned alone when the chip is empty, equals the name
+// (case-insensitively) or is already contained in it; otherwise the label is
+// "chip / name".
+func compactAmbientTempName(chip, name string) string {
+	chip, name = strings.TrimSpace(chip), strings.TrimSpace(name)
+	switch {
+	case chip == "",
+		strings.EqualFold(chip, name),
+		strings.Contains(strings.ToLower(name), strings.ToLower(chip)):
+		return name
+	}
+	return chip + " / " + name
+}
--- a/audit/internal/platform/live_metrics_test.go
+++ b/audit/internal/platform/live_metrics_test.go
@@ -0,0 +1,94 @@
+package platform
+
+import "testing"
+
+// TestFirstTempInputValue checks that the "_input" attribute of a feature is
+// the one returned, not its "_max" sibling.
+func TestFirstTempInputValue(t *testing.T) {
+	const want = 61.5
+	got, ok := firstTempInputValue(map[string]any{
+		"temp1_input": want,
+		"temp1_max":   80.0,
+	})
+	if !ok {
+		t.Fatal("expected value")
+	}
+	if got != want {
+		t.Fatalf("got %v want 61.5", got)
+	}
+}
+
+// TestClassifyLiveTempGroup covers one CPU chip, one GPU chip and two chips
+// that must fall through to the ambient group.
+func TestClassifyLiveTempGroup(t *testing.T) {
+	type testCase struct{ chip, name, want string }
+	cases := []testCase{
+		{"coretemp-isa-0000", "Package id 0", "cpu"},
+		{"amdgpu-pci-4300", "edge", "gpu"},
+		{"nvme-pci-0100", "Composite", "ambient"},
+		{"acpitz-acpi-0", "temp1", "ambient"},
+	}
+	for i := range cases {
+		c := cases[i]
+		got := classifyLiveTempGroup(c.chip, c.name)
+		if got == c.want {
+			continue
+		}
+		t.Fatalf("classifyLiveTempGroup(%q,%q)=%q want %q", c.chip, c.name, got, c.want)
+	}
+}
+
+// TestCompactAmbientTempName checks the chip-prefixed label and the
+// empty-chip passthrough.
+func TestCompactAmbientTempName(t *testing.T) {
+	for _, tc := range []struct{ chip, name, want string }{
+		{"nvme-pci-0100", "Composite", "nvme-pci-0100 / Composite"},
+		{"", "Inlet Temp", "Inlet Temp"},
+	} {
+		if got := compactAmbientTempName(tc.chip, tc.name); got != tc.want {
+			t.Fatalf("got %q", got)
+		}
+	}
+}
+
+// TestCPULoadPctBetween exercises cpuLoadPctBetween with a half-busy and a
+// fully-busy interval, an interval without progress, and an idle delta that
+// exceeds the total delta (which must clamp to zero).
+func TestCPULoadPctBetween(t *testing.T) {
+	type testCase struct {
+		name                string
+		prevTotal, prevIdle uint64
+		total, idle         uint64
+		want                float64
+	}
+	cases := []testCase{
+		{name: "busy half", prevTotal: 100, prevIdle: 40, total: 200, idle: 90, want: 50},
+		{name: "fully busy", prevTotal: 100, prevIdle: 40, total: 200, idle: 40, want: 100},
+		{name: "no progress", prevTotal: 100, prevIdle: 40, total: 100, idle: 40, want: 0},
+		{name: "idle delta larger than total clamps to zero", prevTotal: 100, prevIdle: 40, total: 200, idle: 150, want: 0},
+	}
+
+	for i := range cases {
+		c := cases[i]
+		got := cpuLoadPctBetween(c.prevTotal, c.prevIdle, c.total, c.idle)
+		if got == c.want {
+			continue
+		}
+		t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", c.name, got, c.want)
+	}
+}
--- a/audit/internal/platform/network.go
+++ b/audit/internal/platform/network.go
@@ -2,6 +2,7 @@ package platform

 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"os"
 	"os/exec"
@@ -18,21 +19,17 @@ func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
 	out := make([]InterfaceInfo, 0, len(names))
 	for _, name := range names {
 		state := "unknown"
-		if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
-			fields := strings.Fields(string(raw))
-			if len(fields) >= 9 {
-				state = fields[8]
+		if up, err := interfaceAdminState(name); err == nil {
+			if up {
+				state = "up"
+			} else {
+				state = "down"
 			}
 		}

-		var ipv4 []string
-		if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
-			for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
-				fields := strings.Fields(line)
-				if len(fields) >= 4 {
-					ipv4 = append(ipv4, fields[3])
-				}
-			}
+		ipv4, err := interfaceIPv4Addrs(name)
+		if err != nil {
+			ipv4 = nil
 		}

 		out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
@@ -55,6 +52,119 @@ func (s *System) DefaultRoute() string {
 	return ""
 }

+// CaptureNetworkSnapshot records the admin state and IPv4 addresses of every
+// interface, the current default routes and the contents of /etc/resolv.conf.
+//
+// Failing to list interfaces or to query any single interface aborts the
+// capture with that error. Default routes and resolv.conf are best-effort:
+// when they cannot be read the corresponding field stays empty.
+func (s *System) CaptureNetworkSnapshot() (NetworkSnapshot, error) {
+	names, err := listInterfaceNames()
+	if err != nil {
+		return NetworkSnapshot{}, err
+	}
+
+	ifaces := make([]NetworkInterfaceSnapshot, 0, len(names))
+	for _, name := range names {
+		adminUp, stateErr := interfaceAdminState(name)
+		if stateErr != nil {
+			return NetworkSnapshot{}, stateErr
+		}
+		addrs, addrErr := interfaceIPv4Addrs(name)
+		if addrErr != nil {
+			return NetworkSnapshot{}, addrErr
+		}
+		ifaces = append(ifaces, NetworkInterfaceSnapshot{Name: name, Up: adminUp, IPv4: addrs})
+	}
+	snap := NetworkSnapshot{Interfaces: ifaces}
+
+	if routes, routeErr := exec.Command("ip", "route", "show", "default").Output(); routeErr == nil {
+		for _, entry := range strings.Split(strings.TrimSpace(string(routes)), "\n") {
+			if entry = strings.TrimSpace(entry); entry != "" {
+				snap.DefaultRoutes = append(snap.DefaultRoutes, entry)
+			}
+		}
+	}
+
+	if conf, readErr := os.ReadFile("/etc/resolv.conf"); readErr == nil {
+		snap.ResolvConf = string(conf)
+	}
+
+	return snap, nil
+}
+
+// RestoreNetworkSnapshot re-applies a snapshot: per-interface IPv4 addresses
+// and admin state, the default routes, and /etc/resolv.conf.
+//
+// Restoration is best-effort. Every step is attempted even if earlier ones
+// failed, and all failures are returned together as one error whose message
+// joins the individual problems with "; ". It returns nil when every step
+// succeeded.
+func (s *System) RestoreNetworkSnapshot(snapshot NetworkSnapshot) error {
+	var errs []string
+
+	for _, iface := range snapshot.Interfaces {
+		// The link is brought up first and set to its recorded state last.
+		if err := exec.Command("ip", "link", "set", "dev", iface.Name, "up").Run(); err != nil {
+			errs = append(errs, fmt.Sprintf("%s: bring up before restore: %v", iface.Name, err))
+			continue
+		}
+		// The snapshot only holds IPv4 addresses, so only IPv4 is flushed;
+		// a family-less flush would also drop IPv6 addresses that are never
+		// put back.
+		if err := exec.Command("ip", "-4", "addr", "flush", "dev", iface.Name).Run(); err != nil {
+			errs = append(errs, fmt.Sprintf("%s: flush addresses: %v", iface.Name, err))
+		}
+		for _, cidr := range iface.IPv4 {
+			if raw, err := exec.Command("ip", "addr", "add", cidr, "dev", iface.Name).CombinedOutput(); err != nil {
+				detail := strings.TrimSpace(string(raw))
+				if detail != "" {
+					errs = append(errs, fmt.Sprintf("%s: restore address %s: %v: %s", iface.Name, cidr, err, detail))
+				} else {
+					errs = append(errs, fmt.Sprintf("%s: restore address %s: %v", iface.Name, cidr, err))
+				}
+			}
+		}
+		state := "down"
+		if iface.Up {
+			state = "up"
+		}
+		if err := exec.Command("ip", "link", "set", "dev", iface.Name, state).Run(); err != nil {
+			errs = append(errs, fmt.Sprintf("%s: restore state %s: %v", iface.Name, state, err))
+		}
+	}
+
+	// `ip route del default` removes a single route per call, so repeat until
+	// it exits non-zero (no default route left). The bound only guards
+	// against looping forever if routes keep reappearing.
+	const maxDefaultRouteDeletes = 32
+	for i := 0; i < maxDefaultRouteDeletes; i++ {
+		err := exec.Command("ip", "route", "del", "default").Run()
+		if err == nil {
+			continue
+		}
+		// A non-zero exit is expected once nothing is left; only failures to
+		// run the command at all are reported.
+		var exitErr *exec.ExitError
+		if !errors.As(err, &exitErr) {
+			errs = append(errs, fmt.Sprintf("clear default route: %v", err))
+		}
+		break
+	}
+	for _, route := range snapshot.DefaultRoutes {
+		fields := strings.Fields(route)
+		if len(fields) == 0 {
+			continue
+		}
+		// Strip state flags that ip-route(8) does not accept as add arguments.
+		filtered := fields[:0]
+		for _, f := range fields {
+			switch f {
+			case "linkdown", "dead", "onlink", "pervasive":
+				// skip
+			default:
+				filtered = append(filtered, f)
+			}
+		}
+		args := append([]string{"route", "add"}, filtered...)
+		if raw, err := exec.Command("ip", args...).CombinedOutput(); err != nil {
+			detail := strings.TrimSpace(string(raw))
+			if detail != "" {
+				errs = append(errs, fmt.Sprintf("restore route %q: %v: %s", route, err, detail))
+			} else {
+				errs = append(errs, fmt.Sprintf("restore route %q: %v", route, err))
+			}
+		}
+	}
+
+	if err := os.WriteFile("/etc/resolv.conf", []byte(snapshot.ResolvConf), 0644); err != nil {
+		errs = append(errs, fmt.Sprintf("restore resolv.conf: %v", err))
+	}
+
+	if len(errs) > 0 {
+		return errors.New(strings.Join(errs, "; "))
+	}
+	return nil
+}
+
 func (s *System) DHCPOne(iface string) (string, error) {
 	var out bytes.Buffer
 	if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
@@ -131,6 +241,65 @@ func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) {
 	return out.String(), nil
 }

+// SetInterfaceState sets the admin state of iface by running
+// `ip link set dev <iface> up|down` and returns that command's error.
+func (s *System) SetInterfaceState(iface string, up bool) error {
+	args := []string{"link", "set", "dev", iface, "down"}
+	if up {
+		args[len(args)-1] = "up"
+	}
+	return exec.Command("ip", args...).Run()
+}
+
+// GetInterfaceState returns true if the interface is UP.
+// "UP" here is the administrative flag reported by `ip link`, so an interface
+// without carrier still counts as up. The error comes from running or parsing
+// `ip -o link show dev <iface>`.
+func (s *System) GetInterfaceState(iface string) (bool, error) {
+	return interfaceAdminState(iface)
+}
+
+// interfaceAdminState runs `ip -o link show dev <iface>` and reports whether
+// the interface carries the UP flag. Command failures are returned unchanged.
+func interfaceAdminState(iface string) (bool, error) {
+	cmd := exec.Command("ip", "-o", "link", "show", "dev", iface)
+	out, err := cmd.Output()
+	if err == nil {
+		return parseInterfaceAdminState(string(out))
+	}
+	return false, err
+}
+
+// parseInterfaceAdminState inspects the "<FLAG,FLAG,...>" section of
+// `ip -o link` output and reports whether it lists the UP flag.
+// An error is returned when the angle-bracketed flag list is missing or
+// unterminated.
+func parseInterfaceAdminState(raw string) (bool, error) {
+	_, afterOpen, found := strings.Cut(raw, "<")
+	if !found {
+		return false, fmt.Errorf("ip link output missing flags")
+	}
+	flagList, _, found := strings.Cut(afterOpen, ">")
+	if !found {
+		return false, fmt.Errorf("ip link output missing flag terminator")
+	}
+	for _, flag := range strings.Split(flagList, ",") {
+		if strings.TrimSpace(flag) == "UP" {
+			return true, nil
+		}
+	}
+	return false, nil
+}
+
+// interfaceIPv4Addrs returns the fourth column of every row printed by
+// `ip -o -4 addr show dev <iface>`.
+// A non-zero exit of `ip` is reported as "no addresses" (nil, nil); any other
+// failure to run the command is returned as an error.
+func interfaceIPv4Addrs(iface string) ([]string, error) {
+	out, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", iface).Output()
+	if err != nil {
+		if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) {
+			return nil, nil
+		}
+		return nil, err
+	}
+
+	var addrs []string
+	for _, row := range strings.Split(strings.TrimSpace(string(out)), "\n") {
+		if cols := strings.Fields(row); len(cols) >= 4 {
+			addrs = append(addrs, cols[3])
+		}
+	}
+	return addrs, nil
+}
+
 func listInterfaceNames() ([]string, error) {
 	raw, err := exec.Command("ip", "-o", "link", "show").Output()
 	if err != nil {
--- a/audit/internal/platform/network_test.go
+++ b/audit/internal/platform/network_test.go
@@ -0,0 +1,46 @@
+package platform
+
+import "testing"
+
+// TestParseInterfaceAdminState verifies that the admin UP flag, not the
+// operational "state" column, decides the result, and that output without a
+// flag list is rejected.
+func TestParseInterfaceAdminState(t *testing.T) {
+	type testCase struct {
+		name    string
+		raw     string
+		want    bool
+		wantErr bool
+	}
+	cases := []testCase{
+		{
+			name: "admin up with no carrier",
+			raw:  "2: enp1s0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN mode DEFAULT group default qlen 1000\n",
+			want: true,
+		},
+		{
+			name: "admin down",
+			raw:  "2: enp1s0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000\n",
+			want: false,
+		},
+		{
+			name:    "malformed output",
+			raw:     "2: enp1s0: mtu 1500 state DOWN\n",
+			wantErr: true,
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := parseInterfaceAdminState(tc.raw)
+			switch {
+			case tc.wantErr && err == nil:
+				t.Fatal("expected error")
+			case tc.wantErr:
+				return
+			case err != nil:
+				t.Fatalf("unexpected error: %v", err)
+			case got != tc.want:
+				t.Fatalf("got %v want %v", got, tc.want)
+			}
+		})
+	}
+}
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -0,0 +1,209 @@
+package platform
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// RunNvidiaStressPack normalises opts, builds the stress job for the chosen
+// loader and runs it as step 03 of an acceptance pack, between two
+// nvidia-smi inventory steps and a final nvidia-smi status query.
+// It returns whatever runAcceptancePackCtx returns, or the job-building error.
+func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
+	normalizeNvidiaStressOptions(&opts)
+
+	stressJob, err := buildNvidiaStressJob(opts)
+	if err != nil {
+		return "", err
+	}
+
+	const afterQuery = "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total"
+	archivePrefix := nvidiaStressArchivePrefix(opts.Loader)
+	jobs := withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
+		stressJob,
+		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", afterQuery, "--format=csv,noheader,nounits"}},
+	)
+	return runAcceptancePackCtx(ctx, baseDir, archivePrefix, jobs, logFunc)
+}
+
+// nvidiaStressArchivePrefix maps a loader name (case and surrounding
+// whitespace ignored) to the archive prefix of its acceptance pack.
+// Anything other than the John or NCCL loader uses the burn prefix.
+func nvidiaStressArchivePrefix(loader string) string {
+	normalized := strings.TrimSpace(strings.ToLower(loader))
+	if normalized == NvidiaStressLoaderJohn {
+		return "gpu-nvidia-john"
+	}
+	if normalized == NvidiaStressLoaderNCCL {
+		return "gpu-nvidia-nccl"
+	}
+	return "gpu-nvidia-burn"
+}
+
+// buildNvidiaStressJob resolves the GPU selection and assembles the command
+// line of the stress step for the requested loader.
+//
+// All loaders receive --seconds and, when GPUs were selected, --devices.
+// The builtin loader additionally gets --size-mb. The builtin and John
+// loaders get --stagger-seconds when a positive stagger is configured and
+// more than one GPU is selected. An unknown loader name is an error, as is a
+// failed GPU selection.
+func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
+	gpus, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
+	if err != nil {
+		return satJob{}, err
+	}
+
+	seconds := strconv.Itoa(opts.DurationSec)
+	var (
+		logName     string
+		argv        []string
+		staggerable bool
+	)
+	switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
+	case "", NvidiaStressLoaderBuiltin:
+		logName = "03-bee-gpu-burn.log"
+		argv = []string{"bee-gpu-burn", "--seconds", seconds, "--size-mb", strconv.Itoa(opts.SizeMB)}
+		staggerable = true
+	case NvidiaStressLoaderJohn:
+		logName = "03-john-gpu-stress.log"
+		argv = []string{"bee-john-gpu-stress", "--seconds", seconds}
+		staggerable = true
+	case NvidiaStressLoaderNCCL:
+		logName = "03-bee-nccl-gpu-stress.log"
+		argv = []string{"bee-nccl-gpu-stress", "--seconds", seconds}
+	default:
+		return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
+	}
+
+	if staggerable && opts.StaggerSeconds > 0 && len(gpus) > 1 {
+		argv = append(argv, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
+	}
+	if len(gpus) > 0 {
+		argv = append(argv, "--devices", joinIndexList(gpus))
+	}
+	return satJob{name: logName, cmd: argv, collectGPU: true, gpuIndices: gpus}, nil
+}
+
+// normalizeNvidiaStressOptions fills in defaults in place: a non-positive
+// duration becomes 300 seconds, the loader is canonicalised (unknown or empty
+// names become the builtin loader), and both GPU index lists are
+// de-duplicated and sorted.
+func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
+	if opts.DurationSec <= 0 {
+		opts.DurationSec = 300
+	}
+	// SizeMB is left untouched: 0 means "auto" — bee-gpu-burn will query
+	// per-GPU memory at runtime.
+	switch loader := strings.TrimSpace(strings.ToLower(opts.Loader)); loader {
+	case NvidiaStressLoaderJohn, NvidiaStressLoaderNCCL:
+		opts.Loader = loader
+	default:
+		opts.Loader = NvidiaStressLoaderBuiltin
+	}
+	opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
+	opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
+}
+
+// resolveNvidiaGPUSelection returns the sorted GPU indices to test.
+//
+// It starts from the indices reported by listNvidiaGPUIndices, keeps only
+// those listed in include (when include is non-empty) and then removes those
+// listed in exclude. Requested indices that are not present are ignored.
+// An error is returned when the GPU list cannot be obtained, is empty, or the
+// filters leave nothing selected.
+func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
+	present, err := listNvidiaGPUIndices()
+	if err != nil {
+		return nil, err
+	}
+	if len(present) == 0 {
+		return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
+	}
+
+	wanted := make(map[int]struct{}, len(include))
+	for _, idx := range include {
+		wanted[idx] = struct{}{}
+	}
+	skipped := make(map[int]struct{}, len(exclude))
+	for _, idx := range exclude {
+		skipped[idx] = struct{}{}
+	}
+
+	chosen := make([]int, 0, len(present))
+	for _, idx := range present {
+		if len(include) > 0 {
+			if _, keep := wanted[idx]; !keep {
+				continue
+			}
+		}
+		if _, drop := skipped[idx]; drop {
+			continue
+		}
+		chosen = append(chosen, idx)
+	}
+	if len(chosen) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
+	}
+	sort.Ints(chosen)
+	return chosen, nil
+}
+
+// listNvidiaGPUIndices asks nvidia-smi for the index of every GPU and returns
+// them de-duplicated and sorted. Output lines that are not integers are
+// ignored; a failing nvidia-smi is returned as a wrapped error.
+func listNvidiaGPUIndices() ([]int, error) {
+	raw, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia-smi: %w", err)
+	}
+
+	rows := strings.Split(strings.TrimSpace(string(raw)), "\n")
+	parsed := make([]int, 0, len(rows))
+	for _, row := range rows {
+		// Blank rows fail Atoi as well, so one check covers both cases.
+		if n, convErr := strconv.Atoi(strings.TrimSpace(row)); convErr == nil {
+			parsed = append(parsed, n)
+		}
+	}
+	return dedupeSortedIndices(parsed), nil
+}
+
+// dedupeSortedIndices returns the distinct non-negative values in ascending
+// order. Empty input yields nil; input made up only of negative values yields
+// an empty, non-nil slice. The argument is not modified.
+func dedupeSortedIndices(values []int) []int {
+	if len(values) == 0 {
+		return nil
+	}
+
+	kept := make([]int, 0, len(values))
+	for _, v := range values {
+		if v >= 0 {
+			kept = append(kept, v)
+		}
+	}
+	sort.Ints(kept)
+
+	// After sorting, duplicates are adjacent: compact them in place.
+	uniq := kept[:0]
+	for i, v := range kept {
+		if i == 0 || v != uniq[len(uniq)-1] {
+			uniq = append(uniq, v)
+		}
+	}
+	return uniq
+}
+
+// joinIndexList renders the values as decimal numbers separated by commas,
+// e.g. [0 2 5] becomes "0,2,5". An empty list yields "".
+func joinIndexList(values []int) string {
+	var b strings.Builder
+	for i, value := range values {
+		if i > 0 {
+			b.WriteByte(',')
+		}
+		b.WriteString(strconv.Itoa(value))
+	}
+	return b.String()
+}
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -0,0 +1,563 @@
+package platform
+
+import (
+	"archive/tar"
+	"bytes"
+	"compress/gzip"
+	"context"
+	"encoding/csv"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync"
+	"syscall"
+	"time"
+)
+
+// PlatformStressCycle defines one load+idle cycle.
+// RunPlatformStress turns each duration into a context timeout, so a value of
+// zero or less ends that phase immediately.
+type PlatformStressCycle struct {
+	LoadSec int // seconds of simultaneous CPU+GPU stress
+	IdleSec int // seconds of idle monitoring after load cut
+}
+
+// PlatformStressOptions controls the thermal cycling test.
+type PlatformStressOptions struct {
+	// Cycles are executed in order; RunPlatformStress rejects an empty list.
+	Cycles     []PlatformStressCycle
+	Components []string // if empty: run all; values: "cpu", "gpu"
+}
+
+// platformStressRow is one second of telemetry.
+// Rows are produced by sampleToPlatformRow from a LiveMetricSample.
+type platformStressRow struct {
+	// ElapsedSec is the time since the start of the whole test, in seconds.
+	ElapsedSec   float64
+	// Cycle is the 1-based cycle number the sample belongs to.
+	Cycle        int
+	Phase        string // "load" | "idle"
+	CPULoadPct   float64
+	// MaxCPUTempC and MaxGPUTempC are the hottest readings of their group.
+	MaxCPUTempC  float64
+	MaxGPUTempC  float64
+	SysPowerW    float64
+	// FanMinRPM and FanMaxRPM are the slowest and fastest fan; both stay 0
+	// when the sample has no fan readings.
+	FanMinRPM    float64
+	FanMaxRPM    float64
+	// NOTE(review): sampleToPlatformRow does not assign this field, so it
+	// appears to always be false — confirm whether another writer exists.
+	GPUThrottled bool
+}
+
+// RunPlatformStress runs repeated load+idle thermal cycling.
+// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
+// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
+//
+// Telemetry is sampled once per second during both phases and every row is
+// passed to logFunc (a nil logFunc is replaced by a no-op). When all cycles
+// are done, or ctx is cancelled, the rows are written to metrics.csv and a
+// per-cycle analysis to summary.txt inside a new timestamped directory under
+// baseDir; the path of that directory is returned.
+//
+// An error is returned only when opts has no cycles or a directory cannot be
+// created. Failures to write the two report files are reported through
+// logFunc and do not fail the run.
+func (s *System) RunPlatformStress(
+	ctx context.Context,
+	baseDir string,
+	opts PlatformStressOptions,
+	logFunc func(string),
+) (string, error) {
+	if logFunc == nil {
+		logFunc = func(string) {}
+	}
+	if len(opts.Cycles) == 0 {
+		return "", fmt.Errorf("no cycles defined")
+	}
+	if err := os.MkdirAll(baseDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
+	}
+
+	stamp := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir run dir: %w", err)
+	}
+
+	// An empty component list selects everything.
+	hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
+	hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
+
+	vendor := s.DetectGPUVendor()
+	logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
+
+	var rows []platformStressRow
+	start := time.Now()
+
+	var analyses []cycleAnalysis
+
+	for i, cycle := range opts.Cycles {
+		if ctx.Err() != nil {
+			break
+		}
+		cycleNum := i + 1
+		logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
+
+		// Load phase: the stress workers share loadCtx, so they are stopped
+		// together when the load window expires.
+		loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
+		var wg sync.WaitGroup
+
+		// CPU stress
+		if hasCPU {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				cpuCmd, err := buildCPUStressCmd(loadCtx)
+				if err != nil {
+					logFunc("CPU stress: " + err.Error())
+					return
+				}
+				_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
+			}()
+		}
+
+		// GPU stress
+		if hasGPU {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
+				if gpuCmd == nil {
+					return
+				}
+				_ = gpuCmd.Wait() // the exit status of a killed worker is not meaningful
+			}()
+		}
+
+		// Sampling runs on this goroutine and blocks until loadCtx is done.
+		loadRows := collectPhase(loadCtx, cycleNum, "load", start)
+		for _, r := range loadRows {
+			logFunc(formatPlatformRow(r))
+		}
+		rows = append(rows, loadRows...)
+		loadCancel()
+		wg.Wait()
+
+		if len(loadRows) > 0 {
+			logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
+		}
+
+		// Idle phase: keep sampling with no load running.
+		idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
+		idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
+		for _, r := range idleRows {
+			logFunc(formatPlatformRow(r))
+		}
+		rows = append(rows, idleRows...)
+		idleCancel()
+
+		// Per-cycle analysis
+		an := analyzePlatformCycle(loadRows, idleRows)
+		analyses = append(analyses, an)
+		logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
+			cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
+	}
+
+	// Write CSV. A failure is logged rather than returned so the summary is
+	// still produced.
+	csvData := writePlatformCSV(rows)
+	if err := os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644); err != nil {
+		logFunc(fmt.Sprintf("write metrics.csv: %v", err))
+	}
+
+	// Write summary
+	summary := writePlatformSummary(opts, analyses)
+	logFunc("--- Summary ---")
+	for _, line := range strings.Split(summary, "\n") {
+		if line != "" {
+			logFunc(line)
+		}
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
+		logFunc(fmt.Sprintf("write summary.txt: %v", err))
+	}
+
+	return runDir, nil
+}
+
+// collectPhase takes one live-metrics sample per second, tagged with the
+// given cycle and phase, and returns the collected rows once ctx is done.
+func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
+	tick := time.NewTicker(time.Second)
+	defer tick.Stop()
+
+	done := ctx.Done()
+	var collected []platformStressRow
+	for {
+		select {
+		case <-done:
+			return collected
+		case <-tick.C:
+			collected = append(collected, sampleToPlatformRow(SampleLiveMetrics(), cycle, phase, testStart))
+		}
+	}
+}
+
+// sampleToPlatformRow condenses a live-metrics sample into one telemetry row:
+// elapsed time since testStart, CPU load, system power, the hottest CPU and
+// GPU readings, and the slowest and fastest fan.
+//
+// NOTE(review): GPUThrottled is not assigned here, so rows built by this
+// function always report it as false.
+func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
+	row := platformStressRow{
+		ElapsedSec: time.Since(testStart).Seconds(),
+		Cycle:      cycle,
+		Phase:      phase,
+		CPULoadPct: s.CPULoadPct,
+		SysPowerW:  s.PowerW,
+	}
+
+	for _, reading := range s.Temps {
+		if reading.Group == "cpu" && reading.Celsius > row.MaxCPUTempC {
+			row.MaxCPUTempC = reading.Celsius
+		} else if reading.Group == "gpu" && reading.Celsius > row.MaxGPUTempC {
+			row.MaxGPUTempC = reading.Celsius
+		}
+	}
+	// Per-GPU temperatures compete with the "gpu" sensor group.
+	for _, gpu := range s.GPUs {
+		if gpu.TempC > row.MaxGPUTempC {
+			row.MaxGPUTempC = gpu.TempC
+		}
+	}
+
+	for i, fan := range s.Fans {
+		if i == 0 {
+			// The first fan seeds both extremes.
+			row.FanMinRPM, row.FanMaxRPM = fan.RPM, fan.RPM
+			continue
+		}
+		if fan.RPM < row.FanMinRPM {
+			row.FanMinRPM = fan.RPM
+		}
+		if fan.RPM > row.FanMaxRPM {
+			row.FanMaxRPM = fan.RPM
+		}
+	}
+	return row
+}
+
+// formatPlatformRow renders a telemetry row as a single log line. The fan
+// range is appended only when FanMinRPM is positive, and " THROTTLE" only
+// when the row is flagged as throttled.
+func formatPlatformRow(r platformStressRow) string {
+	var fans, throttle string
+	if r.FanMinRPM > 0 {
+		fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
+	}
+	if r.GPUThrottled {
+		throttle = " THROTTLE"
+	}
+	return fmt.Sprintf(
+		"[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
+		r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW,
+		fans, throttle,
+	)
+}
+
+// analyzePlatformCycle summarises one cycle.
+//
+// From the load rows it takes the peak CPU temperature, GPU temperature and
+// system power, and whether any row was flagged as throttled. Fan behaviour
+// is judged by comparing the mean fan speed over the last (up to) five load
+// rows with the lowest mean fan speed seen during the first 15 seconds of the
+// idle phase; the relative drop is reported in percent. Rows without a fan
+// reading are ignored for the fan figures.
+func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
+	var res cycleAnalysis
+
+	for i := range loadRows {
+		row := &loadRows[i]
+		if row.MaxCPUTempC > res.maxCPUTemp {
+			res.maxCPUTemp = row.MaxCPUTempC
+		}
+		if row.MaxGPUTempC > res.maxGPUTemp {
+			res.maxGPUTemp = row.MaxGPUTempC
+		}
+		if row.SysPowerW > res.maxPower {
+			res.maxPower = row.SysPowerW
+		}
+		res.throttled = res.throttled || row.GPUThrottled
+	}
+
+	// Fan speed at load cut: mean over the tail of the load phase.
+	tail := loadRows
+	if len(tail) > 5 {
+		tail = tail[len(tail)-5:]
+	}
+	var rpmSum float64
+	samples := 0
+	for _, row := range tail {
+		if row.FanMinRPM > 0 {
+			rpmSum += (row.FanMinRPM + row.FanMaxRPM) / 2
+			samples++
+		}
+	}
+	if samples > 0 {
+		res.fanAtCutAvg = rpmSum / float64(samples)
+	}
+
+	// Lowest fan speed within 15s after the last load sample.
+	res.fanMin15s = res.fanAtCutAvg
+	var cutAt float64
+	if n := len(loadRows); n > 0 {
+		cutAt = loadRows[n-1].ElapsedSec
+	}
+	for _, row := range idleRows {
+		if row.ElapsedSec > cutAt+15 {
+			break
+		}
+		mid := (row.FanMinRPM + row.FanMaxRPM) / 2
+		if mid > 0 && (res.fanMin15s == 0 || mid < res.fanMin15s) {
+			res.fanMin15s = mid
+		}
+	}
+
+	if res.fanAtCutAvg > 0 {
+		res.fanDropPct = (res.fanAtCutAvg - res.fanMin15s) / res.fanAtCutAvg * 100
+	}
+	return res
+}
+
+// cycleAnalysis holds the figures analyzePlatformCycle derives for one
+// load/idle cycle.
+type cycleAnalysis struct {
+	maxCPUTemp  float64 // highest MaxCPUTempC over the load rows
+	maxGPUTemp  float64 // highest MaxGPUTempC over the load rows
+	maxPower    float64 // highest SysPowerW over the load rows
+	throttled   bool    // true if any load row had GPUThrottled set
+	fanAtCutAvg float64 // mean fan (min+max)/2 over the last load rows (at most five); 0 without fan data
+	fanMin15s   float64 // starts at fanAtCutAvg, lowered to the smallest idle fan midpoint within 15 s of the last load row
+	fanDropPct  float64 // (fanAtCutAvg - fanMin15s) / fanAtCutAvg * 100; left at 0 when fanAtCutAvg is 0
+}
+
+// writePlatformSummary renders the human-readable report of a thermal cycling
+// run: one section per analysed cycle followed by an overall verdict.
+// Throttling in any cycle yields FAIL; otherwise a fan drop above 20% in any
+// cycle yields WARN; otherwise the run is reported as PASS.
+func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
+	var out strings.Builder
+	planned := len(opts.Cycles)
+	fmt.Fprintf(&out, "Platform Thermal Cycling — %d cycle(s)\n", planned)
+	fmt.Fprintf(&out, "%s\n\n", strings.Repeat("=", 48))
+
+	var throttledCycles, fanWarnCycles int
+	for idx := range analyses {
+		res := analyses[idx]
+		plan := opts.Cycles[idx]
+		fmt.Fprintf(&out, "Cycle %d/%d (load=%ds, idle=%ds)\n", idx+1, planned, plan.LoadSec, plan.IdleSec)
+		fmt.Fprintf(&out, "  Max CPU temp: %.1f°C\n", res.maxCPUTemp)
+		fmt.Fprintf(&out, "  Max GPU temp: %.1f°C\n", res.maxGPUTemp)
+		fmt.Fprintf(&out, "  Max sys power: %.0f W\n", res.maxPower)
+		if !res.throttled {
+			fmt.Fprintf(&out, "  Throttle: none\n")
+		} else {
+			fmt.Fprintf(&out, "  Throttle: DETECTED\n")
+			throttledCycles++
+		}
+		// Fan lines are only meaningful when fan data was captured at load cut.
+		if res.fanAtCutAvg > 0 {
+			fmt.Fprintf(&out, "  Fan at load cut: %.0f RPM avg\n", res.fanAtCutAvg)
+			fmt.Fprintf(&out, "  Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", res.fanMin15s, res.fanDropPct)
+			if res.fanDropPct > 20 {
+				fmt.Fprintf(&out, "  Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
+				fanWarnCycles++
+			} else {
+				fmt.Fprintf(&out, "  Fan response: OK\n")
+			}
+		}
+		out.WriteString("\n")
+	}
+
+	fmt.Fprintf(&out, "%s\n", strings.Repeat("=", 48))
+	switch {
+	case throttledCycles > 0:
+		fmt.Fprintf(&out, "Overall: FAIL — throttle detected in %d/%d cycles\n", throttledCycles, len(analyses))
+	case fanWarnCycles > 0:
+		fmt.Fprintf(&out, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", fanWarnCycles, len(analyses))
+	default:
+		fmt.Fprintf(&out, "Overall: PASS\n")
+	}
+	return out.String()
+}
+
+// writePlatformCSV serialises the telemetry rows as CSV: a header line
+// followed by one record per row. Temperatures, load and power are written
+// with one decimal, fan speeds with none, and the throttle flag as "0"/"1".
+func writePlatformCSV(rows []platformStressRow) []byte {
+	var out bytes.Buffer
+	cw := csv.NewWriter(&out)
+
+	oneDecimal := func(v float64) string { return strconv.FormatFloat(v, 'f', 1, 64) }
+	noDecimal := func(v float64) string { return strconv.FormatFloat(v, 'f', 0, 64) }
+
+	header := []string{
+		"elapsed_sec", "cycle", "phase",
+		"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
+		"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
+	}
+	// Write errors are ignored: the destination is an in-memory buffer.
+	_ = cw.Write(header)
+	for i := range rows {
+		row := &rows[i]
+		flag := "0"
+		if row.GPUThrottled {
+			flag = "1"
+		}
+		record := []string{
+			oneDecimal(row.ElapsedSec),
+			strconv.Itoa(row.Cycle),
+			row.Phase,
+			oneDecimal(row.CPULoadPct),
+			oneDecimal(row.MaxCPUTempC),
+			oneDecimal(row.MaxGPUTempC),
+			oneDecimal(row.SysPowerW),
+			noDecimal(row.FanMinRPM),
+			noDecimal(row.FanMaxRPM),
+			flag,
+		}
+		_ = cw.Write(record)
+	}
+	cw.Flush()
+	return out.Bytes()
+}
+
+// buildCPUStressCmd resolves stressapptest, starts it and returns the
+// already-started command. The process is given a very long run time and is
+// meant to be stopped through ctx; cancellation kills its whole process group.
+//
+// An error is returned when stressapptest cannot be located or fails to start.
+func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
+	path, err := satLookPath("stressapptest")
+	if err != nil {
+		return nil, fmt.Errorf("stressapptest not found: %w", err)
+	}
+	// Use a very long duration; the context timeout will kill it at the right time.
+	cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
+	// Thread count (-m) and memory size (-M) are passed only when the helpers
+	// report a positive value; otherwise stressapptest's own defaults apply.
+	if threads := platformStressCPUThreads(); threads > 0 {
+		cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
+	}
+	if mb := platformStressMemoryMB(); mb > 0 {
+		cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
+	}
+	cmd := exec.CommandContext(ctx, path, cmdArgs...)
+	// Run in a separate process group so Cancel can signal the group as a
+	// whole (negative PID) rather than only the direct child.
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
+	// Output of the stress tool is discarded.
+	cmd.Stdout = nil
+	cmd.Stderr = nil
+	// Started with a nice value of 15.
+	if err := startLowPriorityCmd(cmd, 15); err != nil {
+		return nil, fmt.Errorf("stressapptest start: %w", err)
+	}
+	return cmd, nil
+}
+
+// buildGPUStressCmd dispatches to the vendor-specific GPU stress launcher.
+// The vendor name is matched case-insensitively against "amd" and "nvidia".
+// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
+func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
+	normalized := strings.ToLower(vendor)
+	if normalized == "amd" {
+		return buildAMDGPUStressCmd(ctx, durSec)
+	}
+	if normalized == "nvidia" {
+		return buildNvidiaGPUStressCmd(ctx, durSec)
+	}
+	return nil
+}
+
+// buildAMDGPUStressCmd writes an RVS "gst" configuration that runs for durSec
+// seconds on all devices and starts rvs against it with a nice value of 10.
+//
+// It returns nil when rvs cannot be resolved, when the configuration file
+// cannot be written, or when the process fails to start. A non-nil result
+// therefore always refers to a command that has actually been started, so the
+// caller can safely Wait on it.
+func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
+	rvsArgs, err := resolveRVSCommand()
+	if err != nil || len(rvsArgs) == 0 {
+		return nil
+	}
+	rvsPath := rvsArgs[0]
+	// RVS expects the duration in milliseconds.
+	cfg := fmt.Sprintf(`actions:
+- name: gst_platform
+  device: all
+  module: gst
+  parallel: true
+  duration: %d`, durSec*1000) + `
+  copy_matrix: false
+  target_stress: 90
+  matrix_size_a: 8640
+  matrix_size_b: 8640
+  matrix_size_c: 8640
+`
+	cfgFile := "/tmp/bee-platform-gst.conf"
+	if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil {
+		// Without a fresh config rvs would either fail or run a stale one
+		// with the wrong duration; treat the GPU load as unavailable.
+		return nil
+	}
+	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
+	// Own process group so Cancel can kill rvs together with its children.
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
+	cmd.Stdout = nil
+	cmd.Stderr = nil
+	if err := startLowPriorityCmd(cmd, 10); err != nil {
+		// Never hand back a command that was not started.
+		return nil
+	}
+	return cmd
+}
+
+// buildNvidiaGPUStressCmd starts the NVIDIA GPU load for durSec seconds with a
+// nice value of 10. bee-gpu-burn is preferred; bee-gpu-stress is used as a
+// fallback when bee-gpu-burn cannot be found.
+//
+// It returns nil when neither tool is available or when the process fails to
+// start, so a non-nil result always refers to a started command.
+func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
+	path, err := satLookPath("bee-gpu-burn")
+	if err != nil {
+		path, err = satLookPath("bee-gpu-stress")
+	}
+	if err != nil {
+		return nil
+	}
+	// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
+	// Process group kill via Setpgid+Cancel is kept as a safety net for cases
+	// where the context is cancelled early (user stop, parent timeout).
+	cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
+	cmd.Stdout = nil
+	cmd.Stderr = nil
+	if err := startLowPriorityCmd(cmd, 10); err != nil {
+		// Never hand back a command that was not started.
+		return nil
+	}
+	return cmd
+}
+
+// startLowPriorityCmd starts cmd and then tries to apply the given nice value
+// to the new process. A failure to start is returned; a failure to change the
+// priority is ignored because the command is still usable.
+func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
+	err := cmd.Start()
+	if err == nil && cmd.Process != nil {
+		_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
+	}
+	return err
+}
+
+// platformStressCPUThreads returns the number of stress threads to request.
+// A positive BEE_PLATFORM_STRESS_THREADS value wins; otherwise the count is
+// derived from runtime.NumCPU: 1 thread up to 2 CPUs, one CPU of headroom up
+// to 8 CPUs, and two CPUs of headroom above that.
+func platformStressCPUThreads() int {
+	if override := envInt("BEE_PLATFORM_STRESS_THREADS", 0); override > 0 {
+		return override
+	}
+	total := runtime.NumCPU()
+	if total <= 2 {
+		return 1
+	}
+	if total <= 8 {
+		return total - 1
+	}
+	return total - 2
+}
+
+// platformStressMemoryMB returns the amount of memory, in MiB, to hand to the
+// stress tool. A positive BEE_PLATFORM_STRESS_MB value wins. Otherwise 60% of
+// the value reported by freeMemBytes is used, with a floor of 1024; 0 is
+// returned when no free-memory figure is available.
+func platformStressMemoryMB() int {
+	if override := envInt("BEE_PLATFORM_STRESS_MB", 0); override > 0 {
+		return override
+	}
+	avail := freeMemBytes()
+	if avail <= 0 {
+		return 0
+	}
+	target := int((avail * 60) / 100 / (1024 * 1024))
+	if target >= 1024 {
+		return target
+	}
+	return 1024
+}
+
+// containsComponent reports whether name occurs in components. The comparison
+// is exact (case-sensitive).
+func containsComponent(components []string, name string) bool {
+	for i := 0; i < len(components); i++ {
+		if components[i] == name {
+			return true
+		}
+	}
+	return false
+}
+
+// packPlatformDir writes the non-directory entries found directly inside dir
+// to dest as a gzip-compressed tar archive. Entries are stored under the base
+// name of dir with mode 0644 and the current time as modification time.
+//
+// Files that cannot be read are left out of the archive instead of aborting
+// it. Errors from finalising the tar stream, the gzip stream or the output
+// file are returned, because a failed Close means the archive on disk is
+// truncated even though every Write succeeded.
+func packPlatformDir(dir, dest string) (err error) {
+	f, err := os.Create(dest)
+	if err != nil {
+		return err
+	}
+	// Deferred closers run in reverse order (tar, gzip, file) and report the
+	// first failure unless an earlier error is already being returned.
+	defer func() {
+		if cerr := f.Close(); err == nil {
+			err = cerr
+		}
+	}()
+	gz := gzip.NewWriter(f)
+	defer func() {
+		if cerr := gz.Close(); err == nil {
+			err = cerr
+		}
+	}()
+	tw := tar.NewWriter(gz)
+	defer func() {
+		if cerr := tw.Close(); err == nil {
+			err = cerr
+		}
+	}()
+
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return err
+	}
+	base := filepath.Base(dir)
+	for _, e := range entries {
+		if e.IsDir() {
+			continue
+		}
+		fpath := filepath.Join(dir, e.Name())
+		data, readErr := os.ReadFile(fpath)
+		if readErr != nil {
+			// Best effort: one unreadable file must not lose the whole export.
+			continue
+		}
+		hdr := &tar.Header{
+			Name:    filepath.Join(base, e.Name()),
+			Size:    int64(len(data)),
+			Mode:    0644,
+			ModTime: time.Now(),
+		}
+		if err := tw.WriteHeader(hdr); err != nil {
+			return err
+		}
+		if _, err := tw.Write(data); err != nil {
+			return err
+		}
+	}
+	return nil
+}
--- a/audit/internal/platform/platform_stress_test.go
+++ b/audit/internal/platform/platform_stress_test.go
@@ -0,0 +1,34 @@
+package platform
+
+import (
+	"runtime"
+	"testing"
+)
+
+// TestPlatformStressCPUThreadsOverride checks that a positive
+// BEE_PLATFORM_STRESS_THREADS value is returned as the thread count.
+func TestPlatformStressCPUThreadsOverride(t *testing.T) {
+	t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
+	got := platformStressCPUThreads()
+	if got == 7 {
+		return
+	}
+	t.Fatalf("platformStressCPUThreads=%d want 7", got)
+}
+
+// TestPlatformStressCPUThreadsDefaultLeavesHeadroom checks the default thread
+// count with the override cleared: at least 1, never above NumCPU, and
+// strictly below NumCPU on machines with more than two CPUs.
+func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
+	t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
+	got := platformStressCPUThreads()
+	switch {
+	case got < 1:
+		t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
+	case got > runtime.NumCPU():
+		t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
+	case runtime.NumCPU() > 2 && got >= runtime.NumCPU():
+		t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
+	}
+}
+
+// TestPlatformStressMemoryMBOverride checks that a positive
+// BEE_PLATFORM_STRESS_MB value is returned as the memory size.
+func TestPlatformStressMemoryMBOverride(t *testing.T) {
+	t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
+	got := platformStressMemoryMB()
+	if got == 8192 {
+		return
+	}
+	t.Fatalf("platformStressMemoryMB=%d want 8192", got)
+}
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -1,6 +1,7 @@
 package platform

 import (
+	"bufio"
 	"os"
 	"os/exec"
 	"strings"
@@ -114,6 +115,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
 	}

 	s.collectGPURuntimeHealth(vendor, &health)
+	s.collectToRAMHealth(&health)
+	s.collectUSBExportHealth(&health)

 	if health.Status != "FAILED" && len(health.Issues) > 0 {
 		health.Status = "PARTIAL"
@@ -135,9 +138,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
 	case "nvidia":
 		tools = append(tools, s.CheckTools([]string{
 			"nvidia-smi",
+			"dcgmi",
+			"nv-hostengine",
 			"nvidia-bug-report.sh",
-			"bee-gpu-stress",
+			"bee-gpu-burn",
+			"bee-john-gpu-stress",
+			"bee-nccl-gpu-stress",
+			"all_reduce_perf",
 		})...)
+		tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
 	case "amd":
 		tool := ToolStatus{Name: "rocm-smi"}
 		if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
@@ -152,11 +161,127 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
 	return tools
 }

+// resolvedToolStatus reports a tool under the name display, marking it OK with
+// the resolved path of the first candidate binary found on PATH. When no
+// candidate resolves, only the name is filled in.
+func resolvedToolStatus(display string, candidates ...string) ToolStatus {
+	status := ToolStatus{Name: display}
+	for i := range candidates {
+		resolved, err := exec.LookPath(candidates[i])
+		if err != nil {
+			continue
+		}
+		status.Path = resolved
+		status.OK = true
+		break
+	}
+	return status
+}
+
+// collectToRAMHealth records whether the live medium is running from RAM.
+// ToRAMStatus is set to "ok" when the medium is in RAM, "warning" when toram
+// is not active (no copy attempted), and "failed" when toram was requested but
+// the medium is not in RAM (copy failed or still in progress). The "failed"
+// case also appends a warning-severity issue.
+func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
+	mediumInRAM := s.IsLiveMediaInRAM()
+	toramRequested := toramActive()
+	if mediumInRAM {
+		health.ToRAMStatus = "ok"
+		return
+	}
+	if !toramRequested {
+		health.ToRAMStatus = "warning"
+		return
+	}
+	// toram was requested but medium is not yet/no longer in RAM
+	health.ToRAMStatus = "failed"
+	issue := schema.RuntimeIssue{
+		Code:        "toram_copy_failed",
+		Severity:    "warning",
+		Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
+	}
+	health.Issues = append(health.Issues, issue)
+}
+
+// collectUSBExportHealth stores in USBExportPath the mount point reported by
+// findUSBExportMount: the first writable USB-backed filesystem suitable for
+// log export, or "" when there is none.
+func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
+	exportPath := findUSBExportMount()
+	health.USBExportPath = exportPath
+}
+
+// findUSBExportMount returns the mount point of the first writable USB filesystem
+// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs, ntfs3, fuseblk) whose backing
+// block device has USB transport. Returns "" if none is found or /proc/mounts
+// cannot be opened.
+//
+// The kernel writes space, tab, newline and backslash inside mount points as
+// octal escapes (for example "\040" for a space). The returned path has these
+// decoded so it can be passed straight to the os package.
+func findUSBExportMount() string {
+	f, err := os.Open("/proc/mounts")
+	if err != nil {
+		return ""
+	}
+	defer f.Close()
+
+	// fs types that are expected on USB export drives
+	exportFSTypes := map[string]bool{
+		"vfat":    true,
+		"exfat":   true,
+		"ext2":    true,
+		"ext3":    true,
+		"ext4":    true,
+		"ntfs":    true,
+		"ntfs3":   true,
+		"fuseblk": true,
+	}
+
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		// fields: device mountpoint fstype options dump pass
+		fields := strings.Fields(scanner.Text())
+		if len(fields) < 4 {
+			continue
+		}
+		device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
+		if !exportFSTypes[strings.ToLower(fsType)] {
+			continue
+		}
+		// Skip read-only mounts
+		opts := strings.Split(options, ",")
+		readOnly := false
+		for _, o := range opts {
+			if strings.TrimSpace(o) == "ro" {
+				readOnly = true
+				break
+			}
+		}
+		if readOnly {
+			continue
+		}
+		// Check USB transport via lsblk on the device (or its parent disk for partitions).
+		if !strings.HasPrefix(device, "/dev/") {
+			continue
+		}
+		checkDev := device
+		// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
+		// Strip trailing partition digits to get the parent disk name.
+		// NOTE(review): this matches sdX-style names; "pN" partition suffixes
+		// (nvme0n1p1, mmcblk0p1) are not mapped to their parent disk here.
+		if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
+			checkDev = trimmed
+		}
+		if blockDeviceTransport(checkDev) == "usb" {
+			return decodeProcMountsField(mountPoint)
+		}
+	}
+	return ""
+}
+
+// decodeProcMountsField replaces the three-digit octal escapes used in
+// /proc/mounts fields (such as \040 for space) with the bytes they stand for.
+// Backslashes not followed by three octal digits are kept unchanged.
+func decodeProcMountsField(field string) string {
+	if !strings.Contains(field, `\`) {
+		return field
+	}
+	isOctal := func(c byte) bool { return c >= '0' && c <= '7' }
+	var b strings.Builder
+	b.Grow(len(field))
+	for i := 0; i < len(field); i++ {
+		if field[i] == '\\' && i+3 < len(field) &&
+			isOctal(field[i+1]) && isOctal(field[i+2]) && isOctal(field[i+3]) {
+			v := int(field[i+1]-'0')<<6 | int(field[i+2]-'0')<<3 | int(field[i+3]-'0')
+			if v <= 0xFF {
+				b.WriteByte(byte(v))
+				i += 3
+				continue
+			}
+		}
+		b.WriteByte(field[i])
+	}
+	return b.String()
+}
+
 func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
 	lsmodText := commandText("lsmod")

 	switch vendor {
 	case "nvidia":
+		if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
+			health.NvidiaGSPMode = strings.TrimSpace(string(raw))
+			if health.NvidiaGSPMode == "gsp-stuck" {
+				health.Issues = append(health.Issues, schema.RuntimeIssue{
+					Code:        "nvidia_gsp_stuck",
+					Severity:    "critical",
+					Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
+				})
+			} else if health.NvidiaGSPMode == "gsp-off" {
+				health.Issues = append(health.Issues, schema.RuntimeIssue{
+					Code:        "nvidia_gsp_disabled",
+					Severity:    "warning",
+					Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
+				})
+			}
+		}
 		health.DriverReady = strings.Contains(lsmodText, "nvidia ")
 		if !health.DriverReady {
 			health.Issues = append(health.Issues, schema.RuntimeIssue{
@@ -176,8 +301,8 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
 			health.DriverReady = true
 		}

-		if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
-			out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
+		if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
+			out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
 			if err == nil {
 				health.CUDAReady = true
 			} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -2,10 +2,12 @@ package platform

 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -18,7 +20,7 @@ type FanStressOptions struct {
 	Phase1DurSec int   // first load phase duration in seconds (default 300)
 	PauseSec     int   // pause between the two load phases (default 60)
 	Phase2DurSec int   // second load phase duration in seconds (default 300)
-	SizeMB       int   // GPU memory to allocate per GPU during stress (default 64)
+	SizeMB       int   // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
 	GPUIndices   []int // which GPU indices to stress (empty = all detected)
 }

@@ -49,6 +51,18 @@ type FanStressRow struct {
 	SysPowerW    float64 // DCMI system power reading
 }

+// cachedPowerReading is a system power sample together with the time it was
+// recorded.
+type cachedPowerReading struct {
+	Value     float64
+	UpdatedAt time.Time
+}
+
+// systemPowerCache is the package-wide cache updated by sampleSystemPower;
+// systemPowerCacheMu is held while it is read and written.
+var (
+	systemPowerCacheMu sync.Mutex
+	systemPowerCache   cachedPowerReading
+)
+
+// systemPowerHoldTTL is how long a cached positive reading may stand in for a
+// missing (zero) sample before effectiveSystemPowerReading reports 0 again.
+const systemPowerHoldTTL = 15 * time.Second
+
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -128,26 +142,21 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
 		stats.OK++
 	}

-	// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
+	// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
 	loadPhase := func(phaseName, stepName string, durSec int) {
 		if ctx.Err() != nil {
 			return
 		}
 		setPhase(phaseName)
-		var env []string
-		if len(opts.GPUIndices) > 0 {
-			ids := make([]string, len(opts.GPUIndices))
-			for i, idx := range opts.GPUIndices {
-				ids[i] = strconv.Itoa(idx)
-			}
-			env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
-		}
 		cmd := []string{
-			"bee-gpu-stress",
+			"bee-gpu-burn",
 			"--seconds", strconv.Itoa(durSec),
 			"--size-mb", strconv.Itoa(opts.SizeMB),
 		}
-		out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
+		if len(opts.GPUIndices) > 0 {
+			cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
+		}
+		out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
 		_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
 		if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
 			fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
@@ -214,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
 		return "", err
 	}

-	archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", err
-	}
-	return archive, nil
+	return runDir, nil
 }

 func applyFanStressDefaults(opts *FanStressOptions) {
@@ -234,9 +239,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
 	if opts.Phase2DurSec <= 0 {
 		opts.Phase2DurSec = 300
 	}
-	if opts.SizeMB <= 0 {
-		opts.SizeMB = 64
-	}
+	// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
+	// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
 }

 // sampleFanStressRow collects all metrics for one telemetry sample.
@@ -304,41 +308,148 @@ func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
 // sampleFanSpeeds reads fan RPM values from ipmitool sdr.
 func sampleFanSpeeds() ([]FanReading, error) {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
+	if err == nil {
+		if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
+			return fans, nil
+		}
+	}
+	fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
+	if len(fans) > 0 {
+		return fans, nil
+	}
 	if err != nil {
 		return nil, err
 	}
-	return parseFanSpeeds(string(out)), nil
+	return nil, sensorsErr
 }

 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
-// Line format: "FAN1             | 2400.000   | RPM        | ok"
+// Handles two formats:
+//
+//	Old: "FAN1 | 2400.000 | RPM | ok"           (value in col[1], unit in col[2])
+//	New: "FAN1 | 41h | ok | 29.1 | 4340 RPM"   (value+unit combined in last col)
 func parseFanSpeeds(raw string) []FanReading {
 	var fans []FanReading
 	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
 		parts := strings.Split(line, "|")
-		if len(parts) < 3 {
+		if len(parts) < 2 {
 			continue
 		}
-		unit := strings.TrimSpace(parts[2])
-		if !strings.EqualFold(unit, "RPM") {
+		name := strings.TrimSpace(parts[0])
+		// Find the first field that contains "RPM" (either as a standalone unit or inline)
+		rpmVal := 0.0
+		found := false
+		for _, p := range parts[1:] {
+			p = strings.TrimSpace(p)
+			if !strings.Contains(strings.ToUpper(p), "RPM") {
+				continue
+			}
+			if strings.EqualFold(p, "RPM") {
+				continue // unit-only column in old format; value is in previous field
+			}
+			val, err := parseFanRPMValue(p)
+			if err == nil {
+				rpmVal = val
+				found = true
+				break
+			}
+		}
+		// Old format: unit "RPM" is in col[2], value is in col[1]
+		if !found && len(parts) >= 3 && strings.EqualFold(strings.TrimSpace(parts[2]), "RPM") {
+			valStr := strings.TrimSpace(parts[1])
+			if !strings.EqualFold(valStr, "na") && !strings.EqualFold(valStr, "disabled") && valStr != "" {
+				if val, err := parseFanRPMValue(valStr); err == nil {
+					rpmVal = val
+					found = true
+				}
+			}
+		}
+		if !found {
 			continue
 		}
-		valStr := strings.TrimSpace(parts[1])
-		if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
-			continue
-		}
-		val, err := strconv.ParseFloat(valStr, 64)
-		if err != nil {
-			continue
-		}
-		fans = append(fans, FanReading{
-			Name: strings.TrimSpace(parts[0]),
-			RPM:  val,
-		})
+		fans = append(fans, FanReading{Name: name, RPM: rpmVal})
 	}
 	return fans
 }

+// parseFanRPMValue parses the leading number of a fan reading such as
+// "4340 RPM" or "2,400.000". Commas are removed before parsing, and anything
+// after the first whitespace-separated token is ignored. An input without any
+// token yields strconv.ErrSyntax.
+func parseFanRPMValue(raw string) (float64, error) {
+	withoutCommas := strings.ReplaceAll(raw, ",", "")
+	tokens := strings.Fields(withoutCommas)
+	if len(tokens) < 1 {
+		return 0, strconv.ErrSyntax
+	}
+	return strconv.ParseFloat(tokens[0], 64)
+}
+
+// sampleFanSpeedsViaSensorsJSON reads fan speeds from `sensors -j`.
+// Chips and their features are visited in sorted order so the result is stable
+// between calls. Features without a positive fan input value are skipped, and
+// each reading is labelled "<chip> / <feature>" unless the feature name already
+// contains the chip name. Duplicate labels are reported once.
+func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
+	raw, err := exec.Command("sensors", "-j").Output()
+	if err != nil || len(raw) == 0 {
+		return nil, err
+	}
+	var parsed map[string]map[string]any
+	if err := json.Unmarshal(raw, &parsed); err != nil {
+		return nil, err
+	}
+
+	chipNames := make([]string, 0, len(parsed))
+	for chipName := range parsed {
+		chipNames = append(chipNames, chipName)
+	}
+	sort.Strings(chipNames)
+
+	var readings []FanReading
+	emitted := map[string]struct{}{}
+	for _, chipName := range chipNames {
+		chipFeatures := parsed[chipName]
+		featureNames := make([]string, 0, len(chipFeatures))
+		for featureName := range chipFeatures {
+			featureNames = append(featureNames, featureName)
+		}
+		sort.Strings(featureNames)
+
+		chipLower := strings.ToLower(chipName)
+		for _, featureName := range featureNames {
+			// Non-object entries (plain strings or numbers) carry no readings.
+			fields, isObject := chipFeatures[featureName].(map[string]any)
+			if !isObject {
+				continue
+			}
+			rpm, found := firstFanInputValue(fields)
+			if !found || rpm <= 0 {
+				continue
+			}
+			label := strings.TrimSpace(featureName)
+			if chipName != "" && !strings.Contains(strings.ToLower(label), chipLower) {
+				label = chipName + " / " + label
+			}
+			if _, dup := emitted[label]; dup {
+				continue
+			}
+			emitted[label] = struct{}{}
+			readings = append(readings, FanReading{Name: label, RPM: rpm})
+		}
+	}
+	return readings, nil
+}
+
+// firstFanInputValue returns the value of the first key, in sorted order, whose
+// lower-cased name contains "fan" and ends in "_input". The value may be a
+// float64 or a numeric string; entries of any other type, or strings that do
+// not parse, are passed over. The boolean is false when nothing matched.
+func firstFanInputValue(feature map[string]any) (float64, bool) {
+	names := make([]string, 0, len(feature))
+	for name := range feature {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+	for _, name := range names {
+		lowered := strings.ToLower(name)
+		if strings.Contains(lowered, "fan") && strings.HasSuffix(lowered, "_input") {
+			if num, isNum := feature[name].(float64); isNum {
+				return num, true
+			}
+			if text, isText := feature[name].(string); isText {
+				if parsed, err := strconv.ParseFloat(text, 64); err == nil {
+					return parsed, true
+				}
+			}
+		}
+	}
+	return 0, false
+}
+
 // sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
 func sampleCPUMaxTemp() float64 {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
@@ -404,11 +515,17 @@ func sampleCPUTempViaSensors() float64 {

 // sampleSystemPower reads system power draw via DCMI.
 func sampleSystemPower() float64 {
+	now := time.Now()
+	current := 0.0
 	out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
-	if err != nil {
-		return 0
+	if err == nil {
+		current = parseDCMIPowerReading(string(out))
 	}
-	return parseDCMIPowerReading(string(out))
+	systemPowerCacheMu.Lock()
+	defer systemPowerCacheMu.Unlock()
+	value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
+	systemPowerCache = updated
+	return value
 }

 // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -431,6 +548,17 @@ func parseDCMIPowerReading(raw string) float64 {
 	return 0
 }

+// effectiveSystemPowerReading decides which power value to report for a sample
+// taken at now and returns it together with the cache to keep. A positive
+// current reading is reported and replaces the cache. Otherwise a positive
+// cached value no older than systemPowerHoldTTL is reported and the cache is
+// kept; failing that, 0 is reported and the cache is left as it was.
+func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
+	switch {
+	case current > 0:
+		fresh := cachedPowerReading{Value: current, UpdatedAt: now}
+		return current, fresh
+	case cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL:
+		return cache.Value, cache
+	default:
+		return 0, cache
+	}
+}
+
 // analyzeThrottling returns true if any GPU reported an active throttle reason
 // during either load phase.
 func analyzeThrottling(rows []FanStressRow) bool {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -0,0 +1,67 @@
+package platform
+
+import (
+	"testing"
+	"time"
+)
+
+// TestParseFanSpeeds feeds parseFanSpeeds one line with a separate RPM unit
+// column, one with the unit inline, and one "na" reading that must be dropped.
+func TestParseFanSpeeds(t *testing.T) {
+	input := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
+	fans := parseFanSpeeds(input)
+	if n := len(fans); n != 2 {
+		t.Fatalf("fans=%d want 2 (%v)", n, fans)
+	}
+	if first := fans[0]; first.Name != "FAN1" || first.RPM != 2400 {
+		t.Fatalf("fan0=%+v", first)
+	}
+	if second := fans[1]; second.Name != "FAN2" || second.RPM != 1800 {
+		t.Fatalf("fan1=%+v", second)
+	}
+}
+
+// TestFirstFanInputValue checks that a float fanN_input entry is picked up.
+func TestFirstFanInputValue(t *testing.T) {
+	got, ok := firstFanInputValue(map[string]any{"fan1_input": 9200.0})
+	if got != 9200 || !ok {
+		t.Fatalf("got=%v ok=%v", got, ok)
+	}
+}
+
+// TestParseDCMIPowerReading checks that the instantaneous reading, not the
+// minimum of the sampling period, is extracted from the DCMI output.
+func TestParseDCMIPowerReading(t *testing.T) {
+	sample := `
+Instantaneous power reading:                   512 Watts
+Minimum during sampling period:               498 Watts
+`
+	got := parseDCMIPowerReading(sample)
+	if got == 512 {
+		return
+	}
+	t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
+}
+
+// TestEffectiveSystemPowerReading covers the three outcomes of
+// effectiveSystemPowerReading: a recent cached value bridging a missing
+// sample, a positive sample replacing the cache, and an expired cache
+// yielding 0.
+func TestEffectiveSystemPowerReading(t *testing.T) {
+	now := time.Now()
+	recent := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
+
+	// Missing sample inside the hold window: the cached value is served.
+	value, next := effectiveSystemPowerReading(recent, 0, now)
+	if value != 480 {
+		t.Fatalf("got=%v want cached 480", value)
+	}
+	if next.Value != 480 {
+		t.Fatalf("updated=%+v", next)
+	}
+
+	// Positive sample: reported and stored.
+	value, next = effectiveSystemPowerReading(recent, 530, now)
+	if value != 530 {
+		t.Fatalf("got=%v want 530", value)
+	}
+	if next.Value != 530 {
+		t.Fatalf("updated=%+v", next)
+	}
+
+	// Missing sample with a cache older than the TTL: nothing to report.
+	stale := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
+	if value, _ := effectiveSystemPowerReading(stale, 0, now); value != 0 {
+		t.Fatalf("expired cache returned %v want 0", value)
+	}
+}
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -1,22 +1,25 @@
 package platform

 import (
+	"context"
 	"errors"
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"testing"
+	"time"
 )

 func TestStorageSATCommands(t *testing.T) {
 	t.Parallel()

-	nvme := storageSATCommands("/dev/nvme0n1")
+	nvme := storageSATCommands("/dev/nvme0n1", false)
 	if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
 		t.Fatalf("unexpected nvme commands: %#v", nvme)
 	}

-	sata := storageSATCommands("/dev/sda")
+	sata := storageSATCommands("/dev/sda", false)
 	if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
 		t.Fatalf("unexpected sata commands: %#v", sata)
 	}
@@ -27,24 +30,68 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {

 	jobs := nvidiaSATJobs()

-	if len(jobs) != 5 {
-		t.Fatalf("jobs=%d want 5", len(jobs))
+	if len(jobs) != 6 {
+		t.Fatalf("jobs=%d want 6", len(jobs))
 	}
-	if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
-		t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
+	if got := jobs[0].cmd[0]; got != "nvidia-smi" {
+		t.Fatalf("preflight command=%q want nvidia-smi", got)
 	}
-	if got := jobs[3].cmd[1]; got != "--output-file" {
+	if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
+		t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
+	}
+	if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
+		t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
+	}
+	if got := jobs[4].cmd[1]; got != "--output-file" {
 		t.Fatalf("bug report flag=%q want --output-file", got)
 	}
 }

-func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
-	t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
-	t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
+func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
+	t.Parallel()

+	cfg := amdStressRVSConfig(123)
+	if !strings.Contains(cfg, "module: gst") {
+		t.Fatalf("config missing gst module:\n%s", cfg)
+	}
+	if strings.Contains(cfg, "module: mem") {
+		t.Fatalf("config should not include mem module:\n%s", cfg)
+	}
+	if !strings.Contains(cfg, "copy_matrix: false") {
+		t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
+	}
+	if strings.Count(cfg, "duration: 123000") != 1 {
+		t.Fatalf("config should apply duration once:\n%s", cfg)
+	}
+	for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
+		if !strings.Contains(cfg, field) {
+			t.Fatalf("config missing %s:\n%s", field, cfg)
+		}
+	}
+}
+
+// TestAMDStressJobsIncludeBandwidthAndGST checks the AMD stress job list: four
+// jobs, with rocm-bandwidth-test second and rvs third, the latter pointed at
+// the supplied config path.
+func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
+	t.Parallel()
+
+	jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
+	if n := len(jobs); n != 4 {
+		t.Fatalf("jobs=%d want 4", n)
+	}
+	bandwidth, gst := jobs[1].cmd, jobs[2].cmd
+	if tool := bandwidth[0]; tool != "rocm-bandwidth-test" {
+		t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", tool)
+	}
+	if tool := gst[0]; tool != "rvs" {
+		t.Fatalf("jobs[2]=%q want rvs", tool)
+	}
+	if cfg := gst[2]; cfg != "/tmp/test-amd-gst.conf" {
+		t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", cfg)
+	}
+}
+
+func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
 	jobs := nvidiaSATJobs()
-	got := jobs[4].cmd
-	want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
+	got := jobs[5].cmd
+	want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
 	if len(got) != len(want) {
 		t.Fatalf("cmd len=%d want %d", len(got), len(want))
 	}
@@ -55,6 +102,257 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
 	}
 }

+func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
+	plan := nvidiaDCGMJobs(3, []int{2, 0}) // expect "-r 3" and "-i 2,0" in the final diag job
+	if n := len(plan); n != 5 {
+		t.Fatalf("jobs=%d want 5", n)
+	}
+	if line, want := strings.Join(plan[0].cmd, " "), "nvidia-smi -pm 1"; line != want {
+		t.Fatalf("preflight=%q want %q", line, want)
+	}
+	if line, want := strings.Join(plan[4].cmd, " "), "dcgmi diag -r 3 -i 2,0"; line != want {
+		t.Fatalf("diag=%q want %q", line, want)
+	}
+}
+
+// TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices deliberately skips t.Parallel:
+// it swaps the package-level satExecCommand hook, which parallel tests would race on.
+func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	job, err := buildNvidiaStressJob(NvidiaStressOptions{
+		DurationSec:       600,
+		Loader:            NvidiaStressLoaderJohn,
+		ExcludeGPUIndices: []int{1},
+	})
+	if err != nil {
+		t.Fatalf("buildNvidiaStressJob error: %v", err)
+	}
+	wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
+	if len(job.cmd) != len(wantCmd) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
+	}
+	for i := range wantCmd {
+		if job.cmd[i] != wantCmd[i] {
+			t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
+		}
+	}
+	if got := joinIndexList(job.gpuIndices); got != "0,2" {
+		t.Fatalf("gpuIndices=%q want 0,2", got)
+	}
+}
+
+// TestBuildNvidiaStressJobUsesNCCLLoader deliberately skips t.Parallel: it swaps the
+// package-level satExecCommand hook, which parallel tests would race on.
+func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	job, err := buildNvidiaStressJob(NvidiaStressOptions{
+		DurationSec: 120,
+		Loader:      NvidiaStressLoaderNCCL,
+		GPUIndices:  []int{2, 0},
+	})
+	if err != nil {
+		t.Fatalf("buildNvidiaStressJob error: %v", err)
+	}
+	wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
+	if len(job.cmd) != len(wantCmd) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
+	}
+	for i := range wantCmd {
+		if job.cmd[i] != wantCmd[i] {
+			t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
+		}
+	}
+	if got := joinIndexList(job.gpuIndices); got != "0,2" {
+		t.Fatalf("gpuIndices=%q want 0,2", got)
+	}
+}
+
+// TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset deliberately skips t.Parallel: it
+// swaps the package-level satExecCommand hook, which parallel tests would race on.
+func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	got, err := resolveDCGMGPUIndices(nil)
+	if err != nil {
+		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+	}
+	if want := "0,1,2"; joinIndexList(got) != want {
+		t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
+	}
+}
+
+// TestResolveDCGMGPUIndicesKeepsExplicitSelection expects the selection {3, 1, 3} to be reported as "1,3".
+func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
+	t.Parallel()
+	indices, err := resolveDCGMGPUIndices([]int{3, 1, 3})
+	if err != nil {
+		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+	}
+	if joined := joinIndexList(indices); joined != "1,3" {
+		t.Fatalf("gpuIndices=%q want %q", joined, "1,3")
+	}
+}
+
+// TestParseNvidiaGPUHealthDetectsResetRequired feeds two rows; only the "[GPU requires reset]" one may be flagged.
+func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
+	t.Parallel()
+	rows := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
+	if n := len(rows); n != 2 {
+		t.Fatalf("len=%d want 2", n)
+	}
+	if rows[0].NeedsReset {
+		t.Fatalf("gpu0 unexpectedly marked reset-required")
+	}
+	if flagged := rows[1].NeedsReset; !flagged {
+		t.Fatalf("gpu1 should be marked reset-required: %#v", rows[1])
+	}
+}
+
+// TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU stubs nvidia-smi so that GPU 1 reports a required reset.
+func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
+	prev := satExecCommand
+	t.Cleanup(func() { satExecCommand = prev })
+	satExecCommand = func(bin string, argv ...string) *exec.Cmd {
+		if bin != "nvidia-smi" {
+			return exec.Command(bin, argv...)
+		}
+		return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
+	}
+	report, err := checkNvidiaJobHealth([]int{1})
+	if err == nil {
+		t.Fatal("expected health check error")
+	}
+	if lower := strings.ToLower(report); !strings.Contains(report, "gpu 1") || !strings.Contains(lower, "requires reset") {
+		t.Fatalf("unexpected message: %q", report)
+	}
+}
+
+// TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles writes status files for two GPUs and inspects gpu-1-status.txt.
+func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
+	outDir := t.TempDir()
+	prev := satExecCommand
+	t.Cleanup(func() { satExecCommand = prev })
+	satExecCommand = func(bin string, argv ...string) *exec.Cmd {
+		if bin != "nvidia-smi" {
+			return exec.Command(bin, argv...)
+		}
+		return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
+	}
+	statuses := map[int]*nvidiaGPUStatusFile{
+		0: {Index: 0, RunStatus: "OK"},
+		1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
+	}
+	if err := writeNvidiaGPUStatusFiles(outDir, "FAILED", statuses, map[int]struct{}{0: {}, 1: {}}); err != nil {
+		t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
+	}
+	raw, err := os.ReadFile(filepath.Join(outDir, "gpu-1-status.txt"))
+	if err != nil {
+		t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
+	}
+	body := string(raw)
+	if !strings.Contains(body, "run_status=FAILED") {
+		t.Fatalf("missing run status:\n%s", body)
+	}
+	if !strings.Contains(body, "health_status=RESET_REQUIRED") {
+		t.Fatalf("missing health status:\n%s", body)
+	}
+	if !strings.Contains(body, "failing_job=02-dcgmi-targeted-stress.log") {
+		t.Fatalf("missing failing job:\n%s", body)
+	}
+}
+
+// TestResolveDCGMProfTesterCommandUsesVersionedBinary makes only dcgmproftester13
+// resolvable and expects the returned command to start with its full path.
+func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
+	prev := satLookPath
+	t.Cleanup(func() { satLookPath = prev })
+	satLookPath = func(bin string) (string, error) {
+		if bin == "dcgmproftester13" {
+			return "/usr/bin/dcgmproftester13", nil
+		}
+		return "", exec.ErrNotFound
+	}
+
+	argv, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
+	if err != nil {
+		t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
+	}
+	if n := len(argv); n != 4 {
+		t.Fatalf("cmd len=%d want 4 (%v)", n, argv)
+	}
+	if bin := argv[0]; bin != "/usr/bin/dcgmproftester13" {
+		t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", bin)
+	}
+}
+
+func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
+	expected := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
+	argv := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1}) // -p carries the duration, -i the GPU list as given
+	if len(argv) != len(expected) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(argv), len(expected), argv)
+	}
+	for i, arg := range expected {
+		if argv[i] != arg {
+			t.Fatalf("cmd[%d]=%q want %q", i, argv[i], arg)
+		}
+	}
+}
+
+func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
+	vars := nvidiaVisibleDevicesEnv([]int{0, 2, 4}) // expect the device-order entry first, then the visible list
+	if n := len(vars); n != 2 {
+		t.Fatalf("env len=%d want 2 (%v)", n, vars)
+	}
+	if order := vars[0]; order != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
+		t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", order)
+	}
+	if visible := vars[1]; visible != "CUDA_VISIBLE_DEVICES=0,2,4" {
+		t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", visible)
+	}
+}
+
+// TestNvidiaStressArchivePrefixByLoader maps every loader name, including the
+// empty one, to the archive prefix it is expected to produce.
+func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct{ loader, prefix string }{
+		{NvidiaStressLoaderBuiltin, "gpu-nvidia-burn"},
+		{NvidiaStressLoaderJohn, "gpu-nvidia-john"},
+		{NvidiaStressLoaderNCCL, "gpu-nvidia-nccl"},
+		{"", "gpu-nvidia-burn"}, // an unset loader shares the builtin prefix
+	}
+	for _, c := range cases {
+		got := nvidiaStressArchivePrefix(c.loader)
+		if got != c.prefix {
+			t.Fatalf("loader=%q prefix=%q want %q", c.loader, got, c.prefix)
+		}
+	}
+}
+
 func TestEnvIntFallback(t *testing.T) {
 	os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
 	if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
@@ -70,6 +368,37 @@ func TestEnvIntFallback(t *testing.T) {
 	}
 }

+// TestMemoryStressSizeArgUsesAvailableMemory reports 96 GiB free and expects a "65536M" size argument.
+func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
+	prev := satFreeMemBytes
+	t.Cleanup(func() { satFreeMemBytes = prev })
+	satFreeMemBytes = func() int64 { return 96 << 30 }
+	if size := memoryStressSizeArg(); size != "65536M" {
+		t.Fatalf("sizeArg=%q want 65536M", size)
+	}
+}
+
+// TestMemoryStressSizeArgRespectsOverride expects BEE_VM_STRESS_SIZE_MB to win over the reported free memory.
+func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
+	prev := satFreeMemBytes
+	t.Cleanup(func() { satFreeMemBytes = prev })
+	satFreeMemBytes = func() int64 { return 96 << 30 }
+	t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
+	if size := memoryStressSizeArg(); size != "4096M" {
+		t.Fatalf("sizeArg=%q want 4096M", size)
+	}
+}
+
+// TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown reports 0 free bytes and expects the "80%" fallback.
+func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
+	prev := satFreeMemBytes
+	t.Cleanup(func() { satFreeMemBytes = prev })
+	satFreeMemBytes = func() int64 { return 0 }
+	if size := memoryStressSizeArg(); size != "80%" {
+		t.Fatalf("sizeArg=%q want 80%%", size)
+	}
+}
+
 func TestClassifySATResult(t *testing.T) {
 	tests := []struct {
 		name   string
@@ -80,8 +409,9 @@ func TestClassifySATResult(t *testing.T) {
 	}{
 		{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
 		{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
-		{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
-		{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
+		{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
+		{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
+		{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 	}

 	for _, tt := range tests {
@@ -94,6 +424,38 @@ func TestClassifySATResult(t *testing.T) {
 	}
 }

+func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
+	dir := t.TempDir()
+	ctx, cancel := context.WithCancel(context.Background())
+	t.Cleanup(cancel)
+	// Cancel the context after 100ms from a helper goroutine; done lets the test wait for it.
+	done := make(chan struct{})
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		cancel()
+		close(done)
+	}()
+	// The only job sleeps for 5s, so the cancellation lands while it is still running.
+	archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
+		{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
+	}, nil)
+	<-done
+	// A cancelled run must surface context.Canceled and leave no archive behind in dir.
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("err=%v want context.Canceled", err)
+	}
+	if archive != "" {
+		t.Fatalf("archive=%q want empty", archive)
+	}
+	matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
+	if globErr != nil {
+		t.Fatalf("Glob error: %v", globErr)
+	}
+	if len(matches) != 0 {
+		t.Fatalf("archives=%v want none", matches)
+	}
+}
+
 func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
 	t.Parallel()

@@ -130,6 +492,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
 	}
 }

+// TestResolveSATCommandUsesLookPathForGenericTools expects argv[0] to become the path returned by satLookPath.
+func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
+	prev := satLookPath
+	t.Cleanup(func() { satLookPath = prev })
+	satLookPath = func(bin string) (string, error) {
+		if bin != "stress-ng" {
+			return "", exec.ErrNotFound
+		}
+		return "/usr/bin/stress-ng", nil
+	}
+	argv, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
+	if err != nil {
+		t.Fatalf("resolveSATCommand error: %v", err)
+	}
+	if n := len(argv); n != 3 {
+		t.Fatalf("cmd len=%d want 3 (%v)", n, argv)
+	}
+	if bin := argv[0]; bin != "/usr/bin/stress-ng" {
+		t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", bin)
+	}
+}
+
+// TestResolveSATCommandFailsForMissingGenericTool makes every lookup fail and
+// expects an error that names the missing tool.
+func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
+	prev := satLookPath
+	t.Cleanup(func() { satLookPath = prev })
+	satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
+
+	_, lookupErr := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
+	if lookupErr == nil {
+		t.Fatal("expected error")
+	}
+	if text := lookupErr.Error(); !strings.Contains(text, "stress-ng not found in PATH") {
+		t.Fatalf("error=%q", lookupErr)
+	}
+}
+
 func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
 	tmp := t.TempDir()
 	execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -10,13 +10,30 @@ import (
 func (s *System) ListBeeServices() ([]string, error) {
 	seen := map[string]bool{}
 	var out []string
-	for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
+	for _, pattern := range []string{
+		"/etc/systemd/system/bee-*.service",
+		"/lib/systemd/system/bee-*.service",
+		"/etc/systemd/system/bee-*.timer",
+		"/lib/systemd/system/bee-*.timer",
+	} {
 		matches, err := filepath.Glob(pattern)
 		if err != nil {
 			return nil, err
 		}
 		for _, match := range matches {
-			name := strings.TrimSuffix(filepath.Base(match), ".service")
+			base := filepath.Base(match)
+			name := base
+			if strings.HasSuffix(base, ".service") {
+				name = strings.TrimSuffix(base, ".service")
+			}
+			// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
+			if strings.HasSuffix(name, "@") {
+				continue
+			}
+			// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
+			if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
+				continue
+			}
 			if !seen[name] {
 				seen[name] = true
 				out = append(out, name)
@@ -44,7 +61,9 @@ func (s *System) ServiceState(name string) string {
 }

 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
-	raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
+	// bee-web runs as the bee user; sudo is required to control system services.
+	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
+	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
 	return string(raw), err
 }

--- a/audit/internal/platform/techdump.go
+++ b/audit/internal/platform/techdump.go
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
 	{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
 	{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
 	{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
+	{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
 	{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
 	{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
 	{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -2,12 +2,31 @@ package platform

 type System struct{}

+type LiveBootSource struct {
+	InRAM  bool   `json:"in_ram"` // NOTE(review): presumably true when the live image runs from RAM — confirm
+	Kind   string `json:"kind"`   // NOTE(review): set of valid kinds is not visible here — confirm against the producer
+	Source string `json:"source,omitempty"` // left out of the JSON when empty
+	Device string `json:"device,omitempty"` // left out of the JSON when empty
+}
+
 type InterfaceInfo struct {
 	Name  string
 	State string
 	IPv4  []string
 }

+type NetworkInterfaceSnapshot struct {
+	Name string   // interface name
+	Up   bool     // NOTE(review): presumably the link/admin "up" state at capture time — confirm
+	IPv4 []string // NOTE(review): address format (plain vs CIDR) is not visible here — confirm
+}
+
+type NetworkSnapshot struct {
+	Interfaces    []NetworkInterfaceSnapshot // one entry per captured interface
+	DefaultRoutes []string                   // NOTE(review): presumably raw default-route lines — confirm
+	ResolvConf    string                     // NOTE(review): looks like saved resolv.conf contents — confirm
+}
+
 type ServiceAction string

 const (
@@ -25,12 +44,12 @@ type StaticIPv4Config struct {
 }

 type RemovableTarget struct {
-	Device     string
-	FSType     string
-	Size       string
-	Label      string
-	Model      string
-	Mountpoint string
+	Device     string `json:"device"`
+	FSType     string `json:"fs_type"`
+	Size       string `json:"size"`
+	Label      string `json:"label"`
+	Model      string `json:"model"`
+	Mountpoint string `json:"mountpoint"`
 }

 type ToolStatus struct {
@@ -39,6 +58,21 @@ type ToolStatus struct {
 	OK   bool
 }

+const (
+	NvidiaStressLoaderBuiltin = "builtin" // archive prefix "gpu-nvidia-burn" (an empty loader gets the same prefix)
+	NvidiaStressLoaderJohn    = "john"    // runs bee-john-gpu-stress; archive prefix "gpu-nvidia-john"
+	NvidiaStressLoaderNCCL    = "nccl"    // runs bee-nccl-gpu-stress; archive prefix "gpu-nvidia-nccl"
+)
+
+type NvidiaStressOptions struct {
+	DurationSec       int    // passed to the loader as --seconds
+	SizeMB            int    // NOTE(review): presumably the per-GPU allocation for the builtin loader — confirm
+	Loader            string // one of the NvidiaStressLoader* values
+	GPUIndices        []int  // explicit GPU selection; {2, 0} is emitted as --devices 0,2
+	ExcludeGPUIndices []int  // GPUs dropped from the detected set (excluding 1 from 0,1,2 yields 0,2)
+	StaggerSeconds    int    // NOTE(review): presumably a delay between per-GPU starts — confirm
+}
+
 func New() *System {
 	return &System{}
 }
--- a/audit/internal/platform/types_test.go
+++ b/audit/internal/platform/types_test.go
@@ -0,0 +1,31 @@
+package platform
+
+import (
+	"encoding/json"
+	"strings"
+	"testing"
+)
+
+// TestRemovableTargetJSONUsesFrontendFieldNames pins all six snake_case JSON keys of RemovableTarget.
+func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
+	t.Parallel()
+	data, err := json.Marshal(RemovableTarget{
+		Device: "/dev/sdb1",
+		FSType: "exfat",
+		Size:   "1.8T",
+		Label:  "USB",
+		Model:  "Flash",
+	})
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	raw := string(data)
+	for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`, `"mountpoint"`} {
+		if !strings.Contains(raw, key) {
+			t.Fatalf("json missing key %s: %s", key, raw)
+		}
+	}
+	if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) || strings.Contains(raw, `"Mountpoint"`) {
+		t.Fatalf("json still contains Go field names: %s", raw)
+	}
+}
--- a/audit/internal/schema/hardware.go
+++ b/audit/internal/schema/hardware.go
@@ -20,7 +20,12 @@ type RuntimeHealth struct {
 	ExportDir     string                 `json:"export_dir,omitempty"`
 	DriverReady   bool                   `json:"driver_ready,omitempty"`
 	CUDAReady     bool                   `json:"cuda_ready,omitempty"`
+	NvidiaGSPMode string                 `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
 	NetworkStatus string                 `json:"network_status,omitempty"`
+	// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
+	ToRAMStatus   string `json:"toram_status,omitempty"`
+	// USBExportPath: mount point of the first writable USB drive found, empty if none.
+	USBExportPath string `json:"usb_export_path,omitempty"`
 	Issues        []RuntimeIssue         `json:"issues,omitempty"`
 	Tools         []RuntimeToolStatus    `json:"tools,omitempty"`
 	Services      []RuntimeServiceStatus `json:"services,omitempty"`
@@ -182,6 +187,13 @@ type HardwarePCIeDevice struct {
 	BatteryTemperatureC    *float64       `json:"battery_temperature_c,omitempty"`
 	BatteryVoltageV        *float64       `json:"battery_voltage_v,omitempty"`
 	BatteryReplaceRequired *bool          `json:"battery_replace_required,omitempty"`
+	SFPPresent             *bool          `json:"sfp_present,omitempty"`
+	SFPIdentifier          *string        `json:"sfp_identifier,omitempty"`
+	SFPConnector           *string        `json:"sfp_connector,omitempty"`
+	SFPVendor              *string        `json:"sfp_vendor,omitempty"`
+	SFPPartNumber          *string        `json:"sfp_part_number,omitempty"`
+	SFPSerialNumber        *string        `json:"sfp_serial_number,omitempty"`
+	SFPWavelengthNM        *float64       `json:"sfp_wavelength_nm,omitempty"`
 	SFPTemperatureC        *float64       `json:"sfp_temperature_c,omitempty"`
 	SFPTXPowerDBM          *float64       `json:"sfp_tx_power_dbm,omitempty"`
 	SFPRXPowerDBM          *float64       `json:"sfp_rx_power_dbm,omitempty"`
--- a/audit/internal/tui/forms.go
+++ b/audit/internal/tui/forms.go
@@ -1,203 +0,0 @@
-package tui
-
-import (
-	"time"
-
-	"bee/audit/internal/platform"
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-func (m model) updateStaticForm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	switch msg.String() {
-	case "esc":
-		m.screen = screenNetwork
-		m.formFields = nil
-		m.formIndex = 0
-		return m, nil
-	case "up", "shift+tab":
-		if m.formIndex > 0 {
-			m.formIndex--
-		}
-	case "down", "tab":
-		if m.formIndex < len(m.formFields)-1 {
-			m.formIndex++
-		}
-	case "enter":
-		if m.formIndex < len(m.formFields)-1 {
-			m.formIndex++
-			return m, nil
-		}
-		cfg := m.app.ParseStaticIPv4Config(m.selectedIface, []string{
-			m.formFields[0].Value,
-			m.formFields[1].Value,
-			m.formFields[2].Value,
-			m.formFields[3].Value,
-		})
-		m.busy = true
-		m.busyTitle = "Static IPv4: " + m.selectedIface
-		return m, func() tea.Msg {
-			result, err := m.app.SetStaticIPv4Result(cfg)
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
-		}
-	case "backspace":
-		field := &m.formFields[m.formIndex]
-		if len(field.Value) > 0 {
-			field.Value = field.Value[:len(field.Value)-1]
-		}
-	default:
-		if msg.Type == tea.KeyRunes && len(msg.Runes) > 0 {
-			m.formFields[m.formIndex].Value += string(msg.Runes)
-		}
-	}
-	return m, nil
-}
-
-func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	switch msg.String() {
-	case "left", "up", "tab":
-		if m.cursor > 0 {
-			m.cursor--
-		}
-	case "right", "down":
-		if m.cursor < 1 {
-			m.cursor++
-		}
-	case "esc":
-		m.screen = m.confirmCancelTarget()
-		m.cursor = 0
-		m.pendingAction = actionNone
-		return m, nil
-	case "enter":
-		if m.cursor == 1 { // Cancel
-			m.screen = m.confirmCancelTarget()
-			m.cursor = 0
-			m.pendingAction = actionNone
-			return m, nil
-		}
-		m.busy = true
-		switch m.pendingAction {
-		case actionExportBundle:
-			m.busyTitle = "Export support bundle"
-			target := *m.selectedTarget
-			return m, func() tea.Msg {
-				result, err := m.app.ExportSupportBundleResult(target)
-				return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain}
-			}
-		case actionRunAll:
-			return m.executeRunAll()
-		case actionRunMemorySAT:
-			m.busyTitle = "Memory test"
-			m.progressPrefix = "memory"
-			m.progressSince = time.Now()
-			m.progressLines = nil
-			since := m.progressSince
-			return m, tea.Batch(
-				func() tea.Msg {
-					result, err := m.app.RunMemoryAcceptancePackResult("")
-					return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
-				},
-				pollSATProgress("memory", since),
-			)
-		case actionRunStorageSAT:
-			m.busyTitle = "Storage test"
-			m.progressPrefix = "storage"
-			m.progressSince = time.Now()
-			m.progressLines = nil
-			since := m.progressSince
-			return m, tea.Batch(
-				func() tea.Msg {
-					result, err := m.app.RunStorageAcceptancePackResult("")
-					return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
-				},
-				pollSATProgress("storage", since),
-			)
-		case actionRunCPUSAT:
-			m.busyTitle = "CPU test"
-			m.progressPrefix = "cpu"
-			m.progressSince = time.Now()
-			m.progressLines = nil
-			since := m.progressSince
-			durationSec := hcCPUDurations[m.hcMode]
-			return m, tea.Batch(
-				func() tea.Msg {
-					result, err := m.app.RunCPUAcceptancePackResult("", durationSec)
-					return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
-				},
-				pollSATProgress("cpu", since),
-			)
-		case actionRunAMDGPUSAT:
-			m.busyTitle = "AMD GPU test"
-			m.progressPrefix = "gpu-amd"
-			m.progressSince = time.Now()
-			m.progressLines = nil
-			since := m.progressSince
-			return m, tea.Batch(
-				func() tea.Msg {
-					result, err := m.app.RunAMDAcceptancePackResult("")
-					return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
-				},
-				pollSATProgress("gpu-amd", since),
-			)
-		case actionRunFanStress:
-			return m.startGPUStressTest()
-		}
-	case "ctrl+c":
-		return m, tea.Quit
-	}
-	return m, nil
-}
-
-func (m model) confirmCancelTarget() screen {
-	switch m.pendingAction {
-	case actionExportBundle:
-		return screenExportTargets
-	case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
-		return screenHealthCheck
-	default:
-		return screenMain
-	}
-}
-
-// hcFanStressOpts builds FanStressOptions for the selected mode, auto-detecting all GPUs.
-func hcFanStressOpts(hcMode int, application interface {
-	ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
-}) platform.FanStressOptions {
-	// Phase durations per mode: [baseline, load1, pause, load2]
-	type durations struct{ baseline, load1, pause, load2 int }
-	modes := [3]durations{
-		{30, 120, 30, 120},   // Quick:    ~5 min total
-		{60, 300, 60, 300},   // Standard: ~12 min total
-		{60, 600, 120, 600},  // Express:  ~24 min total
-	}
-	if hcMode < 0 || hcMode >= len(modes) {
-		hcMode = 0
-	}
-	d := modes[hcMode]
-
-	// Use all detected NVIDIA GPUs.
-	var indices []int
-	if gpus, err := application.ListNvidiaGPUs(); err == nil {
-		for _, g := range gpus {
-			indices = append(indices, g.Index)
-		}
-	}
-
-	// Use minimum GPU memory size to fit all GPUs.
-	sizeMB := 64
-	if gpus, err := application.ListNvidiaGPUs(); err == nil {
-		for _, g := range gpus {
-			if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
-				sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
-			}
-		}
-	}
-
-	return platform.FanStressOptions{
-		BaselineSec:  d.baseline,
-		Phase1DurSec: d.load1,
-		PauseSec:     d.pause,
-		Phase2DurSec: d.load2,
-		SizeMB:       sizeMB,
-		GPUIndices:   indices,
-	}
-}
--- a/audit/internal/tui/messages.go
+++ b/audit/internal/tui/messages.go
@@ -1,52 +0,0 @@
-package tui
-
-import (
-	"bee/audit/internal/app"
-	"bee/audit/internal/platform"
-)
-
-type resultMsg struct {
-	title string
-	body  string
-	err   error
-	back  screen
-}
-
-type servicesMsg struct {
-	services []string
-	err      error
-}
-
-type interfacesMsg struct {
-	ifaces []platform.InterfaceInfo
-	err    error
-}
-
-type exportTargetsMsg struct {
-	targets []platform.RemovableTarget
-	err     error
-}
-
-type snapshotMsg struct {
-	banner string
-	panel  app.HardwarePanelData
-}
-
-type nvidiaGPUsMsg struct {
-	gpus []platform.NvidiaGPU
-	err  error
-}
-
-type nvtopClosedMsg struct{}
-
-type nvidiaSATDoneMsg struct {
-	title string
-	body  string
-	err   error
-}
-
-type gpuStressDoneMsg struct {
-	title string
-	body  string
-	err   error
-}
--- a/audit/internal/tui/sat_progress.go
+++ b/audit/internal/tui/sat_progress.go
@@ -1,131 +0,0 @@
-package tui
-
-import (
-	"fmt"
-	"os"
-	"path/filepath"
-	"sort"
-	"strconv"
-	"strings"
-	"time"
-
-	"bee/audit/internal/app"
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-type satProgressMsg struct {
-	lines []string
-}
-
-// pollSATProgress returns a Cmd that waits 300ms then reads the latest verbose.log
-// for the given SAT prefix and returns parsed step progress lines.
-func pollSATProgress(prefix string, since time.Time) tea.Cmd {
-	return tea.Tick(300*time.Millisecond, func(_ time.Time) tea.Msg {
-		return satProgressMsg{lines: readSATProgressLines(prefix, since)}
-	})
-}
-
-func readSATProgressLines(prefix string, since time.Time) []string {
-	pattern := filepath.Join(app.DefaultSATBaseDir, prefix+"-*/verbose.log")
-	matches, err := filepath.Glob(pattern)
-	if err != nil || len(matches) == 0 {
-		return nil
-	}
-	sort.Strings(matches)
-	// Find the latest file created at or after (since - 5s) to account for clock skew.
-	cutoff := since.Add(-5 * time.Second)
-	candidate := ""
-	for _, m := range matches {
-		info, statErr := os.Stat(m)
-		if statErr == nil && info.ModTime().After(cutoff) {
-			candidate = m
-		}
-	}
-	if candidate == "" {
-		return nil
-	}
-	raw, err := os.ReadFile(candidate)
-	if err != nil {
-		return nil
-	}
-	return parseSATVerboseProgress(string(raw))
-}
-
-// parseSATVerboseProgress parses verbose.log content and returns display lines like:
-//
-//	"PASS  lscpu (234ms)"
-//	"FAIL  stress-ng (60.0s)"
-//	"...   sensors-after"
-func parseSATVerboseProgress(content string) []string {
-	type step struct {
-		name       string
-		rc         int
-		durationMs int
-		done       bool
-	}
-
-	lines := strings.Split(content, "\n")
-	var steps []step
-	stepIdx := map[string]int{}
-
-	for i, line := range lines {
-		line = strings.TrimSpace(line)
-		if idx := strings.Index(line, "] start "); idx >= 0 {
-			name := strings.TrimSpace(line[idx+len("] start "):])
-			if _, exists := stepIdx[name]; !exists {
-				stepIdx[name] = len(steps)
-				steps = append(steps, step{name: name})
-			}
-		} else if idx := strings.Index(line, "] finish "); idx >= 0 {
-			name := strings.TrimSpace(line[idx+len("] finish "):])
-			si, exists := stepIdx[name]
-			if !exists {
-				continue
-			}
-			steps[si].done = true
-			for j := i + 1; j < len(lines) && j <= i+3; j++ {
-				l := strings.TrimSpace(lines[j])
-				if strings.HasPrefix(l, "rc: ") {
-					steps[si].rc, _ = strconv.Atoi(strings.TrimPrefix(l, "rc: "))
-				} else if strings.HasPrefix(l, "duration_ms: ") {
-					steps[si].durationMs, _ = strconv.Atoi(strings.TrimPrefix(l, "duration_ms: "))
-				}
-			}
-		}
-	}
-
-	var result []string
-	for _, s := range steps {
-		display := cleanSATStepName(s.name)
-		if s.done {
-			status := "PASS"
-			if s.rc != 0 {
-				status = "FAIL"
-			}
-			result = append(result, fmt.Sprintf("%-4s  %s (%s)", status, display, fmtDurMs(s.durationMs)))
-		} else {
-			result = append(result, fmt.Sprintf("...   %s", display))
-		}
-	}
-	return result
-}
-
-// cleanSATStepName strips leading digits and dash: "01-lscpu.log" → "lscpu".
-func cleanSATStepName(name string) string {
-	name = strings.TrimSuffix(name, ".log")
-	i := 0
-	for i < len(name) && name[i] >= '0' && name[i] <= '9' {
-		i++
-	}
-	if i < len(name) && name[i] == '-' {
-		name = name[i+1:]
-	}
-	return name
-}
-
-func fmtDurMs(ms int) string {
-	if ms < 1000 {
-		return fmt.Sprintf("%dms", ms)
-	}
-	return fmt.Sprintf("%.1fs", float64(ms)/1000)
-}
--- a/audit/internal/tui/screen_export.go
+++ b/audit/internal/tui/screen_export.go
@@ -1,14 +0,0 @@
-package tui
-
-import tea "github.com/charmbracelet/bubbletea"
-
-func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
-	if len(m.targets) == 0 {
-		return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain)
-	}
-	target := m.targets[m.cursor]
-	m.selectedTarget = &target
-	m.pendingAction = actionExportBundle
-	m.screen = screenConfirm
-	return m, nil
-}
--- a/audit/internal/tui/screen_health_check.go
+++ b/audit/internal/tui/screen_health_check.go
@@ -1,386 +0,0 @@
-package tui
-
-import (
-	"context"
-	"fmt"
-	"os/exec"
-	"strings"
-
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-// Component indices.
-const (
-	hcGPU     = 0
-	hcMemory  = 1
-	hcStorage = 2
-	hcCPU     = 3
-)
-
-// Cursor positions in Health Check screen.
-const (
-	hcCurGPU        = 0
-	hcCurMemory     = 1
-	hcCurStorage    = 2
-	hcCurCPU        = 3
-	hcCurSelectAll  = 4
-	hcCurModeQuick  = 5
-	hcCurModeStd    = 6
-	hcCurModeExpr   = 7
-	hcCurRunAll     = 8
-	hcCurFanStress  = 9
-	hcCurTotal      = 10
-)
-
-// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
-var hcModeDurations = [3]int{600, 3600, 28800}
-
-// hcCPUDurations maps mode index to CPU stress-ng seconds.
-var hcCPUDurations = [3]int{60, 300, 900}
-
-func (m model) enterHealthCheck() (tea.Model, tea.Cmd) {
-	m.screen = screenHealthCheck
-	if !m.hcInitialized {
-		m.hcSel = [4]bool{true, true, true, true}
-		m.hcMode = 0
-		m.hcCursor = 0
-		m.hcInitialized = true
-	}
-	return m, nil
-}
-
-func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	switch msg.String() {
-	case "up", "k":
-		if m.hcCursor > 0 {
-			m.hcCursor--
-		}
-	case "down", "j":
-		if m.hcCursor < hcCurTotal-1 {
-			m.hcCursor++
-		}
-	case " ":
-		switch m.hcCursor {
-		case hcCurGPU, hcCurMemory, hcCurStorage, hcCurCPU:
-			m.hcSel[m.hcCursor] = !m.hcSel[m.hcCursor]
-		case hcCurSelectAll:
-			allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
-			for i := range m.hcSel {
-				m.hcSel[i] = !allOn
-			}
-		case hcCurModeQuick, hcCurModeStd, hcCurModeExpr:
-			m.hcMode = m.hcCursor - hcCurModeQuick
-		}
-	case "enter":
-		switch m.hcCursor {
-		case hcCurGPU, hcCurMemory, hcCurStorage, hcCurCPU:
-			return m.hcRunSingle(m.hcCursor)
-		case hcCurSelectAll:
-			allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
-			for i := range m.hcSel {
-				m.hcSel[i] = !allOn
-			}
-		case hcCurModeQuick, hcCurModeStd, hcCurModeExpr:
-			m.hcMode = m.hcCursor - hcCurModeQuick
-		case hcCurRunAll:
-			return m.hcRunAll()
-		case hcCurFanStress:
-			return m.hcRunFanStress()
-		}
-	case "g", "G":
-		return m.hcRunSingle(hcGPU)
-	case "m", "M":
-		return m.hcRunSingle(hcMemory)
-	case "s", "S":
-		return m.hcRunSingle(hcStorage)
-	case "c", "C":
-		return m.hcRunSingle(hcCPU)
-	case "r", "R":
-		return m.hcRunAll()
-	case "f", "F":
-		return m.hcRunFanStress()
-	case "a", "A":
-		allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
-		for i := range m.hcSel {
-			m.hcSel[i] = !allOn
-		}
-	case "1":
-		m.hcMode = 0
-	case "2":
-		m.hcMode = 1
-	case "3":
-		m.hcMode = 2
-	case "esc":
-		m.screen = screenMain
-		m.cursor = 0
-	case "q", "ctrl+c":
-		return m, tea.Quit
-	}
-	return m, nil
-}
-
-func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
-	switch idx {
-	case hcGPU:
-		if m.app.DetectGPUVendor() == "amd" {
-			m.pendingAction = actionRunAMDGPUSAT
-			m.screen = screenConfirm
-			m.cursor = 0
-			return m, nil
-		}
-		m.nvidiaDurIdx = m.hcMode
-		return m.enterNvidiaSATSetup()
-	case hcMemory:
-		m.pendingAction = actionRunMemorySAT
-		m.screen = screenConfirm
-		m.cursor = 0
-		return m, nil
-	case hcStorage:
-		m.pendingAction = actionRunStorageSAT
-		m.screen = screenConfirm
-		m.cursor = 0
-		return m, nil
-	case hcCPU:
-		m.pendingAction = actionRunCPUSAT
-		m.screen = screenConfirm
-		m.cursor = 0
-		return m, nil
-	}
-	return m, nil
-}
-
-func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
-	m.pendingAction = actionRunFanStress
-	m.screen = screenConfirm
-	m.cursor = 0
-	return m, nil
-}
-
-// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
-// nvtop occupies the full terminal as a live chart; the stress test runs in background.
-func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
-	opts := hcFanStressOpts(m.hcMode, m.app)
-
-	ctx, cancel := context.WithCancel(context.Background())
-	m.gpuStressCancel = cancel
-	m.gpuStressAborted = false
-	m.screen = screenGPUStressRunning
-	m.nvidiaSATCursor = 0
-
-	stressCmd := func() tea.Msg {
-		result, err := m.app.RunFanStressTestResult(ctx, opts)
-		return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
-	}
-
-	nvtopPath, lookErr := exec.LookPath("nvtop")
-	if lookErr != nil {
-		return m, stressCmd
-	}
-
-	return m, tea.Batch(
-		stressCmd,
-		tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
-			return nvtopClosedMsg{}
-		}),
-	)
-}
-
-// updateGPUStressRunning handles keys on the GPU stress running screen.
-func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	switch msg.String() {
-	case "o", "O":
-		nvtopPath, err := exec.LookPath("nvtop")
-		if err != nil {
-			return m, nil
-		}
-		return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
-			return nvtopClosedMsg{}
-		})
-	case "a", "A":
-		if m.gpuStressCancel != nil {
-			m.gpuStressCancel()
-			m.gpuStressCancel = nil
-		}
-		m.gpuStressAborted = true
-		m.screen = screenHealthCheck
-		m.cursor = 0
-	case "ctrl+c":
-		return m, tea.Quit
-	}
-	return m, nil
-}
-
-func renderGPUStressRunning() string {
-	return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop  [a] Abort test  [ctrl+c] quit\n"
-}
-
-func (m model) hcRunAll() (tea.Model, tea.Cmd) {
-	for _, sel := range m.hcSel {
-		if sel {
-			m.pendingAction = actionRunAll
-			m.screen = screenConfirm
-			m.cursor = 0
-			return m, nil
-		}
-	}
-	return m, nil
-}
-
-func (m model) executeRunAll() (tea.Model, tea.Cmd) {
-	durationSec := hcModeDurations[m.hcMode]
-	durationIdx := m.hcMode
-	sel := m.hcSel
-	app := m.app
-	m.busy = true
-	m.busyTitle = "Health Check"
-	return m, func() tea.Msg {
-		var parts []string
-		if sel[hcGPU] {
-			vendor := app.DetectGPUVendor()
-			if vendor == "amd" {
-				r, err := app.RunAMDAcceptancePackResult("")
-				body := r.Body
-				if err != nil {
-					body += "\nERROR: " + err.Error()
-				}
-				parts = append(parts, "=== GPU (AMD) ===\n"+body)
-			} else {
-				gpus, err := app.ListNvidiaGPUs()
-				if err != nil || len(gpus) == 0 {
-					parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.")
-				} else {
-					var indices []int
-					sizeMB := 0
-					for _, g := range gpus {
-						indices = append(indices, g.Index)
-						if sizeMB == 0 || g.MemoryMB < sizeMB {
-							sizeMB = g.MemoryMB
-						}
-					}
-					if sizeMB == 0 {
-						sizeMB = 64
-					}
-					r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices)
-					body := r.Body
-					if err != nil {
-						body += "\nERROR: " + err.Error()
-					}
-					parts = append(parts, "=== GPU ===\n"+body)
-				}
-			}
-		}
-		if sel[hcMemory] {
-			r, err := app.RunMemoryAcceptancePackResult("")
-			body := r.Body
-			if err != nil {
-				body += "\nERROR: " + err.Error()
-			}
-			parts = append(parts, "=== MEMORY ===\n"+body)
-		}
-		if sel[hcStorage] {
-			r, err := app.RunStorageAcceptancePackResult("")
-			body := r.Body
-			if err != nil {
-				body += "\nERROR: " + err.Error()
-			}
-			parts = append(parts, "=== STORAGE ===\n"+body)
-		}
-		if sel[hcCPU] {
-			cpuDur := hcCPUDurations[durationIdx]
-			r, err := app.RunCPUAcceptancePackResult("", cpuDur)
-			body := r.Body
-			if err != nil {
-				body += "\nERROR: " + err.Error()
-			}
-			parts = append(parts, "=== CPU ===\n"+body)
-		}
-		combined := strings.Join(parts, "\n\n")
-		if combined == "" {
-			combined = "No components selected."
-		}
-		return resultMsg{title: "Health Check", body: combined, back: screenHealthCheck}
-	}
-}
-
-func renderHealthCheck(m model) string {
-	var b strings.Builder
-
-	fmt.Fprintln(&b, "HEALTH CHECK")
-	fmt.Fprintln(&b)
-	fmt.Fprintln(&b, "  Diagnostics:")
-	fmt.Fprintln(&b)
-
-	type comp struct{ name, desc, key string }
-	comps := []comp{
-		{"GPU", "nvidia/amd auto-detect", "G"},
-		{"MEMORY", "memtester", "M"},
-		{"STORAGE", "smartctl + NVMe self-test", "S"},
-		{"CPU", "audit diagnostics", "C"},
-	}
-	for i, c := range comps {
-		pfx := "  "
-		if m.hcCursor == i {
-			pfx = "> "
-		}
-		ch := "[ ]"
-		if m.hcSel[i] {
-			ch = "[x]"
-		}
-		fmt.Fprintf(&b, "%s%s  %-8s  %-28s [%s]\n", pfx, ch, c.name, c.desc, c.key)
-	}
-
-	fmt.Fprintln(&b, "  ─────────────────────────────────────────────────")
-	{
-		pfx := "  "
-		if m.hcCursor == hcCurSelectAll {
-			pfx = "> "
-		}
-		allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
-		ch := "[ ]"
-		if allOn {
-			ch = "[x]"
-		}
-		fmt.Fprintf(&b, "%s%s  Select / Deselect All                        [A]\n", pfx, ch)
-	}
-
-	fmt.Fprintln(&b)
-	fmt.Fprintln(&b, "  Mode:")
-	modes := []struct{ label, key string }{
-		{"Quick", "1"},
-		{"Standard", "2"},
-		{"Express", "3"},
-	}
-	for i, mode := range modes {
-		pfx := "  "
-		if m.hcCursor == hcCurModeQuick+i {
-			pfx = "> "
-		}
-		radio := "( )"
-		if m.hcMode == i {
-			radio = "(*)"
-		}
-		fmt.Fprintf(&b, "%s%s  %-10s  [%s]\n", pfx, radio, mode.label, mode.key)
-	}
-
-	fmt.Fprintln(&b)
-	{
-		pfx := "  "
-		if m.hcCursor == hcCurRunAll {
-			pfx = "> "
-		}
-		fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
-	}
-
-	{
-		pfx := "  "
-		if m.hcCursor == hcCurFanStress {
-			pfx = "> "
-		}
-		fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ]   (thermal cycling, fan lag, throttle check)\n", pfx)
-	}
-
-	fmt.Fprintln(&b)
-	fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
-	fmt.Fprint(&b, "[↑↓] move  [space/enter] toggle  [letter] single test  [R] run all  [F] gpu stress  [Esc] back")
-	return b.String()
-}
--- a/audit/internal/tui/screen_main.go
+++ b/audit/internal/tui/screen_main.go
@@ -1,27 +0,0 @@
-package tui
-
-import (
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
-	switch m.cursor {
-	case 0: // Health Check
-		return m.enterHealthCheck()
-	case 1: // Export support bundle
-		m.pendingAction = actionExportBundle
-		m.busy = true
-		m.busyTitle = "Export support bundle"
-		return m, func() tea.Msg {
-			targets, err := m.app.ListRemovableTargets()
-			return exportTargetsMsg{targets: targets, err: err}
-		}
-	case 2: // Settings
-		m.screen = screenSettings
-		m.cursor = 0
-		return m, nil
-	case 3: // Exit
-		return m, tea.Quit
-	}
-	return m, nil
-}
--- a/audit/internal/tui/screen_network.go
+++ b/audit/internal/tui/screen_network.go
@@ -1,76 +0,0 @@
-package tui
-
-import (
-	"strings"
-
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-func (m model) handleNetworkMenu() (tea.Model, tea.Cmd) {
-	switch m.cursor {
-	case 0:
-		m.busy = true
-		m.busyTitle = "Network status"
-		return m, func() tea.Msg {
-			result, err := m.app.NetworkStatus()
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
-		}
-	case 1:
-		m.busy = true
-		m.busyTitle = "DHCP all interfaces"
-		return m, func() tea.Msg {
-			result, err := m.app.DHCPAllResult()
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
-		}
-	case 2:
-		m.pendingAction = actionDHCPOne
-		m.busy = true
-		m.busyTitle = "Interfaces"
-		return m, func() tea.Msg {
-			ifaces, err := m.app.ListInterfaces()
-			return interfacesMsg{ifaces: ifaces, err: err}
-		}
-	case 3:
-		m.pendingAction = actionStaticIPv4
-		m.busy = true
-		m.busyTitle = "Interfaces"
-		return m, func() tea.Msg {
-			ifaces, err := m.app.ListInterfaces()
-			return interfacesMsg{ifaces: ifaces, err: err}
-		}
-	case 4:
-		m.screen = screenSettings
-		m.cursor = 0
-		return m, nil
-	}
-	return m, nil
-}
-
-func (m model) handleInterfacePickMenu() (tea.Model, tea.Cmd) {
-	if len(m.interfaces) == 0 {
-		return m, resultCmd("interfaces", "No physical interfaces found", nil, screenNetwork)
-	}
-	m.selectedIface = m.interfaces[m.cursor].Name
-	switch m.pendingAction {
-	case actionDHCPOne:
-		m.busy = true
-		m.busyTitle = "DHCP on " + m.selectedIface
-		return m, func() tea.Msg {
-			result, err := m.app.DHCPOneResult(m.selectedIface)
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
-		}
-	case actionStaticIPv4:
-		defaults := m.app.DefaultStaticIPv4FormFields(m.selectedIface)
-		m.formFields = []formField{
-			{Label: "IPv4 address", Value: defaults[0]},
-			{Label: "Prefix", Value: defaults[1]},
-			{Label: "Gateway", Value: strings.TrimSpace(defaults[2])},
-			{Label: "DNS (space-separated)", Value: defaults[3]},
-		}
-		m.formIndex = 0
-		m.screen = screenStaticForm
-		return m, nil
-	default:
-		return m, nil
-	}
-}
--- a/audit/internal/tui/screen_nvidia_sat.go
+++ b/audit/internal/tui/screen_nvidia_sat.go
@@ -1,218 +0,0 @@
-package tui
-
-import (
-	"context"
-	"fmt"
-	"strings"
-
-	"bee/audit/internal/platform"
-
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-var nvidiaDurationOptions = []struct {
-	label   string
-	seconds int
-}{
-	{"10 minutes", 600},
-	{"1 hour", 3600},
-	{"8 hours", 28800},
-	{"24 hours", 86400},
-}
-
-// enterNvidiaSATSetup resets the setup screen and starts loading GPU list.
-func (m model) enterNvidiaSATSetup() (tea.Model, tea.Cmd) {
-	m.screen = screenNvidiaSATSetup
-	m.nvidiaGPUs = nil
-	m.nvidiaGPUSel = nil
-	m.nvidiaDurIdx = 0
-	m.nvidiaSATCursor = 0
-	m.busy = true
-	m.busyTitle = "NVIDIA SAT"
-	return m, func() tea.Msg {
-		gpus, err := m.app.ListNvidiaGPUs()
-		return nvidiaGPUsMsg{gpus: gpus, err: err}
-	}
-}
-
-// handleNvidiaGPUsMsg processes the GPU list response.
-func (m model) handleNvidiaGPUsMsg(msg nvidiaGPUsMsg) (tea.Model, tea.Cmd) {
-	m.busy = false
-	m.busyTitle = ""
-	if msg.err != nil {
-		m.title = "NVIDIA SAT"
-		m.body = fmt.Sprintf("Failed to list GPUs: %v", msg.err)
-		m.prevScreen = screenHealthCheck
-		m.screen = screenOutput
-		return m, nil
-	}
-	m.nvidiaGPUs = msg.gpus
-	m.nvidiaGPUSel = make([]bool, len(msg.gpus))
-	for i := range m.nvidiaGPUSel {
-		m.nvidiaGPUSel[i] = true // all selected by default
-	}
-	m.nvidiaSATCursor = 0
-	return m, nil
-}
-
-// updateNvidiaSATSetup handles keys on the setup screen.
-func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	numDur := len(nvidiaDurationOptions)
-	numGPU := len(m.nvidiaGPUs)
-	totalItems := numDur + numGPU + 2 // +2: Start, Cancel
-	switch msg.String() {
-	case "up", "k":
-		if m.nvidiaSATCursor > 0 {
-			m.nvidiaSATCursor--
-		}
-	case "down", "j":
-		if m.nvidiaSATCursor < totalItems-1 {
-			m.nvidiaSATCursor++
-		}
-	case " ":
-		switch {
-		case m.nvidiaSATCursor < numDur:
-			m.nvidiaDurIdx = m.nvidiaSATCursor
-		case m.nvidiaSATCursor < numDur+numGPU:
-			i := m.nvidiaSATCursor - numDur
-			m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
-		}
-	case "enter":
-		startIdx := numDur + numGPU
-		cancelIdx := startIdx + 1
-		switch {
-		case m.nvidiaSATCursor < numDur:
-			m.nvidiaDurIdx = m.nvidiaSATCursor
-		case m.nvidiaSATCursor < startIdx:
-			i := m.nvidiaSATCursor - numDur
-			m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
-		case m.nvidiaSATCursor == startIdx:
-			return m.startNvidiaSAT()
-		case m.nvidiaSATCursor == cancelIdx:
-			m.screen = screenHealthCheck
-			m.cursor = 0
-		}
-	case "esc":
-		m.screen = screenHealthCheck
-		m.cursor = 0
-	case "ctrl+c", "q":
-		return m, tea.Quit
-	}
-	return m, nil
-}
-
-// startNvidiaSAT launches the NVIDIA acceptance pack.
-func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
-	var selectedGPUs []platform.NvidiaGPU
-	for i, sel := range m.nvidiaGPUSel {
-		if sel {
-			selectedGPUs = append(selectedGPUs, m.nvidiaGPUs[i])
-		}
-	}
-	if len(selectedGPUs) == 0 {
-		selectedGPUs = m.nvidiaGPUs // fallback: use all if none explicitly selected
-	}
-
-	sizeMB := 0
-	for _, g := range selectedGPUs {
-		if sizeMB == 0 || g.MemoryMB < sizeMB {
-			sizeMB = g.MemoryMB
-		}
-	}
-	if sizeMB == 0 {
-		sizeMB = 64
-	}
-
-	var gpuIndices []int
-	for _, g := range selectedGPUs {
-		gpuIndices = append(gpuIndices, g.Index)
-	}
-
-	durationSec := nvidiaDurationOptions[m.nvidiaDurIdx].seconds
-
-	ctx, cancel := context.WithCancel(context.Background())
-	m.nvidiaSATCancel = cancel
-	m.nvidiaSATAborted = false
-	m.screen = screenNvidiaSATRunning
-	m.nvidiaSATCursor = 0
-
-	satCmd := func() tea.Msg {
-		result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", durationSec, sizeMB, gpuIndices)
-		return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
-	}
-
-	return m, satCmd
-}
-
-// updateNvidiaSATRunning handles keys on the running screen.
-func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	switch msg.String() {
-	case "a", "A":
-		if m.nvidiaSATCancel != nil {
-			m.nvidiaSATCancel()
-			m.nvidiaSATCancel = nil
-		}
-		m.nvidiaSATAborted = true
-		m.screen = screenHealthCheck
-		m.cursor = 0
-	case "ctrl+c":
-		return m, tea.Quit
-	}
-	return m, nil
-}
-
-// renderNvidiaSATSetup renders the setup screen.
-func renderNvidiaSATSetup(m model) string {
-	var b strings.Builder
-	fmt.Fprintln(&b, "NVIDIA SAT")
-	fmt.Fprintln(&b)
-	fmt.Fprintln(&b, "Duration:")
-	for i, opt := range nvidiaDurationOptions {
-		radio := "( )"
-		if i == m.nvidiaDurIdx {
-			radio = "(*)"
-		}
-		prefix := "  "
-		if m.nvidiaSATCursor == i {
-			prefix = "> "
-		}
-		fmt.Fprintf(&b, "%s%s %s\n", prefix, radio, opt.label)
-	}
-	fmt.Fprintln(&b)
-	if len(m.nvidiaGPUs) == 0 {
-		fmt.Fprintln(&b, "GPUs: (none detected)")
-	} else {
-		fmt.Fprintln(&b, "GPUs:")
-		for i, gpu := range m.nvidiaGPUs {
-			check := "[ ]"
-			if m.nvidiaGPUSel[i] {
-				check = "[x]"
-			}
-			prefix := "  "
-			if m.nvidiaSATCursor == len(nvidiaDurationOptions)+i {
-				prefix = "> "
-			}
-			fmt.Fprintf(&b, "%s%s %d: %s (%d MB)\n", prefix, check, gpu.Index, gpu.Name, gpu.MemoryMB)
-		}
-	}
-	fmt.Fprintln(&b)
-	startIdx := len(nvidiaDurationOptions) + len(m.nvidiaGPUs)
-	startPfx := "  "
-	cancelPfx := "  "
-	if m.nvidiaSATCursor == startIdx {
-		startPfx = "> "
-	}
-	if m.nvidiaSATCursor == startIdx+1 {
-		cancelPfx = "> "
-	}
-	fmt.Fprintf(&b, "%sStart\n", startPfx)
-	fmt.Fprintf(&b, "%sCancel\n", cancelPfx)
-	fmt.Fprintln(&b)
-	b.WriteString("[↑/↓] move  [space] toggle  [enter] select  [esc] cancel\n")
-	return b.String()
-}
-
-// renderNvidiaSATRunning renders the running screen.
-func renderNvidiaSATRunning() string {
-	return "NVIDIA SAT\n\nTest is running...\n\n[a] Abort test  [ctrl+c] quit\n"
-}
--- a/audit/internal/tui/screen_services.go
+++ b/audit/internal/tui/screen_services.go
@@ -1,47 +0,0 @@
-package tui
-
-import (
-	"bee/audit/internal/platform"
-
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-func (m model) handleServicesMenu() (tea.Model, tea.Cmd) {
-	if len(m.services) == 0 {
-		return m, resultCmd("Services", "No bee-* services found.", nil, screenSettings)
-	}
-	m.selectedService = m.services[m.cursor]
-	m.screen = screenServiceAction
-	m.cursor = 0
-	return m, nil
-}
-
-func (m model) handleServiceActionMenu() (tea.Model, tea.Cmd) {
-	action := m.serviceMenu[m.cursor]
-	if action == "back" {
-		m.screen = screenServices
-		m.cursor = 0
-		return m, nil
-	}
-
-	m.busy = true
-	m.busyTitle = "service: " + m.selectedService
-	return m, func() tea.Msg {
-		switch action {
-		case "Status":
-			result, err := m.app.ServiceStatusResult(m.selectedService)
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
-		case "Restart":
-			result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceRestart)
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
-		case "Start":
-			result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStart)
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
-		case "Stop":
-			result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStop)
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
-		default:
-			return resultMsg{title: "Service", body: "Unknown action.", back: screenServiceAction}
-		}
-	}
-}
--- a/audit/internal/tui/screen_settings.go
+++ b/audit/internal/tui/screen_settings.go
@@ -1,64 +0,0 @@
-package tui
-
-import tea "github.com/charmbracelet/bubbletea"
-
-func (m model) handleSettingsMenu() (tea.Model, tea.Cmd) {
-	switch m.cursor {
-	case 0: // Network
-		m.screen = screenNetwork
-		m.cursor = 0
-		return m, nil
-	case 1: // Services
-		m.busy = true
-		m.busyTitle = "Services"
-		return m, func() tea.Msg {
-			services, err := m.app.ListBeeServices()
-			return servicesMsg{services: services, err: err}
-		}
-	case 2: // Re-run audit
-		m.busy = true
-		m.busyTitle = "Re-run audit"
-		runtimeMode := m.runtimeMode
-		return m, func() tea.Msg {
-			result, err := m.app.RunAuditNow(runtimeMode)
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenSettings}
-		}
-	case 3: // Run self-check
-		m.busy = true
-		m.busyTitle = "Self-check"
-		return m, func() tea.Msg {
-			result, err := m.app.RunRuntimePreflightResult()
-			return resultMsg{title: result.Title, body: result.Body, err: err, back: screenSettings}
-		}
-	case 4: // Runtime issues
-		m.busy = true
-		m.busyTitle = "Runtime issues"
-		return m, func() tea.Msg {
-			result := m.app.RuntimeHealthResult()
-			return resultMsg{title: result.Title, body: result.Body, back: screenSettings}
-		}
-	case 5: // Audit logs
-		m.busy = true
-		m.busyTitle = "Audit logs"
-		return m, func() tea.Msg {
-			result := m.app.AuditLogTailResult()
-			return resultMsg{title: result.Title, body: result.Body, back: screenSettings}
-		}
-	case 6: // Check tools
-		m.busy = true
-		m.busyTitle = "Check tools"
-		return m, func() tea.Msg {
-			result := m.app.ToolCheckResult([]string{
-				"dmidecode", "smartctl", "nvme", "ipmitool", "lspci",
-				"ethtool", "bee", "nvidia-smi", "bee-gpu-stress",
-				"memtester", "dhclient", "lsblk", "mount",
-			})
-			return resultMsg{title: result.Title, body: result.Body, back: screenSettings}
-		}
-	case 7: // Back
-		m.screen = screenMain
-		m.cursor = 0
-		return m, nil
-	}
-	return m, nil
-}
--- a/audit/internal/tui/snapshot.go
+++ b/audit/internal/tui/snapshot.go
@@ -1,30 +0,0 @@
-package tui
-
-import (
-	"bee/audit/internal/app"
-
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-func (m model) refreshSnapshotCmd() tea.Cmd {
-	if m.app == nil {
-		return nil
-	}
-	return func() tea.Msg {
-		return snapshotMsg{
-			banner: m.app.MainBanner(),
-			panel:  m.app.LoadHardwarePanel(),
-		}
-	}
-}
-
-func shouldRefreshSnapshot(prev, next model) bool {
-	return prev.screen != next.screen || prev.busy != next.busy
-}
-
-func emptySnapshot() snapshotMsg {
-	return snapshotMsg{
-		banner: "",
-		panel:  app.HardwarePanelData{},
-	}
-}
--- a/audit/internal/tui/tui_test.go
+++ b/audit/internal/tui/tui_test.go
@@ -1,628 +0,0 @@
-package tui
-
-import (
-	"strings"
-	"testing"
-
-	"bee/audit/internal/app"
-	"bee/audit/internal/platform"
-	"bee/audit/internal/runtimeenv"
-
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-func newTestModel() model {
-	return newModel(app.New(platform.New()), runtimeenv.ModeLocal)
-}
-
-func sendKey(t *testing.T, m model, key tea.KeyType) model {
-	t.Helper()
-
-	next, _ := m.Update(tea.KeyMsg{Type: key})
-	return next.(model)
-}
-
-func TestUpdateMainMenuCursorNavigation(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-
-	m = sendKey(t, m, tea.KeyDown)
-	if m.cursor != 1 {
-		t.Fatalf("cursor=%d want 1 after down", m.cursor)
-	}
-
-	m = sendKey(t, m, tea.KeyDown)
-	if m.cursor != 2 {
-		t.Fatalf("cursor=%d want 2 after second down", m.cursor)
-	}
-
-	m = sendKey(t, m, tea.KeyUp)
-	if m.cursor != 1 {
-		t.Fatalf("cursor=%d want 1 after up", m.cursor)
-	}
-}
-
-func TestUpdateMainMenuEnterActions(t *testing.T) {
-	t.Parallel()
-
-	tests := []struct {
-		name       string
-		cursor     int
-		wantScreen screen
-		wantBusy   bool
-		wantCmd    bool
-	}{
-		{name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true},
-		{name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true},
-		{name: "settings", cursor: 2, wantScreen: screenSettings, wantCmd: true},
-		{name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true},
-	}
-
-	for _, test := range tests {
-		test := test
-		t.Run(test.name, func(t *testing.T) {
-			t.Parallel()
-
-			m := newTestModel()
-			m.cursor = test.cursor
-
-			next, cmd := m.Update(tea.KeyMsg{Type: tea.KeyEnter})
-			got := next.(model)
-
-			if got.screen != test.wantScreen {
-				t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
-			}
-			if got.busy != test.wantBusy {
-				t.Fatalf("busy=%v want %v", got.busy, test.wantBusy)
-			}
-			if (cmd != nil) != test.wantCmd {
-				t.Fatalf("cmd present=%v want %v", cmd != nil, test.wantCmd)
-			}
-		})
-	}
-}
-
-func TestUpdateConfirmCancelViaKeys(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenConfirm
-	m.pendingAction = actionRunMemorySAT
-
-	next, _ := m.Update(tea.KeyMsg{Type: tea.KeyRight})
-	got := next.(model)
-	if got.cursor != 1 {
-		t.Fatalf("cursor=%d want 1 after right", got.cursor)
-	}
-
-	next, _ = got.Update(tea.KeyMsg{Type: tea.KeyEnter})
-	got = next.(model)
-	if got.screen != screenHealthCheck {
-		t.Fatalf("screen=%q want %q", got.screen, screenHealthCheck)
-	}
-	if got.cursor != 0 {
-		t.Fatalf("cursor=%d want 0 after cancel", got.cursor)
-	}
-}
-
-func TestMainMenuSimpleTransitions(t *testing.T) {
-	t.Parallel()
-
-	tests := []struct {
-		name       string
-		cursor     int
-		wantScreen screen
-	}{
-		{name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
-		{name: "settings", cursor: 2, wantScreen: screenSettings},
-	}
-
-	for _, test := range tests {
-		test := test
-		t.Run(test.name, func(t *testing.T) {
-			t.Parallel()
-
-			m := newTestModel()
-			m.cursor = test.cursor
-
-			next, cmd := m.handleMainMenu()
-			got := next.(model)
-
-			if cmd != nil {
-				t.Fatalf("expected nil cmd for %s", test.name)
-			}
-			if got.screen != test.wantScreen {
-				t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
-			}
-			if got.cursor != 0 {
-				t.Fatalf("cursor=%d want 0", got.cursor)
-			}
-		})
-	}
-}
-
-func TestMainMenuExportSetsBusy(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.cursor = 1 // Export support bundle
-
-	next, cmd := m.handleMainMenu()
-	got := next.(model)
-
-	if !got.busy {
-		t.Fatal("busy=false for export")
-	}
-	if cmd == nil {
-		t.Fatal("expected async cmd for export")
-	}
-}
-
-func TestMainViewRendersTwoColumns(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.cursor = 1
-
-	view := m.View()
-	for _, want := range []string{
-		"bee",
-		"Health Check",
-		"> Export support bundle",
-		"Settings",
-		"Exit",
-		"│",
-		"[↑↓] move",
-	} {
-		if !strings.Contains(view, want) {
-			t.Fatalf("view missing %q\nview:\n%s", want, view)
-		}
-	}
-}
-
-func TestEscapeNavigation(t *testing.T) {
-	t.Parallel()
-
-	tests := []struct {
-		name       string
-		screen     screen
-		wantScreen screen
-	}{
-		{name: "network to settings", screen: screenNetwork, wantScreen: screenSettings},
-		{name: "services to settings", screen: screenServices, wantScreen: screenSettings},
-		{name: "settings to main", screen: screenSettings, wantScreen: screenMain},
-		{name: "service action to services", screen: screenServiceAction, wantScreen: screenServices},
-		{name: "export targets to main", screen: screenExportTargets, wantScreen: screenMain},
-		{name: "interface pick to network", screen: screenInterfacePick, wantScreen: screenNetwork},
-	}
-
-	for _, test := range tests {
-		test := test
-		t.Run(test.name, func(t *testing.T) {
-			t.Parallel()
-
-			m := newTestModel()
-			m.screen = test.screen
-			m.cursor = 3
-
-			next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEsc})
-			got := next.(model)
-
-			if got.screen != test.wantScreen {
-				t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
-			}
-			if got.cursor != 0 {
-				t.Fatalf("cursor=%d want 0", got.cursor)
-			}
-		})
-	}
-}
-
-func TestHealthCheckEscReturnsToMain(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenHealthCheck
-	m.hcCursor = 3
-
-	next, _ := m.updateHealthCheck(tea.KeyMsg{Type: tea.KeyEsc})
-	got := next.(model)
-
-	if got.screen != screenMain {
-		t.Fatalf("screen=%q want %q", got.screen, screenMain)
-	}
-	if got.cursor != 0 {
-		t.Fatalf("cursor=%d want 0", got.cursor)
-	}
-}
-
-func TestOutputScreenReturnsToPreviousScreen(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenOutput
-	m.prevScreen = screenNetwork
-	m.title = "title"
-	m.body = "body"
-
-	next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEnter})
-	got := next.(model)
-
-	if got.screen != screenNetwork {
-		t.Fatalf("screen=%q want %q", got.screen, screenNetwork)
-	}
-	if got.title != "" || got.body != "" {
-		t.Fatalf("expected output state cleared, got title=%q body=%q", got.title, got.body)
-	}
-}
-
-func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenHealthCheck
-	m.hcInitialized = true
-	m.hcSel = [4]bool{true, true, true, true}
-
-	next, cmd := m.hcRunSingle(hcGPU)
-	got := next.(model)
-
-	if cmd == nil {
-		t.Fatal("expected non-nil cmd (GPU list loader)")
-	}
-	if got.screen != screenNvidiaSATSetup {
-		t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
-	}
-
-	// esc from setup returns to health check
-	next, _ = got.updateNvidiaSATSetup(tea.KeyMsg{Type: tea.KeyEsc})
-	got = next.(model)
-	if got.screen != screenHealthCheck {
-		t.Fatalf("screen after esc=%q want %q", got.screen, screenHealthCheck)
-	}
-}
-
-func TestHealthCheckRunSingleMapsActions(t *testing.T) {
-	t.Parallel()
-
-	tests := []struct {
-		idx  int
-		want actionKind
-	}{
-		{idx: hcMemory, want: actionRunMemorySAT},
-		{idx: hcStorage, want: actionRunStorageSAT},
-	}
-
-	for _, test := range tests {
-		m := newTestModel()
-		m.screen = screenHealthCheck
-		m.hcInitialized = true
-
-		next, _ := m.hcRunSingle(test.idx)
-		got := next.(model)
-		if got.pendingAction != test.want {
-			t.Fatalf("idx=%d pendingAction=%q want %q", test.idx, got.pendingAction, test.want)
-		}
-		if got.screen != screenConfirm {
-			t.Fatalf("idx=%d screen=%q want %q", test.idx, got.screen, screenConfirm)
-		}
-	}
-}
-
-func TestExportTargetSelectionOpensConfirm(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenExportTargets
-	m.targets = []platform.RemovableTarget{{Device: "/dev/sdb1", FSType: "vfat", Size: "16G"}}
-
-	next, cmd := m.handleExportTargetsMenu()
-	got := next.(model)
-
-	if cmd != nil {
-		t.Fatal("expected nil cmd")
-	}
-	if got.screen != screenConfirm {
-		t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
-	}
-	if got.pendingAction != actionExportBundle {
-		t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionExportBundle)
-	}
-	if got.selectedTarget == nil || got.selectedTarget.Device != "/dev/sdb1" {
-		t.Fatalf("selectedTarget=%+v want /dev/sdb1", got.selectedTarget)
-	}
-}
-
-func TestInterfacePickStaticIPv4OpensForm(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.pendingAction = actionStaticIPv4
-	m.interfaces = []platform.InterfaceInfo{{Name: "eth0"}}
-
-	next, cmd := m.handleInterfacePickMenu()
-	got := next.(model)
-
-	if cmd != nil {
-		t.Fatal("expected nil cmd")
-	}
-	if got.screen != screenStaticForm {
-		t.Fatalf("screen=%q want %q", got.screen, screenStaticForm)
-	}
-	if got.selectedIface != "eth0" {
-		t.Fatalf("selectedIface=%q want eth0", got.selectedIface)
-	}
-	if len(got.formFields) != 4 {
-		t.Fatalf("len(formFields)=%d want 4", len(got.formFields))
-	}
-}
-
-func TestResultMsgUsesExplicitBackScreen(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenConfirm
-
-	next, _ := m.Update(resultMsg{title: "done", body: "ok", back: screenNetwork})
-	got := next.(model)
-
-	if got.screen != screenOutput {
-		t.Fatalf("screen=%q want %q", got.screen, screenOutput)
-	}
-	if got.prevScreen != screenNetwork {
-		t.Fatalf("prevScreen=%q want %q", got.prevScreen, screenNetwork)
-	}
-}
-
-func TestConfirmCancelTarget(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-
-	m.pendingAction = actionExportBundle
-	if got := m.confirmCancelTarget(); got != screenExportTargets {
-		t.Fatalf("export cancel target=%q want %q", got, screenExportTargets)
-	}
-
-	m.pendingAction = actionRunAll
-	if got := m.confirmCancelTarget(); got != screenHealthCheck {
-		t.Fatalf("run all cancel target=%q want %q", got, screenHealthCheck)
-	}
-
-	m.pendingAction = actionRunMemorySAT
-	if got := m.confirmCancelTarget(); got != screenHealthCheck {
-		t.Fatalf("memory sat cancel target=%q want %q", got, screenHealthCheck)
-	}
-
-	m.pendingAction = actionRunStorageSAT
-	if got := m.confirmCancelTarget(); got != screenHealthCheck {
-		t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck)
-	}
-
-	m.pendingAction = actionNone
-	if got := m.confirmCancelTarget(); got != screenMain {
-		t.Fatalf("default cancel target=%q want %q", got, screenMain)
-	}
-}
-
-func TestViewBusyStateIsMinimal(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.busy = true
-
-	view := m.View()
-	want := "bee\n\nWorking...\n\n[ctrl+c] quit\n"
-	if view != want {
-		t.Fatalf("busy view mismatch\nwant:\n%s\ngot:\n%s", want, view)
-	}
-}
-
-func TestViewBusyStateUsesBusyTitle(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.busy = true
-	m.busyTitle = "Export support bundle"
-
-	view := m.View()
-
-	for _, want := range []string{
-		"Export support bundle",
-		"Working...",
-		"[ctrl+c] quit",
-	} {
-		if !strings.Contains(view, want) {
-			t.Fatalf("view missing %q\nview:\n%s", want, view)
-		}
-	}
-}
-
-func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenOutput
-	m.title = "Run audit"
-	m.body = "audit output: /appdata/bee/export/bee-audit.json\n"
-
-	view := m.View()
-
-	for _, want := range []string{
-		"Run audit",
-		"audit output: /appdata/bee/export/bee-audit.json",
-		"[enter/esc] back  [ctrl+c] quit",
-	} {
-		if !strings.Contains(view, want) {
-			t.Fatalf("view missing %q\nview:\n%s", want, view)
-		}
-	}
-}
-
-func TestViewRendersBannerModuleAboveScreenBody(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.banner = "System: Demo Server\nIP: 10.0.0.10"
-	m.width = 60
-
-	view := m.View()
-
-	for _, want := range []string{
-		"┌ MOTD ",
-		"System: Demo Server",
-		"IP: 10.0.0.10",
-		"Health Check",
-		"Export support bundle",
-	} {
-		if !strings.Contains(view, want) {
-			t.Fatalf("view missing %q\nview:\n%s", want, view)
-		}
-	}
-}
-
-func TestSnapshotMsgUpdatesBannerAndPanel(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-
-	next, cmd := m.Update(snapshotMsg{
-		banner: "System: Demo",
-		panel: app.HardwarePanelData{
-			Header: []string{"Demo header"},
-			Rows: []app.ComponentRow{
-				{Key: "CPU", Status: "PASS", Detail: "ok"},
-			},
-		},
-	})
-	got := next.(model)
-
-	if cmd != nil {
-		t.Fatal("expected nil cmd")
-	}
-	if got.banner != "System: Demo" {
-		t.Fatalf("banner=%q want %q", got.banner, "System: Demo")
-	}
-	if len(got.panel.Rows) != 1 || got.panel.Rows[0].Key != "CPU" {
-		t.Fatalf("panel rows=%+v", got.panel.Rows)
-	}
-}
-
-func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenExportTargets
-	m.targets = []platform.RemovableTarget{
-		{
-			Device:     "/dev/sdb1",
-			FSType:     "vfat",
-			Size:       "29G",
-			Label:      "BEEUSB",
-			Mountpoint: "/media/bee",
-		},
-	}
-
-	view := m.View()
-
-	for _, want := range []string{
-		"Export support bundle",
-		"Select removable filesystem",
-		"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
-	} {
-		if !strings.Contains(view, want) {
-			t.Fatalf("view missing %q\nview:\n%s", want, view)
-		}
-	}
-}
-
-func TestViewStaticFormRendersFields(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenStaticForm
-	m.selectedIface = "enp1s0"
-	m.formFields = []formField{
-		{Label: "Address", Value: "192.0.2.10/24"},
-		{Label: "Gateway", Value: "192.0.2.1"},
-		{Label: "DNS", Value: "1.1.1.1"},
-	}
-	m.formIndex = 1
-
-	view := m.View()
-
-	for _, want := range []string{
-		"Static IPv4: enp1s0",
-		"  Address: 192.0.2.10/24",
-		"> Gateway: 192.0.2.1",
-		"  DNS: 1.1.1.1",
-		"[tab/↑/↓] move  [enter] next/submit  [backspace] delete  [esc] cancel",
-	} {
-		if !strings.Contains(view, want) {
-			t.Fatalf("view missing %q\nview:\n%s", want, view)
-		}
-	}
-}
-
-func TestViewConfirmScreenMatchesPendingExport(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.screen = screenConfirm
-	m.pendingAction = actionExportBundle
-	m.selectedTarget = &platform.RemovableTarget{Device: "/dev/sdb1"}
-
-	view := m.View()
-
-	for _, want := range []string{
-		"Export support bundle",
-		"Copy support bundle to /dev/sdb1?",
-		"> Confirm",
-		"  Cancel",
-	} {
-		if !strings.Contains(view, want) {
-			t.Fatalf("view missing %q\nview:\n%s", want, view)
-		}
-	}
-}
-
-func TestResultMsgClearsBusyAndPendingAction(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-	m.busy = true
-	m.busyTitle = "Export support bundle"
-	m.pendingAction = actionExportBundle
-	m.screen = screenConfirm
-
-	next, _ := m.Update(resultMsg{title: "Export support bundle", body: "done", back: screenMain})
-	got := next.(model)
-
-	if got.busy {
-		t.Fatal("busy=true want false")
-	}
-	if got.busyTitle != "" {
-		t.Fatalf("busyTitle=%q want empty", got.busyTitle)
-	}
-	if got.pendingAction != actionNone {
-		t.Fatalf("pendingAction=%q want empty", got.pendingAction)
-	}
-}
-
-func TestResultMsgErrorWithoutBodyFormatsCleanly(t *testing.T) {
-	t.Parallel()
-
-	m := newTestModel()
-
-	next, _ := m.Update(resultMsg{title: "Export support bundle", err: assertErr("boom"), back: screenMain})
-	got := next.(model)
-
-	if got.body != "ERROR: boom" {
-		t.Fatalf("body=%q want %q", got.body, "ERROR: boom")
-	}
-}
-
-type assertErr string
-
-func (e assertErr) Error() string { return string(e) }
--- a/audit/internal/tui/types.go
+++ b/audit/internal/tui/types.go
@@ -1,205 +0,0 @@
-package tui
-
-import (
-	"strings"
-	"time"
-
-	"bee/audit/internal/app"
-	"bee/audit/internal/platform"
-	"bee/audit/internal/runtimeenv"
-
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-type screen string
-
-const (
-	screenMain             screen = "main"
-	screenHealthCheck      screen = "health_check"
-	screenSettings         screen = "settings"
-	screenNetwork          screen = "network"
-	screenInterfacePick    screen = "interface_pick"
-	screenServices         screen = "services"
-	screenServiceAction    screen = "service_action"
-	screenExportTargets    screen = "export_targets"
-	screenOutput           screen = "output"
-	screenStaticForm       screen = "static_form"
-	screenConfirm          screen = "confirm"
-	screenNvidiaSATSetup   screen = "nvidia_sat_setup"
-	screenNvidiaSATRunning screen = "nvidia_sat_running"
-	screenGPUStressRunning screen = "gpu_stress_running"
-)
-
-type actionKind string
-
-const (
-	actionNone          actionKind = ""
-	actionDHCPOne       actionKind = "dhcp_one"
-	actionStaticIPv4    actionKind = "static_ipv4"
-	actionExportBundle  actionKind = "export_bundle"
-	actionRunAll        actionKind = "run_all"
-	actionRunMemorySAT  actionKind = "run_memory_sat"
-	actionRunStorageSAT actionKind = "run_storage_sat"
-	actionRunCPUSAT     actionKind = "run_cpu_sat"
-	actionRunAMDGPUSAT   actionKind = "run_amd_gpu_sat"
-	actionRunFanStress   actionKind = "run_fan_stress"
-)
-
-type model struct {
-	app         *app.App
-	runtimeMode runtimeenv.Mode
-
-	screen       screen
-	prevScreen   screen
-	cursor       int
-	busy         bool
-	busyTitle    string
-	title        string
-	body         string
-	mainMenu     []string
-	settingsMenu []string
-	networkMenu  []string
-	serviceMenu  []string
-
-	services        []string
-	interfaces      []platform.InterfaceInfo
-	targets         []platform.RemovableTarget
-	selectedService string
-	selectedIface   string
-	selectedTarget  *platform.RemovableTarget
-	pendingAction   actionKind
-
-	formFields []formField
-	formIndex  int
-
-	// Hardware panel (right column)
-	panel       app.HardwarePanelData
-	panelFocus  bool
-	panelCursor int
-	banner      string
-
-	// Health Check screen
-	hcSel         [4]bool
-	hcMode        int
-	hcCursor      int
-	hcInitialized bool
-
-	// NVIDIA SAT setup
-	nvidiaGPUs      []platform.NvidiaGPU
-	nvidiaGPUSel    []bool
-	nvidiaDurIdx    int
-	nvidiaSATCursor int
-
-	// NVIDIA SAT running
-	nvidiaSATCancel  func()
-	nvidiaSATAborted bool
-
-	// GPU Platform Stress Test running
-	gpuStressCancel  func()
-	gpuStressAborted bool
-
-	// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
-	progressLines  []string
-	progressPrefix string
-	progressSince  time.Time
-
-	// Terminal size
-	width int
-}
-
-type formField struct {
-	Label string
-	Value string
-}
-
-func Run(application *app.App, runtimeMode runtimeenv.Mode) error {
-	options := []tea.ProgramOption{}
-	if runtimeMode != runtimeenv.ModeLiveCD {
-		options = append(options, tea.WithAltScreen())
-	}
-	program := tea.NewProgram(newModel(application, runtimeMode), options...)
-	_, err := program.Run()
-	return err
-}
-
-func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
-	return model{
-		app:         application,
-		runtimeMode: runtimeMode,
-		screen:      screenMain,
-		mainMenu: []string{
-			"Health Check",
-			"Export support bundle",
-			"Settings",
-			"Exit",
-		},
-		settingsMenu: []string{
-			"Network",
-			"Services",
-			"Re-run audit",
-			"Run self-check",
-			"Runtime issues",
-			"Audit logs",
-			"Check tools",
-			"Back",
-		},
-		networkMenu: []string{
-			"Show status",
-			"DHCP on all interfaces",
-			"DHCP on one interface",
-			"Set static IPv4",
-			"Back",
-		},
-		serviceMenu: []string{
-			"Status",
-			"Restart",
-			"Start",
-			"Stop",
-			"Back",
-		},
-	}
-}
-
-func (m model) Init() tea.Cmd {
-	return m.refreshSnapshotCmd()
-}
-
-func (m model) confirmBody() (string, string) {
-	switch m.pendingAction {
-	case actionExportBundle:
-		if m.selectedTarget == nil {
-			return "Export support bundle", "No target selected"
-		}
-		return "Export support bundle", "Copy support bundle to " + m.selectedTarget.Device + "?"
-	case actionRunAll:
-		modes := []string{"Quick", "Standard", "Express"}
-		mode := modes[m.hcMode]
-		var sel []string
-		names := []string{"GPU", "Memory", "Storage", "CPU"}
-		for i, on := range m.hcSel {
-			if on {
-				sel = append(sel, names[i])
-			}
-		}
-		if len(sel) == 0 {
-			return "Health Check", "No components selected."
-		}
-		return "Health Check", "Run: " + strings.Join(sel, " + ") + "\nMode: " + mode
-	case actionRunMemorySAT:
-		return "Memory test", "Run memtester?"
-	case actionRunStorageSAT:
-		return "Storage test", "Run storage diagnostic pack?"
-	case actionRunCPUSAT:
-		modes := []string{"Quick (60s)", "Standard (300s)", "Express (900s)"}
-		return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
-	case actionRunAMDGPUSAT:
-		return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
-	case actionRunFanStress:
-		modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
-		return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
-			"Monitors fans, temps, power — detects throttling.\n" +
-			"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
-	default:
-		return "Confirm", "Proceed?"
-	}
-}
--- a/audit/internal/tui/update.go
+++ b/audit/internal/tui/update.go
@@ -1,284 +0,0 @@
-package tui
-
-import (
-	"fmt"
-	"strings"
-
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
-	switch msg := msg.(type) {
-	case tea.WindowSizeMsg:
-		m.width = msg.Width
-		return m, nil
-	case tea.KeyMsg:
-		if m.busy {
-			if msg.String() == "ctrl+c" {
-				return m, tea.Quit
-			}
-			return m, nil
-		}
-		next, cmd := m.updateKey(msg)
-		nextModel := next.(model)
-		if shouldRefreshSnapshot(m, nextModel) {
-			return nextModel, tea.Batch(cmd, nextModel.refreshSnapshotCmd())
-		}
-		return nextModel, cmd
-	case satProgressMsg:
-		if m.busy && m.progressPrefix != "" {
-			if len(msg.lines) > 0 {
-				m.progressLines = msg.lines
-			}
-			return m, pollSATProgress(m.progressPrefix, m.progressSince)
-		}
-		return m, nil
-	case snapshotMsg:
-		m.banner = msg.banner
-		m.panel = msg.panel
-		return m, nil
-	case resultMsg:
-		m.busy = false
-		m.busyTitle = ""
-		m.progressLines = nil
-		m.progressPrefix = ""
-		m.title = msg.title
-		if msg.err != nil {
-			body := strings.TrimSpace(msg.body)
-			if body == "" {
-				m.body = fmt.Sprintf("ERROR: %v", msg.err)
-			} else {
-				m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
-			}
-		} else {
-			m.body = msg.body
-		}
-		m.pendingAction = actionNone
-		if msg.back != "" {
-			m.prevScreen = msg.back
-		} else {
-			m.prevScreen = m.screen
-		}
-		m.screen = screenOutput
-		m.cursor = 0
-		return m, m.refreshSnapshotCmd()
-	case servicesMsg:
-		m.busy = false
-		m.busyTitle = ""
-		if msg.err != nil {
-			m.title = "Services"
-			m.body = msg.err.Error()
-			m.prevScreen = screenSettings
-			m.screen = screenOutput
-			return m, m.refreshSnapshotCmd()
-		}
-		m.services = msg.services
-		m.screen = screenServices
-		m.cursor = 0
-		return m, m.refreshSnapshotCmd()
-	case interfacesMsg:
-		m.busy = false
-		m.busyTitle = ""
-		if msg.err != nil {
-			m.title = "interfaces"
-			m.body = msg.err.Error()
-			m.prevScreen = screenNetwork
-			m.screen = screenOutput
-			return m, m.refreshSnapshotCmd()
-		}
-		m.interfaces = msg.ifaces
-		m.screen = screenInterfacePick
-		m.cursor = 0
-		return m, m.refreshSnapshotCmd()
-	case exportTargetsMsg:
-		m.busy = false
-		m.busyTitle = ""
-		if msg.err != nil {
-			m.title = "export"
-			m.body = msg.err.Error()
-			m.prevScreen = screenMain
-			m.screen = screenOutput
-			return m, m.refreshSnapshotCmd()
-		}
-		m.targets = msg.targets
-		m.screen = screenExportTargets
-		m.cursor = 0
-		return m, m.refreshSnapshotCmd()
-	case nvidiaGPUsMsg:
-		return m.handleNvidiaGPUsMsg(msg)
-	case nvtopClosedMsg:
-		return m, nil
-	case gpuStressDoneMsg:
-		if m.gpuStressAborted {
-			return m, nil
-		}
-		if m.gpuStressCancel != nil {
-			m.gpuStressCancel()
-			m.gpuStressCancel = nil
-		}
-		m.prevScreen = screenHealthCheck
-		m.screen = screenOutput
-		m.title = msg.title
-		if msg.err != nil {
-			body := strings.TrimSpace(msg.body)
-			if body == "" {
-				m.body = fmt.Sprintf("ERROR: %v", msg.err)
-			} else {
-				m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
-			}
-		} else {
-			m.body = msg.body
-		}
-		return m, m.refreshSnapshotCmd()
-	case nvidiaSATDoneMsg:
-		if m.nvidiaSATAborted {
-			return m, nil
-		}
-		if m.nvidiaSATCancel != nil {
-			m.nvidiaSATCancel()
-			m.nvidiaSATCancel = nil
-		}
-		m.prevScreen = screenHealthCheck
-		m.screen = screenOutput
-		m.title = msg.title
-		if msg.err != nil {
-			body := strings.TrimSpace(msg.body)
-			if body == "" {
-				m.body = fmt.Sprintf("ERROR: %v", msg.err)
-			} else {
-				m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
-			}
-		} else {
-			m.body = msg.body
-		}
-		return m, m.refreshSnapshotCmd()
-	}
-	return m, nil
-}
-
-func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	switch m.screen {
-	case screenMain:
-		return m.updateMain(msg)
-	case screenHealthCheck:
-		return m.updateHealthCheck(msg)
-	case screenSettings:
-		return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu)
-	case screenNetwork:
-		return m.updateMenu(msg, len(m.networkMenu), m.handleNetworkMenu)
-	case screenServices:
-		return m.updateMenu(msg, len(m.services), m.handleServicesMenu)
-	case screenServiceAction:
-		return m.updateMenu(msg, len(m.serviceMenu), m.handleServiceActionMenu)
-	case screenNvidiaSATSetup:
-		return m.updateNvidiaSATSetup(msg)
-	case screenNvidiaSATRunning:
-		return m.updateNvidiaSATRunning(msg)
-	case screenGPUStressRunning:
-		return m.updateGPUStressRunning(msg)
-	case screenExportTargets:
-		return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu)
-	case screenInterfacePick:
-		return m.updateMenu(msg, len(m.interfaces), m.handleInterfacePickMenu)
-	case screenOutput:
-		switch msg.String() {
-		case "esc", "enter", "q":
-			m.screen = m.prevScreen
-			m.body = ""
-			m.title = ""
-			m.pendingAction = actionNone
-			return m, nil
-		case "ctrl+c":
-			return m, tea.Quit
-		}
-	case screenStaticForm:
-		return m.updateStaticForm(msg)
-	case screenConfirm:
-		return m.updateConfirm(msg)
-	}
-	if msg.String() == "ctrl+c" {
-		return m, tea.Quit
-	}
-	return m, nil
-}
-
-// updateMain handles keys on the main (two-column) screen.
-func (m model) updateMain(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	if m.panelFocus {
-		return m.updateMainPanel(msg)
-	}
-	// Switch focus to right panel.
-	if (msg.String() == "tab" || msg.String() == "right" || msg.String() == "l") && len(m.panel.Rows) > 0 {
-		m.panelFocus = true
-		return m, nil
-	}
-	return m.updateMenu(msg, len(m.mainMenu), m.handleMainMenu)
-}
-
-// updateMainPanel handles keys when right panel has focus.
-func (m model) updateMainPanel(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	switch msg.String() {
-	case "up", "k":
-		if m.panelCursor > 0 {
-			m.panelCursor--
-		}
-	case "down", "j":
-		if m.panelCursor < len(m.panel.Rows)-1 {
-			m.panelCursor++
-		}
-	case "enter":
-		if m.panelCursor < len(m.panel.Rows) {
-			key := m.panel.Rows[m.panelCursor].Key
-			m.busy = true
-			m.busyTitle = key
-			return m, func() tea.Msg {
-				r := m.app.ComponentDetailResult(key)
-				return resultMsg{title: r.Title, body: r.Body, back: screenMain}
-			}
-		}
-	case "tab", "left", "h", "esc":
-		m.panelFocus = false
-	case "q", "ctrl+c":
-		return m, tea.Quit
-	}
-	return m, nil
-}
-
-func (m model) updateMenu(msg tea.KeyMsg, size int, onEnter func() (tea.Model, tea.Cmd)) (tea.Model, tea.Cmd) {
-	if size == 0 {
-		size = 1
-	}
-	switch msg.String() {
-	case "up", "k":
-		if m.cursor > 0 {
-			m.cursor--
-		}
-	case "down", "j":
-		if m.cursor < size-1 {
-			m.cursor++
-		}
-	case "enter":
-		return onEnter()
-	case "esc":
-		switch m.screen {
-		case screenNetwork, screenServices:
-			m.screen = screenSettings
-			m.cursor = 0
-		case screenSettings:
-			m.screen = screenMain
-			m.cursor = 0
-		case screenServiceAction:
-			m.screen = screenServices
-			m.cursor = 0
-		case screenExportTargets:
-			m.screen = screenMain
-			m.cursor = 0
-		case screenInterfacePick:
-			m.screen = screenNetwork
-			m.cursor = 0
-		}
-	case "q", "ctrl+c":
-		return m, tea.Quit
-	}
-	return m, nil
-}
--- a/audit/internal/tui/view.go
+++ b/audit/internal/tui/view.go
@@ -1,296 +0,0 @@
-package tui
-
-import (
-	"fmt"
-	"strings"
-
-	"bee/audit/internal/platform"
-
-	tea "github.com/charmbracelet/bubbletea"
-	"github.com/charmbracelet/lipgloss"
-)
-
-// Column widths for two-column main layout.
-const leftColWidth = 30
-
-var (
-	stylePass   = lipgloss.NewStyle().Foreground(lipgloss.Color("10")) // bright green
-	styleFail   = lipgloss.NewStyle().Foreground(lipgloss.Color("9"))  // bright red
-	styleCancel = lipgloss.NewStyle().Foreground(lipgloss.Color("11")) // bright yellow
-	styleNA     = lipgloss.NewStyle().Foreground(lipgloss.Color("8"))  // dark gray
-)
-
-func colorStatus(status string) string {
-	switch status {
-	case "PASS":
-		return stylePass.Render("PASS")
-	case "FAIL":
-		return styleFail.Render("FAIL")
-	case "CANCEL":
-		return styleCancel.Render("CANC")
-	default:
-		return styleNA.Render("N/A ")
-	}
-}
-
-func (m model) View() string {
-	var body string
-	if m.busy {
-		title := "bee"
-		if m.busyTitle != "" {
-			title = m.busyTitle
-		}
-		if len(m.progressLines) > 0 {
-			var b strings.Builder
-			fmt.Fprintf(&b, "%s\n\n", title)
-			for _, l := range m.progressLines {
-				fmt.Fprintf(&b, "  %s\n", l)
-			}
-			b.WriteString("\n[ctrl+c] quit\n")
-			body = b.String()
-		} else {
-			body = fmt.Sprintf("%s\n\nWorking...\n\n[ctrl+c] quit\n", title)
-		}
-	} else {
-		switch m.screen {
-		case screenMain:
-			body = renderTwoColumnMain(m)
-		case screenHealthCheck:
-			body = renderHealthCheck(m)
-		case screenSettings:
-			body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
-		case screenNetwork:
-			body = renderMenu("Network", "Select action", m.networkMenu, m.cursor)
-		case screenServices:
-			body = renderMenu("Services", "Select service", m.services, m.cursor)
-		case screenServiceAction:
-			body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
-		case screenExportTargets:
-			body = renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
-		case screenInterfacePick:
-			body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
-		case screenStaticForm:
-			body = renderForm("Static IPv4: "+m.selectedIface, m.formFields, m.formIndex)
-		case screenConfirm:
-			title, confirmBody := m.confirmBody()
-			body = renderConfirm(title, confirmBody, m.cursor)
-		case screenNvidiaSATSetup:
-			body = renderNvidiaSATSetup(m)
-		case screenNvidiaSATRunning:
-			body = renderNvidiaSATRunning()
-		case screenGPUStressRunning:
-			body = renderGPUStressRunning()
-		case screenOutput:
-			body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back  [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
-		default:
-			body = "bee\n"
-		}
-	}
-	return m.renderWithBanner(body)
-}
-
-// renderTwoColumnMain renders the main screen with menu on the left and hardware panel on the right.
-func renderTwoColumnMain(m model) string {
-	// Left column lines
-	leftLines := []string{"bee", ""}
-	for i, item := range m.mainMenu {
-		pfx := "  "
-		if !m.panelFocus && m.cursor == i {
-			pfx = "> "
-		}
-		leftLines = append(leftLines, pfx+item)
-	}
-
-	// Right column lines
-	rightLines := buildPanelLines(m)
-
-	// Render side by side
-	var b strings.Builder
-	maxRows := max(len(leftLines), len(rightLines))
-	for i := 0; i < maxRows; i++ {
-		l := ""
-		if i < len(leftLines) {
-			l = leftLines[i]
-		}
-		r := ""
-		if i < len(rightLines) {
-			r = rightLines[i]
-		}
-		w := lipgloss.Width(l)
-		if w < leftColWidth {
-			l += strings.Repeat(" ", leftColWidth-w)
-		}
-		b.WriteString(l + " │ " + r + "\n")
-	}
-
-	sep := strings.Repeat("─", leftColWidth) + "─┴─" + strings.Repeat("─", 46)
-	b.WriteString(sep + "\n")
-
-	if m.panelFocus {
-		b.WriteString("[↑↓] move  [enter] details  [tab/←] menu  [ctrl+c] quit\n")
-	} else {
-		b.WriteString("[↑↓] move  [enter] select  [tab/→] panel  [ctrl+c] quit\n")
-	}
-
-	return b.String()
-}
-
-func buildPanelLines(m model) []string {
-	p := m.panel
-	var lines []string
-
-	for _, h := range p.Header {
-		lines = append(lines, h)
-	}
-	if len(p.Header) > 0 && len(p.Rows) > 0 {
-		lines = append(lines, "")
-	}
-
-	for i, row := range p.Rows {
-		pfx := "  "
-		if m.panelFocus && m.panelCursor == i {
-			pfx = "> "
-		}
-		status := colorStatus(row.Status)
-		lines = append(lines, fmt.Sprintf("%s%s  %-4s  %s", pfx, status, row.Key, row.Detail))
-	}
-
-	return lines
-}
-
-func renderTargetItems(targets []platform.RemovableTarget) []string {
-	items := make([]string, 0, len(targets))
-	for _, target := range targets {
-		desc := fmt.Sprintf("%s [%s %s]", target.Device, target.FSType, target.Size)
-		if target.Label != "" {
-			desc += " label=" + target.Label
-		}
-		if target.Mountpoint != "" {
-			desc += " mounted=" + target.Mountpoint
-		}
-		items = append(items, desc)
-	}
-	return items
-}
-
-func renderInterfaceItems(interfaces []platform.InterfaceInfo) []string {
-	items := make([]string, 0, len(interfaces))
-	for _, iface := range interfaces {
-		label := iface.Name
-		if len(iface.IPv4) > 0 {
-			label += " [" + strings.Join(iface.IPv4, ", ") + "]"
-		}
-		items = append(items, label)
-	}
-	return items
-}
-
-func renderMenu(title, subtitle string, items []string, cursor int) string {
-	var body strings.Builder
-	fmt.Fprintf(&body, "%s\n\n%s\n\n", title, subtitle)
-	if len(items) == 0 {
-		body.WriteString("(no items)\n")
-	} else {
-		for i, item := range items {
-			prefix := "  "
-			if i == cursor {
-				prefix = "> "
-			}
-			fmt.Fprintf(&body, "%s%s\n", prefix, item)
-		}
-	}
-	body.WriteString("\n[↑/↓] move  [enter] select  [esc] back  [ctrl+c] quit\n")
-	return body.String()
-}
-
-func renderForm(title string, fields []formField, idx int) string {
-	var body strings.Builder
-	fmt.Fprintf(&body, "%s\n\n", title)
-	for i, field := range fields {
-		prefix := "  "
-		if i == idx {
-			prefix = "> "
-		}
-		fmt.Fprintf(&body, "%s%s: %s\n", prefix, field.Label, field.Value)
-	}
-	body.WriteString("\n[tab/↑/↓] move  [enter] next/submit  [backspace] delete  [esc] cancel\n")
-	return body.String()
-}
-
-func renderConfirm(title, body string, cursor int) string {
-	options := []string{"Confirm", "Cancel"}
-	var out strings.Builder
-	fmt.Fprintf(&out, "%s\n\n%s\n\n", title, body)
-	for i, option := range options {
-		prefix := "  "
-		if i == cursor {
-			prefix = "> "
-		}
-		fmt.Fprintf(&out, "%s%s\n", prefix, option)
-	}
-	out.WriteString("\n[←/→/↑/↓] move  [enter] select  [esc] cancel\n")
-	return out.String()
-}
-
-func resultCmd(title, body string, err error, back screen) tea.Cmd {
-	return func() tea.Msg {
-		return resultMsg{title: title, body: body, err: err, back: back}
-	}
-}
-
-func (m model) renderWithBanner(body string) string {
-	body = strings.TrimRight(body, "\n")
-	banner := renderBannerModule(m.banner, m.width)
-	if banner == "" {
-		if body == "" {
-			return ""
-		}
-		return body + "\n"
-	}
-	if body == "" {
-		return banner + "\n"
-	}
-	return banner + "\n\n" + body + "\n"
-}
-
-func renderBannerModule(banner string, width int) string {
-	banner = strings.TrimSpace(banner)
-	if banner == "" {
-		return ""
-	}
-
-	lines := strings.Split(banner, "\n")
-	contentWidth := 0
-	for _, line := range lines {
-		if w := lipgloss.Width(line); w > contentWidth {
-			contentWidth = w
-		}
-	}
-	if width > 0 && width-4 > contentWidth {
-		contentWidth = width - 4
-	}
-	if contentWidth < 20 {
-		contentWidth = 20
-	}
-
-	label := " MOTD "
-	topFill := contentWidth + 2 - lipgloss.Width(label)
-	if topFill < 0 {
-		topFill = 0
-	}
-
-	var b strings.Builder
-	b.WriteString("┌" + label + strings.Repeat("─", topFill) + "┐\n")
-	for _, line := range lines {
-		b.WriteString("│ " + padRight(line, contentWidth) + " │\n")
-	}
-	b.WriteString("└" + strings.Repeat("─", contentWidth+2) + "┘")
-	return b.String()
-}
-
-func padRight(value string, width int) string {
-	if gap := width - lipgloss.Width(value); gap > 0 {
-		return value + strings.Repeat(" ", gap)
-	}
-	return value
-}
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -0,0 +1,204 @@
+package webui
+
+import (
+	"encoding/json"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+)
+
+func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
+	req.ContentLength = -1
+	rec := httptest.NewRecorder()
+
+	h.handleAPISATRun("cpu").ServeHTTP(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 1 {
+		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
+		t.Fatalf("burn profile=%q want smoke", got)
+	}
+}
+
+func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 3, Name: "NVIDIA H100 PCIe"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBenchmarkNvidiaRun(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 1 {
+		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
+	}
+	task := globalQueue.tasks[0]
+	if task.Target != "nvidia-benchmark" {
+		t.Fatalf("target=%q want nvidia-benchmark", task.Target)
+	}
+	if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
+		t.Fatalf("gpu indices=%v want [1 3]", got)
+	}
+	if task.params.RunNCCL {
+		t.Fatal("RunNCCL should reflect explicit false from request")
+	}
+}
+
+func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H200 NVL"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBenchmarkNvidiaRun(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	var resp taskRunResponse
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode response: %v", err)
+	}
+	if len(resp.TaskIDs) != 2 {
+		t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
+		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
+	}
+	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
+		t.Fatalf("task[1] gpu indices=%v want [2]", got)
+	}
+}
+
+func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H200 NVL"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
+		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
+	}
+	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
+		t.Fatalf("task[1] gpu indices=%v want [2]", got)
+	}
+}
+
+func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
+	h := &handler{}
+	h.pushFanRings([]platform.FanReading{
+		{Name: "FAN_A", RPM: 4200},
+		{Name: "FAN_B", RPM: 5100},
+	})
+	h.pushFanRings([]platform.FanReading{
+		{Name: "FAN_B", RPM: 5200},
+	})
+
+	if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
+		t.Fatalf("fanNames=%v", h.fanNames)
+	}
+	aVals, _ := h.ringFans[0].snapshot()
+	bVals, _ := h.ringFans[1].snapshot()
+	if len(aVals) != 2 || len(bVals) != 2 {
+		t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
+	}
+	if aVals[1] != 4200 {
+		t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
+	}
+	if bVals[1] != 5200 {
+		t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
+	}
+}
--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -0,0 +1,871 @@
+package webui
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+// chartTimelineSegment is one contiguous span of a chart's time range.
+// Active marks spans during which at least one task was running; inactive
+// spans are shaded as idle by writeTimelineIdleSpans.
+type chartTimelineSegment struct {
+	Start  time.Time
+	End    time.Time
+	Active bool
+}
+
+// chartScale describes a Y axis: the value range mapped onto the plot height
+// and the tick values drawn along it. The chart builders in this file set Min
+// and Max to the first and last tick.
+type chartScale struct {
+	Min   float64
+	Max   float64
+	Ticks []float64
+}
+
+// chartLayout holds the SVG canvas size and the edges of the plot rectangle
+// inside it, all in SVG user units.
+type chartLayout struct {
+	Width      int
+	Height     int
+	PlotLeft   int
+	PlotRight  int
+	PlotTop    int
+	PlotBottom int
+}
+
+// metricChartSeries is one plotted line: its legend name, the title drawn
+// above its own axis (used by the GPU overview chart), its stroke color, and
+// its sample values.
+type metricChartSeries struct {
+	Name      string
+	AxisTitle string
+	Color     string
+	Values    []float64
+}
+
+// metricChartPalette lists the stroke colors for single-axis charts.
+// renderMetricChartSVG assigns them by series index, wrapping around when
+// there are more series than colors.
+var metricChartPalette = []string{
+	"#5794f2",
+	"#73bf69",
+	"#f2cc0c",
+	"#ff9830",
+	"#f2495c",
+	"#b877d9",
+	"#56d2f7",
+	"#8ab8ff",
+	"#9adf8f",
+	"#ffbe5c",
+}
+
+// gpuLabelCache memoizes GPU model names keyed by GPU index.
+// gpuModelNameByIndex reloads it under mu when it is empty or older than
+// 30 seconds.
+var gpuLabelCache struct {
+	mu       sync.Mutex
+	loadedAt time.Time
+	byIndex  map[int]string
+}
+
+// renderMetricChartSVG renders one or more series that share a single Y axis
+// into an SVG document.
+//
+// labels and times describe the X axis sample by sample; whichever is shorter
+// is padded to the longer one (labels with empty strings, times with
+// synthesized values). datasets[i] is drawn under names[i]; an empty dataset is
+// replaced in place with zeros. yMin and yMax, when non-nil, pin the
+// corresponding end of the Y range. timeline, when non-empty, shades idle spans
+// and draws boundaries between segments.
+//
+// Series longer than 1400 points are downsampled before drawing. The labels
+// are reduced to the same retained samples so that each X axis label is drawn
+// at the position of the sample it belongs to.
+//
+// The returned error is always nil.
+func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	if pointCount == 0 {
+		pointCount = 1
+		labels = []string{""}
+		times = []time.Time{time.Time{}}
+	}
+	if len(labels) < pointCount {
+		padded := make([]string, pointCount)
+		copy(padded, labels)
+		labels = padded
+	}
+	if len(times) < pointCount {
+		times = synthesizeChartTimes(times, pointCount)
+	}
+	for i := range datasets {
+		if len(datasets[i]) == 0 {
+			datasets[i] = make([]float64, pointCount)
+		}
+	}
+
+	// Downsample to at most ~1400 points (one per pixel) before building SVG.
+	fullTimes := times
+	times, datasets = downsampleTimeSeries(times, datasets, 1400)
+	if len(times) != len(fullTimes) {
+		// The retained timestamps are an in-order subsequence of fullTimes, so
+		// a single forward scan recovers which label belongs to each of them.
+		kept := make([]string, len(times))
+		src := 0
+		for i, ts := range times {
+			for src < len(fullTimes) && !fullTimes[src].Equal(ts) {
+				src++
+			}
+			if src < len(fullTimes) && src < len(labels) {
+				kept[i] = labels[src]
+				src++
+			}
+		}
+		labels = kept
+	}
+	pointCount = len(times)
+
+	statsLabel := chartStatsLabel(datasets)
+
+	legendItems := make([]metricChartSeries, 0, len(names))
+	for i, name := range names {
+		color := metricChartPalette[i%len(metricChartPalette)]
+		values := make([]float64, pointCount)
+		if i < len(datasets) {
+			copy(values, coalesceDataset(datasets[i], pointCount))
+		}
+		legendItems = append(legendItems, metricChartSeries{
+			Name:   name,
+			Color:  color,
+			Values: values,
+		})
+	}
+
+	scale := singleAxisChartScale(datasets, yMin, yMax)
+	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
+	start, end := chartTimeBounds(times)
+
+	var b strings.Builder
+	writeSVGOpen(&b, layout.Width, layout.Height)
+	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
+	writeTimelineIdleSpans(&b, layout, start, end, timeline)
+	writeVerticalGrid(&b, layout, times, pointCount, 8)
+	writeHorizontalGrid(&b, layout, scale)
+	writeTimelineBoundaries(&b, layout, start, end, timeline)
+	writePlotBorder(&b, layout)
+	writeSingleAxisY(&b, layout, scale)
+	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
+	for _, item := range legendItems {
+		writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
+	}
+	writeLegend(&b, layout, legendItems)
+	writeSVGClose(&b)
+	return []byte(b.String()), nil
+}
+
+// renderGPUOverviewChartSVG draws the temperature / power / core-clock
+// overview chart for the GPU with the given index. The boolean result is false
+// (with a nil SVG and nil error) when none of the three metrics has a dataset
+// for that GPU.
+func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
+	tempC := gpuDatasetByIndex(samples, idx, func(row platform.GPUMetricRow) float64 { return row.TempC })
+	powerW := gpuDatasetByIndex(samples, idx, func(row platform.GPUMetricRow) float64 { return row.PowerW })
+	clockMHz := gpuDatasetByIndex(samples, idx, func(row platform.GPUMetricRow) float64 { return row.ClockMHz })
+	if tempC == nil && powerW == nil && clockMHz == nil {
+		return nil, false, nil
+	}
+
+	labels := sampleTimeLabels(samples)
+	times := sampleTimes(samples)
+	title := gpuDisplayLabel(idx) + " Overview"
+	width := len(labels)
+	series := []metricChartSeries{
+		{Name: "Temp C", Values: coalesceDataset(tempC, width), Color: "#f05a5a", AxisTitle: "Temp C"},
+		{Name: "Power W", Values: coalesceDataset(powerW, width), Color: "#ffb357", AxisTitle: "Power W"},
+		{Name: "Core Clock MHz", Values: coalesceDataset(clockMHz, width), Color: "#73bf69", AxisTitle: "Core MHz"},
+	}
+
+	svg, err := drawGPUOverviewChartSVG(title, labels, times, series, timeline)
+	if err != nil {
+		return nil, false, err
+	}
+	return svg, true, nil
+}
+
+// drawGPUOverviewChartSVG renders exactly three series on a shared time axis,
+// each with its own Y axis: series[0] and series[1] get axes to the left of
+// the plot and series[2] gets one to the right. Any other number of series is
+// an error.
+//
+// labels and times are padded to a common length the same way as in
+// renderMetricChartSVG, and an empty series is replaced with zeros. Series
+// longer than 1400 points are downsampled; labels are reduced to the same
+// retained samples so X axis labels stay attached to the right timestamps.
+// Horizontal grid lines follow the scale of series[0].
+func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
+	if len(series) != 3 {
+		return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
+	}
+	const (
+		width      = 1400
+		height     = 840
+		plotLeft   = 180
+		plotRight  = 1220
+		plotTop    = 96
+		plotBottom = 660
+	)
+	const (
+		leftOuterAxis  = 72
+		leftInnerAxis  = 132
+		rightInnerAxis = 1268
+	)
+	layout := chartLayout{
+		Width:      width,
+		Height:     height,
+		PlotLeft:   plotLeft,
+		PlotRight:  plotRight,
+		PlotTop:    plotTop,
+		PlotBottom: plotBottom,
+	}
+	axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	if pointCount == 0 {
+		pointCount = 1
+		labels = []string{""}
+		times = []time.Time{time.Time{}}
+	}
+	if len(labels) < pointCount {
+		padded := make([]string, pointCount)
+		copy(padded, labels)
+		labels = padded
+	}
+	if len(times) < pointCount {
+		times = synthesizeChartTimes(times, pointCount)
+	}
+	for i := range series {
+		if len(series[i].Values) == 0 {
+			series[i].Values = make([]float64, pointCount)
+		}
+	}
+
+	// Downsample to at most ~1400 points before building SVG.
+	{
+		datasets := make([][]float64, len(series))
+		for i := range series {
+			datasets[i] = series[i].Values
+		}
+		fullTimes := times
+		times, datasets = downsampleTimeSeries(times, datasets, 1400)
+		if len(times) != len(fullTimes) {
+			// The retained timestamps are an in-order subsequence of
+			// fullTimes; walk both once to pick the matching labels.
+			kept := make([]string, len(times))
+			src := 0
+			for i, ts := range times {
+				for src < len(fullTimes) && !fullTimes[src].Equal(ts) {
+					src++
+				}
+				if src < len(fullTimes) && src < len(labels) {
+					kept[i] = labels[src]
+					src++
+				}
+			}
+			labels = kept
+		}
+		pointCount = len(times)
+		for i := range series {
+			series[i].Values = datasets[i]
+		}
+	}
+
+	// Every series is scaled independently so that all three fill the plot.
+	scales := make([]chartScale, len(series))
+	for i := range series {
+		lo, hi := chartSeriesBounds(series[i].Values)
+		ticks := chartNiceTicks(lo, hi, 8)
+		scales[i] = chartScale{
+			Min:   ticks[0],
+			Max:   ticks[len(ticks)-1],
+			Ticks: ticks,
+		}
+	}
+	start, end := chartTimeBounds(times)
+
+	var b strings.Builder
+	writeSVGOpen(&b, width, height)
+	writeChartFrame(&b, title, "", width, height)
+	writeTimelineIdleSpans(&b, layout, start, end, timeline)
+	writeVerticalGrid(&b, layout, times, pointCount, 8)
+	writeHorizontalGrid(&b, layout, scales[0])
+	writeTimelineBoundaries(&b, layout, start, end, timeline)
+	writePlotBorder(&b, layout)
+
+	for i, axisLineX := range axisX {
+		fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
+			axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
+		fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
+			axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
+		for _, tick := range scales[i].Ticks {
+			y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
+			label := sanitizeChartText(chartYAxisNumber(tick))
+			if i < 2 {
+				// Left-hand axes: tick marks point toward the plot, numbers
+				// sit to the left of the axis line.
+				fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
+					axisLineX, y, axisLineX+6, y, series[i].Color)
+				fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
+					axisLineX-8, y, series[i].Color, label)
+				continue
+			}
+			// Right-hand axis: mirrored tick direction and label anchor.
+			fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
+				axisLineX, y, axisLineX-6, y, series[i].Color)
+			fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
+				axisLineX+8, y, series[i].Color, label)
+		}
+	}
+
+	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
+	for i := range series {
+		writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
+	}
+	writeLegend(&b, layout, series)
+	writeSVGClose(&b)
+	return []byte(b.String()), nil
+}
+
+// metricsTimelineSegments builds the active/idle timeline covering the time
+// range of samples, using a snapshot of the global task queue. It returns nil
+// when there are no samples or when either end of the range is the zero time.
+func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
+	if len(samples) == 0 {
+		return nil
+	}
+	first, last := chartTimeBounds(sampleTimes(samples))
+	if first.IsZero() || last.IsZero() {
+		return nil
+	}
+	history := snapshotTaskHistory()
+	return chartTimelineSegmentsForRange(first, last, now, history)
+}
+
+// snapshotTaskHistory returns value copies of every task currently held by
+// globalQueue, taken while holding the queue mutex.
+func snapshotTaskHistory() []Task {
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+
+	history := make([]Task, 0, len(globalQueue.tasks))
+	for _, task := range globalQueue.tasks {
+		history = append(history, *task)
+	}
+	return history
+}
+
+// chartTimelineSegmentsForRange splits [start, end] into alternating idle and
+// active segments based on when tasks were running.
+//
+// A task counts as running from StartedAt until DoneAt, or until now when it
+// has not finished. Tasks that never started, have a non-positive duration, or
+// lie entirely outside the range are ignored; the rest are clipped to the
+// range and overlapping or touching intervals are merged. The result always
+// covers the whole range; with no running tasks it is a single idle segment.
+// A zero start or end yields nil, and a reversed range is swapped.
+func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
+	if start.IsZero() || end.IsZero() {
+		return nil
+	}
+	if end.Before(start) {
+		start, end = end, start
+	}
+
+	type span struct {
+		from time.Time
+		to   time.Time
+	}
+
+	// Collect the clipped running interval of every relevant task.
+	busy := make([]span, 0, len(tasks))
+	for i := range tasks {
+		task := &tasks[i]
+		if task.StartedAt == nil {
+			continue
+		}
+		from := task.StartedAt.UTC()
+		to := now.UTC()
+		if task.DoneAt != nil {
+			to = task.DoneAt.UTC()
+		}
+		if !to.After(from) {
+			continue
+		}
+		if to.Before(start) || from.After(end) {
+			continue
+		}
+		if from.Before(start) {
+			from = start
+		}
+		if to.After(end) {
+			to = end
+		}
+		busy = append(busy, span{from: from, to: to})
+	}
+
+	// Order by start time, then by end time, so merging is a single pass.
+	sort.Slice(busy, func(i, j int) bool {
+		if !busy[i].from.Equal(busy[j].from) {
+			return busy[i].from.Before(busy[j].from)
+		}
+		return busy[i].to.Before(busy[j].to)
+	})
+
+	merged := make([]span, 0, len(busy))
+	for _, cur := range busy {
+		if n := len(merged); n > 0 && !cur.from.After(merged[n-1].to) {
+			if cur.to.After(merged[n-1].to) {
+				merged[n-1].to = cur.to
+			}
+			continue
+		}
+		merged = append(merged, cur)
+	}
+
+	// Emit idle gaps between the merged active spans.
+	segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
+	cursor := start
+	for _, run := range merged {
+		if run.from.After(cursor) {
+			segments = append(segments, chartTimelineSegment{Start: cursor, End: run.from, Active: false})
+		}
+		segments = append(segments, chartTimelineSegment{Start: run.from, End: run.to, Active: true})
+		cursor = run.to
+	}
+	if cursor.Before(end) {
+		segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
+	}
+	if len(segments) == 0 {
+		segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
+	}
+	return segments
+}
+
+// sampleTimes extracts the timestamp of every sample, preserving order.
+func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
+	stamps := make([]time.Time, len(samples))
+	for i := range samples {
+		stamps[i] = samples[i].Timestamp
+	}
+	return stamps
+}
+
+// singleAxisChartScale derives the Y scale for a single-axis chart. When both
+// yMin and yMax are supplied they are used as-is; otherwise the range comes
+// from the combined data, with whichever bound was supplied overriding its
+// end. The range is then widened to the surrounding tick values.
+func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
+	var lo, hi float64
+	switch {
+	case yMin != nil && yMax != nil:
+		lo, hi = *yMin, *yMax
+	default:
+		lo, hi = chartSeriesBounds(flattenDatasets(datasets))
+		if yMin != nil {
+			lo = *yMin
+		}
+		if yMax != nil {
+			hi = *yMax
+		}
+	}
+	ticks := chartNiceTicks(lo, hi, 8)
+	first, last := ticks[0], ticks[len(ticks)-1]
+	return chartScale{Min: first, Max: last, Ticks: ticks}
+}
+
+// flattenDatasets concatenates all datasets, in order, into one new slice.
+func flattenDatasets(datasets [][]float64) []float64 {
+	size := 0
+	for i := range datasets {
+		size += len(datasets[i])
+	}
+	flat := make([]float64, size)
+	pos := 0
+	for i := range datasets {
+		pos += copy(flat[pos:], datasets[i])
+	}
+	return flat
+}
+
+// singleAxisChartLayout computes the geometry of a single-axis chart on a
+// 1400-unit-wide canvas of the given height. When the legend is shown, space
+// for it (up to four entries per row, 24 units per row plus 24 of padding) is
+// reserved below the plot.
+func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
+	legendHeight := 0
+	if chartLegendVisible(seriesCount) && seriesCount > 0 {
+		perRow := 4
+		if seriesCount < perRow {
+			perRow = seriesCount
+		}
+		rows := (seriesCount + perRow - 1) / perRow
+		if rows > 0 {
+			legendHeight = rows*24 + 24
+		}
+	}
+
+	layout := chartLayout{
+		Width:    1400,
+		Height:   canvasHeight,
+		PlotLeft: 96,
+		PlotTop:  72,
+	}
+	layout.PlotRight = 1352
+	layout.PlotBottom = canvasHeight - 60 - legendHeight
+	return layout
+}
+
+// chartTimeBounds returns the earliest and latest of the given instants, both
+// converted to UTC. An empty input yields two zero times.
+func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
+	var earliest, latest time.Time
+	for i, raw := range times {
+		ts := raw.UTC()
+		switch {
+		case i == 0:
+			earliest, latest = ts, ts
+		case ts.Before(earliest):
+			earliest = ts
+		case ts.After(latest):
+			latest = ts
+		}
+	}
+	return earliest, latest
+}
+
+// synthesizeChartTimes returns count timestamps for a chart whose real
+// timestamps are missing or do not match the number of points.
+//
+// A non-positive count yields nil, and an input that already has count
+// entries is returned unchanged. With exactly one input timestamp the result
+// counts up from it in one-minute steps; in every other case the input is
+// ignored and the result is one-minute steps ending at the current UTC time.
+func synthesizeChartTimes(times []time.Time, count int) []time.Time {
+	if count <= 0 {
+		return nil
+	}
+	if len(times) == count {
+		return times
+	}
+
+	var origin time.Time
+	if len(times) == 1 {
+		origin = times[0]
+	} else {
+		origin = time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
+	}
+
+	synthesized := make([]time.Time, count)
+	for step := range synthesized {
+		synthesized[step] = origin.Add(time.Duration(step) * time.Minute)
+	}
+	return synthesized
+}
+
+// writeSVGOpen writes the opening <svg> tag with matching size and viewBox.
+func writeSVGOpen(b *strings.Builder, width, height int) {
+	const openTag = `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">` + "\n"
+	fmt.Fprintf(b, openTag, width, height, width, height)
+}
+
+// writeSVGClose writes the closing </svg> tag followed by a newline.
+func writeSVGClose(b *strings.Builder) {
+	const closeTag = "</svg>\n"
+	b.WriteString(closeTag)
+}
+
+// writeChartFrame draws the rounded background rectangle and the centered
+// title. The subtitle line is emitted only when subtitle contains something
+// other than whitespace.
+func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
+	centerX := width / 2
+	fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
+	fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
+		centerX, sanitizeChartText(title))
+	if strings.TrimSpace(subtitle) == "" {
+		return
+	}
+	fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
+		centerX, sanitizeChartText(subtitle))
+}
+
+// writePlotBorder outlines the plot rectangle described by layout.
+func writePlotBorder(b *strings.Builder, layout chartLayout) {
+	plotWidth := layout.PlotRight - layout.PlotLeft
+	plotHeight := layout.PlotBottom - layout.PlotTop
+	fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
+		layout.PlotLeft, layout.PlotTop, plotWidth, plotHeight)
+}
+
+// writeHorizontalGrid draws one horizontal grid line across the plot for
+// every tick of scale.
+func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
+	left, right := layout.PlotLeft, layout.PlotRight
+	b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
+	for i := range scale.Ticks {
+		lineY := chartYForValue(scale.Ticks[i], scale, layout.PlotTop, layout.PlotBottom)
+		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
+			left, lineY, right, lineY)
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// writeVerticalGrid draws vertical grid lines at the sample indices chosen by
+// gpuChartLabelIndices for pointCount points and the requested target count.
+// Nothing is written when pointCount is not positive.
+func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
+	if pointCount <= 0 {
+		return
+	}
+	first, last := chartTimeBounds(times)
+	top, bottom := layout.PlotTop, layout.PlotBottom
+	b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
+	for _, sampleIdx := range gpuChartLabelIndices(pointCount, target) {
+		lineX := chartXForTime(chartPointTime(times, sampleIdx), first, last, layout.PlotLeft, layout.PlotRight)
+		fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
+			lineX, top, lineX, bottom)
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// writeSingleAxisY draws the Y axis line on the left edge of the plot plus a
+// tick mark and a right-aligned number for every tick of scale.
+func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
+	axisX := layout.PlotLeft
+	fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
+		axisX, layout.PlotTop, axisX, layout.PlotBottom)
+	for i := range scale.Ticks {
+		tick := scale.Ticks[i]
+		tickY := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
+		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
+			axisX, tickY, axisX-6, tickY)
+		text := sanitizeChartText(chartYAxisNumber(tick))
+		fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
+			axisX-10, tickY, text)
+	}
+}
+
+// writeXAxisLabels writes the X axis tick labels and the "Time" axis caption.
+// Label positions come from the timestamp at each chosen index; an index
+// beyond the end of labels produces an empty label.
+func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
+	total := len(labels)
+	if len(times) > total {
+		total = len(times)
+	}
+	labelY := layout.PlotBottom + 28
+	b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
+	for _, sampleIdx := range gpuChartLabelIndices(total, target) {
+		labelX := chartXForTime(chartPointTime(times, sampleIdx), start, end, layout.PlotLeft, layout.PlotRight)
+		var text string
+		if sampleIdx < len(labels) {
+			text = labels[sampleIdx]
+		}
+		fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", labelX, labelY, sanitizeChartText(text))
+	}
+	b.WriteString(`</g>` + "\n")
+	captionX := (layout.PlotLeft + layout.PlotRight) / 2
+	fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
+		captionX, layout.PlotBottom+48)
+}
+
+// writeSeriesPolyline draws one series as a polyline. A single-point series
+// additionally gets a dot so it is visible; longer series get a marker (a
+// circle with a small triangle above it) on the peak value, where the last
+// occurrence of the maximum wins. Nothing is written for an empty series.
+func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
+	if len(values) == 0 {
+		return
+	}
+	xAt := func(idx int) float64 {
+		return chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
+	}
+	yAt := func(value float64) float64 {
+		return chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
+	}
+
+	var coords strings.Builder
+	for idx := range values {
+		if idx > 0 {
+			coords.WriteByte(' ')
+		}
+		px := xAt(idx)
+		py := yAt(values[idx])
+		coords.WriteString(strconv.FormatFloat(px, 'f', 1, 64))
+		coords.WriteByte(',')
+		coords.WriteString(strconv.FormatFloat(py, 'f', 1, 64))
+	}
+	fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
+		coords.String(), color)
+
+	if len(values) == 1 {
+		fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", xAt(0), yAt(values[0]), color)
+		return
+	}
+
+	// ">=" keeps the latest index among equal maxima.
+	peakAt, peak := 0, values[0]
+	for idx := 1; idx < len(values); idx++ {
+		if values[idx] >= peak {
+			peakAt, peak = idx, values[idx]
+		}
+	}
+	markX := xAt(peakAt)
+	markY := yAt(peak)
+	fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", markX, markY, color)
+	fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
+		markX, markY-10, markX-5, markY-18, markX+5, markY-18, color)
+}
+
+// writeLegend draws a color swatch and name for every series below the plot,
+// laid out in rows of at most four evenly spaced cells. Nothing is written
+// when the legend is not visible for this series count or there are no series.
+func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
+	if !chartLegendVisible(len(series)) || len(series) == 0 {
+		return
+	}
+	perRow := 4
+	if len(series) < perRow {
+		perRow = len(series)
+	}
+	cell := float64(layout.PlotRight-layout.PlotLeft) / float64(perRow)
+	top := layout.PlotBottom + 74
+	for i := range series {
+		gridRow, gridCol := i/perRow, i%perRow
+		swatchX := float64(layout.PlotLeft) + cell*float64(gridCol) + 8
+		swatchY := float64(top + gridRow*24)
+		fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
+			swatchX, swatchY, swatchX+28, swatchY, series[i].Color)
+		fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
+			swatchX+38, swatchY+4, sanitizeChartText(series[i].Name))
+	}
+}
+
+// writeTimelineIdleSpans shades every inactive timeline segment with a
+// translucent rectangle spanning the plot height. Active segments and
+// segments without positive duration are skipped; a shaded span is at least
+// one unit wide. Nothing is written when there are no segments.
+func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
+	if len(segments) == 0 {
+		return
+	}
+	plotHeight := layout.PlotBottom - layout.PlotTop
+	b.WriteString(`<g data-role="timeline-overlay">` + "\n")
+	for i := range segments {
+		seg := segments[i]
+		idle := !seg.Active && seg.End.After(seg.Start)
+		if !idle {
+			continue
+		}
+		fromX := chartXForTime(seg.Start, start, end, layout.PlotLeft, layout.PlotRight)
+		toX := chartXForTime(seg.End, start, end, layout.PlotLeft, layout.PlotRight)
+		fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
+			fromX, layout.PlotTop, math.Max(1, toX-fromX), plotHeight)
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// writeTimelineBoundaries draws a vertical line at every interior boundary
+// between timeline segments: the start of each segment but the first and the
+// end of each segment but the last. Boundaries that round to the same X
+// coordinate are drawn only once.
+func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
+	if len(segments) == 0 {
+		return
+	}
+	drawn := map[int]bool{}
+	b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
+	mark := func(at time.Time) {
+		x := int(math.Round(chartXForTime(at, start, end, layout.PlotLeft, layout.PlotRight)))
+		if drawn[x] {
+			return
+		}
+		drawn[x] = true
+		fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
+	}
+	lastIdx := len(segments) - 1
+	for i := range segments {
+		if i > 0 {
+			mark(segments[i].Start)
+		}
+		if i < lastIdx {
+			mark(segments[i].End)
+		}
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// downsampleTimeSeries thins a time series with min-max bucketing.
+//
+// The samples are divided into maxPts/2 buckets (at least one). From each
+// bucket the indices of the smallest and largest value of the reference
+// series are kept, in index order; the reference is the first dataset whose
+// length equals len(times). Without such a dataset the first and last index of
+// each bucket are kept instead. Every dataset of full length is sampled at the
+// same indices so the series stay aligned; datasets of any other length are
+// passed through untouched.
+//
+// When len(times) <= maxPts, or maxPts is not positive, the inputs are
+// returned unchanged.
+func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
+	total := len(times)
+	if maxPts <= 0 || total <= maxPts {
+		return times, datasets
+	}
+	bucketCount := maxPts / 2
+	if bucketCount < 1 {
+		bucketCount = 1
+	}
+
+	// Pick the reference series that decides which indices survive.
+	var ref []float64
+	for _, candidate := range datasets {
+		if len(candidate) == total {
+			ref = candidate
+			break
+		}
+	}
+
+	keep := make([]int, 0, maxPts)
+	width := float64(total) / float64(bucketCount)
+	for bucket := 0; bucket < bucketCount; bucket++ {
+		from := int(math.Round(float64(bucket) * width))
+		to := int(math.Round(float64(bucket+1) * width))
+		if to > total {
+			to = total
+		}
+		if from >= to {
+			continue
+		}
+		// Default to the bucket edges; refine to extremes when a reference
+		// series is available.
+		first, second := from, to-1
+		if ref != nil {
+			lowest, highest := from, from
+			for i := from + 1; i < to; i++ {
+				if ref[i] < ref[lowest] {
+					lowest = i
+				}
+				if ref[i] > ref[highest] {
+					highest = i
+				}
+			}
+			first, second = lowest, highest
+			if first > second {
+				first, second = second, first
+			}
+		}
+		keep = append(keep, first)
+		if second != first {
+			keep = append(keep, second)
+		}
+	}
+
+	thinnedTimes := make([]time.Time, len(keep))
+	for pos, idx := range keep {
+		thinnedTimes[pos] = times[idx]
+	}
+	thinnedSets := make([][]float64, len(datasets))
+	for d := range datasets {
+		source := datasets[d]
+		if len(source) != total {
+			thinnedSets[d] = source
+			continue
+		}
+		picked := make([]float64, len(keep))
+		for pos, idx := range keep {
+			picked[pos] = source[idx]
+		}
+		thinnedSets[d] = picked
+	}
+	return thinnedTimes, thinnedSets
+}
+
+// chartXForTime maps ts linearly from [start, end] onto [left, right]. ts is
+// clamped to the range first. When end is not after start the horizontal
+// midpoint is returned.
+func chartXForTime(ts, start, end time.Time, left, right int) float64 {
+	if !end.After(start) {
+		return float64(left+right) / 2
+	}
+	clamped := ts
+	switch {
+	case clamped.Before(start):
+		clamped = start
+	case clamped.After(end):
+		clamped = end
+	}
+	elapsed := clamped.Sub(start)
+	total := end.Sub(start)
+	ratio := float64(elapsed) / float64(total)
+	return float64(left) + ratio*float64(right-left)
+}
+
+// chartPointTime returns the UTC timestamp to use for sample idx. When
+// times[idx] is missing or zero it falls back to times[0] plus idx minutes,
+// and when times[0] is unusable too, to the current time plus idx minutes.
+func chartPointTime(times []time.Time, idx int) time.Time {
+	offset := time.Duration(idx) * time.Minute
+	switch {
+	case idx >= 0 && idx < len(times) && !times[idx].IsZero():
+		return times[idx].UTC()
+	case len(times) > 0 && !times[0].IsZero():
+		return times[0].UTC().Add(offset)
+	default:
+		return time.Now().UTC().Add(offset)
+	}
+}
+
+// chartYForValue maps value from the scale range onto the plot's vertical
+// extent, with scale.Min at plotBottom and scale.Max at plotTop. The value is
+// not clamped. A scale whose Max is not above Min maps everything to the
+// vertical midpoint.
+func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
+	if scale.Max <= scale.Min {
+		return float64(plotTop+plotBottom) / 2
+	}
+	fraction := (value - scale.Min) / (scale.Max - scale.Min)
+	return float64(plotBottom) - fraction*float64(plotBottom-plotTop)
+}
+
+// chartSeriesBounds returns a padded (low, high) display range for values.
+//
+// An empty or all-zero series yields (0, 1). A constant non-zero series is
+// widened by 10% of its magnitude on each side. A range that lies entirely
+// above zero is then padded by 20% of its span on both sides, with the lower
+// bound never dropping below zero.
+func chartSeriesBounds(values []float64) (float64, float64) {
+	if len(values) == 0 {
+		return 0, 1
+	}
+	lo, hi := values[0], values[0]
+	for i := 1; i < len(values); i++ {
+		v := values[i]
+		if v < lo {
+			lo = v
+		}
+		if v > hi {
+			hi = v
+		}
+	}
+
+	if lo == hi {
+		if hi == 0 {
+			return 0, 1
+		}
+		margin := math.Abs(hi) * 0.1
+		if margin == 0 {
+			margin = 1
+		}
+		lo -= margin
+		hi += margin
+	}
+
+	if lo > 0 {
+		margin := (hi - lo) * 0.2
+		if margin == 0 {
+			margin = hi * 0.1
+		}
+		lo -= margin
+		if lo < 0 {
+			lo = 0
+		}
+		hi += margin
+	}
+	return lo, hi
+}
+
+// chartNiceTicks returns evenly spaced "nice" tick values (steps of 1, 2 or 5
+// times a power of ten) that cover [min, max], aiming for roughly target
+// intervals. The first tick is <= min and the last is >= max.
+//
+// The result always has at least two entries, because callers index the first
+// and last tick unconditionally. Degenerate input is normalised rather than
+// producing an empty slice: NaN or infinite bounds are replaced, a reversed
+// range is swapped, an empty range is widened by one, and a target below one
+// is treated as one. If no usable step can be computed (for example when the
+// bounds are so large that max-min rounds to zero) the result is {min, max}.
+func chartNiceTicks(min, max float64, target int) []float64 {
+	if math.IsNaN(min) || math.IsInf(min, 0) {
+		min = 0
+	}
+	if math.IsNaN(max) || math.IsInf(max, 0) {
+		max = min + 1
+	}
+	if max < min {
+		min, max = max, min
+	}
+	if min == max {
+		max = min + 1
+	}
+	if target < 1 {
+		target = 1
+	}
+	fallback := []float64{min, max}
+
+	span := max - min
+	step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
+	for _, factor := range []float64{1, 2, 5, 10} {
+		if span/(factor*step) <= float64(target)*1.5 {
+			step = factor * step
+			break
+		}
+	}
+	// Written as !(step > 0) so that a NaN step is rejected as well.
+	if !(step > 0) || math.IsInf(step, 0) {
+		return fallback
+	}
+
+	low := math.Floor(min/step) * step
+	high := math.Ceil(max/step) * step
+	// Number of intervals between low and high; the small epsilon absorbs
+	// floating-point error in the division.
+	intervals := math.Floor((high-low)/step + 0.001)
+	if math.IsNaN(intervals) || intervals < 1 || intervals > 1000 {
+		return fallback
+	}
+
+	// Compute each tick from its index instead of accumulating the step, so
+	// rounding error does not build up and the loop is bounded.
+	ticks := make([]float64, 0, int(intervals)+1)
+	for i := 0; i <= int(intervals); i++ {
+		value := low + float64(i)*step
+		ticks = append(ticks, math.Round(value*1e9)/1e9)
+	}
+	return ticks
+}
+
+// valueClamp limits value to the closed interval [scale.Min, scale.Max].
+func valueClamp(value float64, scale chartScale) float64 {
+	switch {
+	case value < scale.Min:
+		return scale.Min
+	case value > scale.Max:
+		return scale.Max
+	default:
+		return value
+	}
+}
+
+// chartStatsLabel formats the min / avg / max summary shown under the chart
+// title. It returns an empty string when none of the three statistics is
+// positive.
+func chartStatsLabel(datasets [][]float64) string {
+	lowest, mean, highest := globalStats(datasets)
+	anyPositive := highest > 0 || mean > 0 || lowest > 0
+	if !anyPositive {
+		return ""
+	}
+	minText := chartLegendNumber(lowest)
+	avgText := chartLegendNumber(mean)
+	maxText := chartLegendNumber(highest)
+	return fmt.Sprintf("min %s   avg %s   max %s", minText, avgText, maxText)
+}
+
+// gpuDisplayLabel returns "GPU <idx>", extended with the model name when one
+// is known for that index.
+func gpuDisplayLabel(idx int) string {
+	model := gpuModelNameByIndex(idx)
+	if model == "" {
+		return fmt.Sprintf("GPU %d", idx)
+	}
+	return fmt.Sprintf("GPU %d — %s", idx, model)
+}
+
+// gpuModelNameByIndex looks up the trimmed model name of the GPU with the
+// given index in gpuLabelCache, reloading the cache first when it is empty or
+// more than 30 seconds old. Unknown indices yield an empty string.
+func gpuModelNameByIndex(idx int) string {
+	now := time.Now()
+
+	gpuLabelCache.mu.Lock()
+	defer gpuLabelCache.mu.Unlock()
+
+	expired := now.Sub(gpuLabelCache.loadedAt) > 30*time.Second
+	if expired || gpuLabelCache.byIndex == nil {
+		gpuLabelCache.loadedAt = now
+		gpuLabelCache.byIndex = loadGPUModelNames()
+	}
+	return strings.TrimSpace(gpuLabelCache.byIndex[idx])
+}
+
+// loadGPUModelNames lists the NVIDIA GPUs through the platform layer and
+// returns their non-blank, trimmed names keyed by GPU index. A listing error
+// produces an empty map.
+func loadGPUModelNames() map[int]string {
+	names := map[int]string{}
+	gpus, err := platform.New().ListNvidiaGPUs()
+	if err != nil {
+		return names
+	}
+	for i := range gpus {
+		if trimmed := strings.TrimSpace(gpus[i].Name); trimmed != "" {
+			names[gpus[i].Index] = trimmed
+		}
+	}
+	return names
+}
--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -0,0 +1,144 @@
+package webui
+
+import (
+	"os"
+	"strings"
+	"sync"
+	"time"
+)
+
+// jobState holds the output lines and completion status of an async job.
+// All fields are guarded by mu.
+type jobState struct {
+	lines        []string      // every line appended so far, in order
+	done         bool          // set by finish
+	err          string        // message passed to finish; empty means no error was reported
+	mu           sync.Mutex
+	subs         []chan string // live subscribers; closed and cleared by finish
+	cancel       func() // optional cancel function; nil if job is not cancellable
+	logPath      string        // when non-empty, each appended line is also written to this file
+	serialPrefix string        // when non-empty, each appended line is also sent to taskSerialWriteLine with this prefix
+}
+
+// abort invokes the job's cancel function when the job is still running and
+// has one. It reports whether cancel was called.
+func (j *jobState) abort() bool {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+
+	cancellable := !j.done && j.cancel != nil
+	if cancellable {
+		j.cancel()
+	}
+	return cancellable
+}
+
+// append records one output line: it is stored in memory, written to the job
+// log file and the serial console when those are configured, and offered to
+// every subscriber. Delivery to subscribers never blocks; a subscriber whose
+// channel buffer is full misses the line.
+func (j *jobState) append(line string) {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+
+	j.lines = append(j.lines, line)
+	if j.logPath != "" {
+		appendJobLog(j.logPath, line)
+	}
+	if j.serialPrefix != "" {
+		taskSerialWriteLine(j.serialPrefix + line)
+	}
+	for i := range j.subs {
+		select {
+		case j.subs[i] <- line:
+		default:
+			// Subscriber is not keeping up; drop rather than stall the job.
+		}
+	}
+}
+
+// finish marks the job as done with the given error message (empty for
+// success) and closes every subscriber channel so streaming readers stop.
+func (j *jobState) finish(errMsg string) {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+
+	j.done, j.err = true, errMsg
+	subscribers := j.subs
+	j.subs = nil
+	for _, sub := range subscribers {
+		close(sub)
+	}
+}
+
+// subscribe returns a copy of the lines recorded so far together with a
+// channel (buffered to 256 lines) on which later lines are delivered. The
+// channel is closed when the job finishes. For a job that is already done the
+// channel is nil.
+func (j *jobState) subscribe() ([]string, <-chan string) {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+
+	backlog := make([]string, len(j.lines))
+	copy(backlog, j.lines)
+	if j.done {
+		return backlog, nil
+	}
+	stream := make(chan string, 256)
+	j.subs = append(j.subs, stream)
+	return backlog, stream
+}
+
+// jobManager manages async jobs identified by string IDs.
+// The jobs map is guarded by mu.
+type jobManager struct {
+	mu   sync.Mutex
+	jobs map[string]*jobState
+}
+
+// globalJobs is the package-wide job registry, created with an empty job map.
+var globalJobs = &jobManager{jobs: make(map[string]*jobState)}
+
+// create registers a fresh job under id, replacing any job already stored
+// there, and returns it. The entry is removed again after 30 minutes.
+//
+// The delayed cleanup removes the entry only if it still refers to the job
+// created by this call, so a newer job that reused the same id is not dropped
+// early by an older job's timer.
+func (m *jobManager) create(id string) *jobState {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	if m.jobs == nil {
+		// Keep a zero-value jobManager usable instead of panicking on write.
+		m.jobs = make(map[string]*jobState)
+	}
+	j := &jobState{}
+	m.jobs[id] = j
+	// Schedule cleanup after 30 minutes
+	goRecoverOnce("job cleanup", func() {
+		time.Sleep(30 * time.Minute)
+		m.mu.Lock()
+		defer m.mu.Unlock()
+		if m.jobs[id] == j {
+			delete(m.jobs, id)
+		}
+	})
+	return j
+}
+
+// isDone reports whether finish has been called on the job, regardless of
+// whether it ended with an error.
+func (j *jobState) isDone() bool {
+	j.mu.Lock()
+	finished := j.done
+	j.mu.Unlock()
+	return finished
+}
+
+// get looks up the job registered under id. The boolean is false when no such
+// job exists.
+func (m *jobManager) get(id string) (*jobState, bool) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	job, found := m.jobs[id]
+	return job, found
+}
+
+// newTaskJobState creates a job state that logs to logPath and, when a serial
+// prefix is supplied, mirrors lines to the serial console with that prefix
+// (only the first variadic value is used).
+//
+// If logPath names a readable, non-empty file its contents are loaded as the
+// job's initial lines: CRLF is normalised to LF and a single trailing empty
+// line is dropped. Read errors are ignored and leave the job without lines.
+func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
+	state := &jobState{logPath: logPath}
+	if len(serialPrefix) > 0 {
+		state.serialPrefix = serialPrefix[0]
+	}
+	if logPath == "" {
+		return state
+	}
+
+	raw, err := os.ReadFile(logPath)
+	if err != nil || len(raw) == 0 {
+		return state
+	}
+	normalized := strings.ReplaceAll(string(raw), "\r\n", "\n")
+	restored := strings.Split(normalized, "\n")
+	if last := len(restored) - 1; last >= 0 && restored[last] == "" {
+		restored = restored[:last]
+	}
+	state.lines = append(state.lines, restored...)
+	return state
+}
+
+// appendJobLog appends line plus a newline to the file at path, creating the
+// file (mode 0644) if needed. It is best-effort: an empty path is a no-op and
+// open or write failures are silently ignored.
+func appendJobLog(path, line string) {
+	if path == "" {
+		return
+	}
+	const flags = os.O_CREATE | os.O_APPEND | os.O_WRONLY
+	logFile, err := os.OpenFile(path, flags, 0644)
+	if err != nil {
+		return
+	}
+	defer logFile.Close()
+	// Logging must never fail the job, so the write error is dropped.
+	_, _ = logFile.WriteString(line + "\n")
+}
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -0,0 +1,242 @@
+package webui
+
+import (
+	"bufio"
+	"io"
+	"log/slog"
+	"os"
+	"strings"
+	"sync"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+)
+
+// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
+// It supports multiple concurrent SAT tasks: a shared event window is open
+// while any SAT task is running, and flushed when all tasks complete.
+//
+// mu guards activeCount and window. statusDB receives the flushed events as
+// "Warning" records; when it is nil, flushWindow does nothing.
+type kmsgWatcher struct {
+	mu          sync.Mutex
+	activeCount int // number of in-flight SAT tasks
+	window      *kmsgWindow
+	statusDB    *app.ComponentStatusDB
+}
+
+// kmsgWindow is the shared collection state for one run of overlapping SAT
+// tasks. seen holds the (id, category) pairs already recorded so recordEvent
+// can deduplicate; events holds the recorded events in arrival order.
+type kmsgWindow struct {
+	targets   []string // SAT targets running concurrently
+	startedAt time.Time
+	seen      map[kmsgEventKey]bool
+	events    []kmsgEvent
+}
+
+// kmsgEventKey is the deduplication key for events inside a kmsgWindow.
+// recordEvent uses an empty id for events that carry no identifier.
+type kmsgEventKey struct {
+	id       string // BDF or device name
+	category string
+}
+
+// kmsgEvent is one kernel log message that matched a hardware error pattern.
+// timestamp is the time the line was parsed (time.Now in parseKmsgLine), not
+// the kernel's own timestamp; raw is the message text after the kmsg header.
+type kmsgEvent struct {
+	timestamp time.Time
+	raw       string
+	ids       []string // BDF addresses or device names extracted
+	category  string
+}
+
+// newKmsgWatcher returns an idle watcher (no active tasks, no open window)
+// that will report flushed events to statusDB.
+func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
+	watcher := new(kmsgWatcher)
+	watcher.statusDB = statusDB
+	return watcher
+}
+
+// start launches the background kmsg reading goroutine. The reader is
+// supervised by goRecoverLoop, which waits restartDelay before re-running it
+// after a recovered panic.
+func (w *kmsgWatcher) start() {
+	const restartDelay = 5 * time.Second
+	goRecoverLoop("kmsg watcher", restartDelay, w.run)
+}
+
+// run tails /dev/kmsg forever. Each line accepted by parseKmsgLine is recorded
+// into the open event window, if there is one; lines arriving while no window
+// is open are dropped. When the device cannot be opened the loop retries after
+// 30 seconds; when reading stops it reopens the device after 2 seconds.
+func (w *kmsgWatcher) run() {
+	const lineBufSize = 64 * 1024
+	for {
+		dev, openErr := os.Open("/dev/kmsg")
+		if openErr != nil {
+			slog.Warn("kmsg watcher unavailable", "err", openErr)
+			time.Sleep(30 * time.Second)
+			continue
+		}
+		// Best-effort seek to end so only events from now forward are captured.
+		_, _ = dev.Seek(0, io.SeekEnd)
+
+		lines := bufio.NewScanner(dev)
+		lines.Buffer(make([]byte, lineBufSize), lineBufSize)
+		for lines.Scan() {
+			evt, matched := parseKmsgLine(lines.Text())
+			if matched {
+				w.mu.Lock()
+				if w.window != nil {
+					w.recordEvent(evt)
+				}
+				w.mu.Unlock()
+			}
+		}
+		if scanErr := lines.Err(); scanErr != nil {
+			slog.Warn("kmsg watcher stopped", "err", scanErr)
+		}
+		_ = dev.Close()
+		time.Sleep(2 * time.Second)
+	}
+}
+
+// recordEvent appends evt to the active window, deduplicating by (id, category).
+// An event without ids is keyed by the empty id. The event is stored at most
+// once, and only if at least one of its keys has not been seen in this window.
+// Must be called with w.mu held and w.window non-nil.
+func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
+	ids := evt.ids
+	if len(ids) == 0 {
+		ids = []string{""}
+	}
+	fresh := false
+	for _, id := range ids {
+		key := kmsgEventKey{id: id, category: evt.category}
+		if !w.window.seen[key] {
+			w.window.seen[key] = true
+			fresh = true
+		}
+	}
+	// Append once per event rather than once per new id: flushWindow already
+	// walks every id of each stored event, so duplicates add nothing.
+	if fresh {
+		w.window.events = append(w.window.events, evt)
+	}
+}
+
+// NotifyTaskStarted registers one more in-flight SAT task. The first task to
+// start opens a new shared event window; every task adds its target to the
+// window's target list. taskID is currently unused.
+func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	firstTask := w.activeCount == 0
+	w.activeCount++
+	if firstTask {
+		w.window = &kmsgWindow{
+			startedAt: time.Now(),
+			seen:      map[kmsgEventKey]bool{},
+		}
+	}
+	if w.window == nil {
+		return
+	}
+	w.window.targets = append(w.window.targets, target)
+}
+
+// NotifyTaskFinished unregisters one in-flight SAT task. When the last task
+// finishes, the shared window is detached and, if it collected any events,
+// flushed to the status DB on a separate goroutine. The counter is clamped at
+// zero so unbalanced calls cannot drive it negative. taskID is currently unused.
+func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
+	// Detach the window under the lock; flushing happens outside of it.
+	closed := func() *kmsgWindow {
+		w.mu.Lock()
+		defer w.mu.Unlock()
+		w.activeCount--
+		if w.activeCount > 0 {
+			return nil
+		}
+		w.activeCount = 0
+		win := w.window
+		w.window = nil
+		return win
+	}()
+
+	if closed != nil && len(closed.events) > 0 {
+		goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(closed) })
+	}
+}
+
+// flushWindow writes one "Warning" record per affected component to the status
+// DB. Events without ids map to "memory:all" (category "memory") or "cpu:all";
+// events with ids map to "storage:<id>" (category "storage") or
+// "pcie:<normalized BDF>" (every other category). For each component only the
+// first raw message is kept, truncated to 120 bytes. No-op without a status DB.
+func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
+	if w.statusDB == nil {
+		return
+	}
+	source := "watchdog:kmsg"
+
+	// componentKey → first raw line seen for that component.
+	firstRaw := make(map[string]string)
+	remember := func(key, raw string) {
+		if _, dup := firstRaw[key]; !dup {
+			firstRaw[key] = raw
+		}
+	}
+
+	for _, evt := range window.events {
+		if len(evt.ids) == 0 {
+			// MCE or un-identified error.
+			if evt.category == "memory" {
+				remember("memory:all", evt.raw)
+			} else {
+				remember("cpu:all", evt.raw)
+			}
+			continue
+		}
+		for _, id := range evt.ids {
+			if evt.category == "storage" {
+				remember("storage:"+id, evt.raw)
+			} else {
+				remember("pcie:"+normalizeBDF(id), evt.raw)
+			}
+		}
+	}
+
+	prefix := "kernel error during SAT (" + strings.Join(window.targets, ",") + "): "
+	for key, raw := range firstRaw {
+		w.statusDB.Record(key, source, "Warning", prefix+truncate(raw, 120))
+	}
+}
+
+// parseKmsgLine parses a single /dev/kmsg line and returns an event for the
+// first pattern in platform.HardwareErrorPatterns that matches its message.
+// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
+// Everything up to the first ';' is discarded and the remainder trimmed; a line
+// without ';' is matched as-is. The boolean is false for an empty message or
+// when no pattern matches.
+func parseKmsgLine(raw string) (kmsgEvent, bool) {
+	msg := raw
+	if _, body, hasHeader := strings.Cut(raw, ";"); hasHeader {
+		msg = strings.TrimSpace(body)
+	}
+	if msg == "" {
+		return kmsgEvent{}, false
+	}
+
+	for _, pat := range platform.HardwareErrorPatterns {
+		groups := pat.Re.FindStringSubmatch(msg)
+		if groups == nil {
+			continue
+		}
+		// A group index of 0 (or one beyond the match) means "not captured".
+		var ids []string
+		if g := pat.BDFGroup; g > 0 && g < len(groups) {
+			ids = append(ids, normalizeBDF(groups[g]))
+		}
+		if g := pat.DevGroup; g > 0 && g < len(groups) {
+			ids = append(ids, groups[g])
+		}
+		return kmsgEvent{
+			timestamp: time.Now(),
+			raw:       msg,
+			ids:       ids,
+			category:  pat.Category,
+		}, true
+	}
+	return kmsgEvent{}, false
+}
+
+// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0":
+// surrounding whitespace is removed, letters are lowercased, and an address
+// with exactly one ':' (no domain) gets the "0000:" domain prepended.
+func normalizeBDF(bdf string) string {
+	canon := strings.TrimSpace(bdf)
+	canon = strings.ToLower(canon)
+	if strings.Count(canon, ":") != 1 {
+		return canon
+	}
+	return "0000:" + canon
+}
+
+// truncate shortens s to at most max bytes and appends "..." when anything was
+// cut off; strings of max bytes or fewer are returned unchanged. The cut is
+// moved back to the nearest UTF-8 rune boundary so a multi-byte character is
+// never split, and a negative max is treated as zero.
+func truncate(s string, max int) string {
+	if max < 0 {
+		max = 0
+	}
+	if len(s) <= max {
+		return s
+	}
+	cut := max
+	// 10xxxxxx marks a UTF-8 continuation byte: cutting in front of one would
+	// leave a partial rune behind, so step back to the rune's lead byte.
+	for cut > 0 && s[cut]&0xC0 == 0x80 {
+		cut--
+	}
+	return s[:cut] + "..."
+}
+
+// isSATTarget reports whether target names a hardware acceptance test task.
+func isSATTarget(target string) bool {
+	switch target {
+	// NVIDIA
+	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
+		"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
+		"nvidia-bandwidth", "nvidia-stress":
+		return true
+	// AMD
+	case "amd", "amd-mem", "amd-bandwidth", "amd-stress":
+		return true
+	// Memory, storage, CPU and whole-platform runs
+	case "memory", "memory-stress", "storage", "cpu", "sat-stress", "platform-stress":
+		return true
+	default:
+		return false
+	}
+}
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -0,0 +1,393 @@
+package webui
+
+import (
+	"database/sql"
+	"encoding/csv"
+	"io"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+
+	"bee/audit/internal/platform"
+	_ "modernc.org/sqlite"
+)
+
+// metricsDBPath is the default on-disk location of the metrics SQLite database.
+const metricsDBPath = "/appdata/bee/metrics.db"
+
+// MetricsDB persists live metric samples to SQLite.
+// Samples are stored normalized across the sys_metrics, gpu_metrics,
+// fan_metrics and temp_metrics tables, keyed by Unix-second timestamp.
+type MetricsDB struct {
+	db *sql.DB
+}
+
+// Close releases the underlying database handle. It is safe to call on a nil
+// receiver or on a MetricsDB without an open handle; both return nil.
+func (m *MetricsDB) Close() error {
+	if m != nil && m.db != nil {
+		return m.db.Close()
+	}
+	return nil
+}
+
+// openMetricsDB opens (or creates) the metrics database at the given path,
+// creating the parent directory first and making sure the schema is current.
+// The handle is limited to a single connection.
+func openMetricsDB(path string) (*MetricsDB, error) {
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return nil, err
+	}
+	// The modernc.org/sqlite driver imported by this file takes connection
+	// PRAGMAs through the "_pragma" DSN parameter. "_journal" and
+	// "_busy_timeout" are mattn/go-sqlite3 option names, so with this driver
+	// WAL mode and the busy timeout would not be applied.
+	db, err := sql.Open("sqlite", path+"?_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)")
+	if err != nil {
+		return nil, err
+	}
+	db.SetMaxOpenConns(1)
+	if err := initMetricsSchema(db); err != nil {
+		_ = db.Close()
+		return nil, err
+	}
+	return &MetricsDB{db: db}, nil
+}
+
+// initMetricsSchema creates the four metrics tables when they are missing and
+// then makes sure gpu_metrics has the clock_mhz and mem_clock_mhz columns, so
+// databases created before those columns existed are upgraded in place.
+func initMetricsSchema(db *sql.DB) error {
+	const schema = `
+CREATE TABLE IF NOT EXISTS sys_metrics (
+  ts           INTEGER NOT NULL,
+  cpu_load_pct REAL,
+  mem_load_pct REAL,
+  power_w      REAL,
+  PRIMARY KEY (ts)
+);
+CREATE TABLE IF NOT EXISTS gpu_metrics (
+  ts            INTEGER NOT NULL,
+  gpu_index     INTEGER NOT NULL,
+  temp_c        REAL,
+  usage_pct     REAL,
+  mem_usage_pct REAL,
+  power_w       REAL,
+  clock_mhz     REAL,
+  mem_clock_mhz REAL,
+  PRIMARY KEY (ts, gpu_index)
+);
+CREATE TABLE IF NOT EXISTS fan_metrics (
+  ts   INTEGER NOT NULL,
+  name TEXT NOT NULL,
+  rpm  REAL,
+  PRIMARY KEY (ts, name)
+);
+CREATE TABLE IF NOT EXISTS temp_metrics (
+  ts      INTEGER NOT NULL,
+  name    TEXT NOT NULL,
+  grp     TEXT NOT NULL,
+  celsius REAL,
+  PRIMARY KEY (ts, name)
+);
+`
+	if _, err := db.Exec(schema); err != nil {
+		return err
+	}
+	// Columns added after the first release; legacy tables lack them.
+	for _, column := range []string{"clock_mhz", "mem_clock_mhz"} {
+		if err := ensureMetricsColumn(db, "gpu_metrics", column, "REAL"); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// ensureMetricsColumn adds column (declared as definition) to table unless a
+// column with that name, compared case-insensitively, already exists. table,
+// column and definition are concatenated into the SQL text unquoted, so they
+// must be trusted identifiers.
+func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
+	info, err := db.Query("PRAGMA table_info(" + table + ")")
+	if err != nil {
+		return err
+	}
+	defer info.Close()
+
+	// Scan targets for the six columns each table_info row is read into; only
+	// the second one (name) is inspected.
+	var (
+		cid, notNull, pk int
+		name, ctype      string
+		dflt             sql.NullString
+	)
+	for info.Next() {
+		if scanErr := info.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); scanErr != nil {
+			return scanErr
+		}
+		if strings.EqualFold(name, column) {
+			return nil
+		}
+	}
+	if iterErr := info.Err(); iterErr != nil {
+		return iterErr
+	}
+	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
+	return err
+}
+
+// Write stores one sample: a sys_metrics row plus one row per GPU, fan and
+// temperature reading, all stamped with the sample's Unix-second timestamp.
+// Everything is written in a single transaction; rows with the same primary
+// key are replaced. Any failure rolls the whole sample back.
+func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
+	stamp := s.Timestamp.Unix()
+	tx, err := m.db.Begin()
+	if err != nil {
+		return err
+	}
+	// Rollback after a successful Commit is a harmless no-op.
+	defer func() { _ = tx.Rollback() }()
+
+	put := func(query string, args ...any) error {
+		_, execErr := tx.Exec(query, args...)
+		return execErr
+	}
+
+	if err := put(
+		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
+		stamp, s.CPULoadPct, s.MemLoadPct, s.PowerW,
+	); err != nil {
+		return err
+	}
+	for _, gpu := range s.GPUs {
+		if err := put(
+			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
+			stamp, gpu.GPUIndex, gpu.TempC, gpu.UsagePct, gpu.MemUsagePct, gpu.PowerW, gpu.ClockMHz, gpu.MemClockMHz,
+		); err != nil {
+			return err
+		}
+	}
+	for _, fan := range s.Fans {
+		if err := put(
+			`INSERT OR REPLACE INTO fan_metrics(ts,name,rpm) VALUES(?,?,?)`,
+			stamp, fan.Name, fan.RPM,
+		); err != nil {
+			return err
+		}
+	}
+	for _, temp := range s.Temps {
+		if err := put(
+			`INSERT OR REPLACE INTO temp_metrics(ts,name,grp,celsius) VALUES(?,?,?,?)`,
+			stamp, temp.Name, temp.Group, temp.Celsius,
+		); err != nil {
+			return err
+		}
+	}
+	return tx.Commit()
+}
+
+// LoadRecent returns up to n samples in chronological order (oldest first).
+// It returns no samples when n is not positive, or when the receiver is nil or
+// has no open database.
+func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
+	// SQLite treats a negative LIMIT as "no limit", which would contradict the
+	// "up to n" contract, so non-positive n is answered here.
+	if m == nil || m.db == nil || n <= 0 {
+		return nil, nil
+	}
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
+}
+
+// LoadAll returns all persisted samples in chronological order (oldest first).
+// It returns no samples when the receiver is nil or has no open database.
+func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
+	if m == nil || m.db == nil {
+		return nil, nil
+	}
+	// The query has no placeholders, so no bind arguments are passed. A literal
+	// nil here would be sent as one NULL argument, not as "no arguments".
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`)
+}
+
+// LoadBetween returns the samples whose timestamp lies in [start, end]
+// (inclusive, second resolution) in chronological order. Reversed bounds are
+// swapped. A nil receiver or a zero start/end yields no samples and no error.
+func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
+	if m == nil || start.IsZero() || end.IsZero() {
+		return nil, nil
+	}
+	from, to := start, end
+	if from.After(to) {
+		from, to = to, from
+	}
+	const query = `SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`
+	return m.loadSamples(query, from.Unix(), to.Unix())
+}
+
+// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
+//
+// query must select (ts, cpu_load_pct, mem_load_pct, power_w) from sys_metrics
+// in ascending ts order, with args as its bind parameters. The first and last
+// returned rows define the timestamp range used to fetch the matching GPU, fan
+// and temperature rows. The result has one sample per sys_metrics row; nil is
+// returned when the query yields no rows.
+//
+// A failure of the sys_metrics query or of its iteration is returned as an
+// error. Rows that fail to scan are skipped, and failures of the GPU, fan and
+// temperature queries are tolerated: the samples just come back without that
+// data.
+func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
+	rows, err := m.db.Query(query, args...)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	type sysRow struct {
+		ts            int64
+		cpu, mem, pwr float64
+	}
+	var sysRows []sysRow
+	for rows.Next() {
+		var r sysRow
+		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
+			continue
+		}
+		sysRows = append(sysRows, r)
+	}
+	// Next returning false can mean "error", not just "no more rows". Report
+	// it rather than handing back a silently truncated history.
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	if len(sysRows) == 0 {
+		return nil, nil
+	}
+	// Collect min/max ts for range query (rows are in ascending ts order).
+	minTS := sysRows[0].ts
+	maxTS := sysRows[len(sysRows)-1].ts
+
+	// Load GPU rows in range
+	type gpuKey struct {
+		ts  int64
+		idx int
+	}
+	gpuData := map[gpuKey]platform.GPUMetricRow{}
+	gRows, err := m.db.Query(
+		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
+		minTS, maxTS,
+	)
+	if err == nil {
+		defer gRows.Close()
+		for gRows.Next() {
+			var ts int64
+			var g platform.GPUMetricRow
+			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
+				gpuData[gpuKey{ts, g.GPUIndex}] = g
+			}
+		}
+	}
+
+	// Load fan rows in range
+	type fanKey struct {
+		ts   int64
+		name string
+	}
+	fanData := map[fanKey]float64{}
+	fRows, err := m.db.Query(
+		`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
+	)
+	if err == nil {
+		defer fRows.Close()
+		for fRows.Next() {
+			var ts int64
+			var name string
+			var rpm float64
+			if err := fRows.Scan(&ts, &name, &rpm); err == nil {
+				fanData[fanKey{ts, name}] = rpm
+			}
+		}
+	}
+
+	// Load temp rows in range
+	type tempKey struct {
+		ts   int64
+		name string
+	}
+	tempData := map[tempKey]platform.TempReading{}
+	tRows, err := m.db.Query(
+		`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
+	)
+	if err == nil {
+		defer tRows.Close()
+		for tRows.Next() {
+			var ts int64
+			var t platform.TempReading
+			if err := tRows.Scan(&ts, &t.Name, &t.Group, &t.Celsius); err == nil {
+				tempData[tempKey{ts, t.Name}] = t
+			}
+		}
+	}
+
+	// Collect unique GPU indices and fan/temp names from loaded data.
+	// Sort each list so that sample reconstruction is deterministic regardless
+	// of Go's non-deterministic map iteration order.
+	seenGPU := map[int]bool{}
+	var gpuIndices []int
+	for k := range gpuData {
+		if !seenGPU[k.idx] {
+			seenGPU[k.idx] = true
+			gpuIndices = append(gpuIndices, k.idx)
+		}
+	}
+	sort.Ints(gpuIndices)
+
+	seenFan := map[string]bool{}
+	var fanNames []string
+	for k := range fanData {
+		if !seenFan[k.name] {
+			seenFan[k.name] = true
+			fanNames = append(fanNames, k.name)
+		}
+	}
+	sort.Strings(fanNames)
+
+	seenTemp := map[string]bool{}
+	var tempNames []string
+	for k := range tempData {
+		if !seenTemp[k.name] {
+			seenTemp[k.name] = true
+			tempNames = append(tempNames, k.name)
+		}
+	}
+	sort.Strings(tempNames)
+
+	// Stitch the per-table rows back together, one sample per sys_metrics row.
+	samples := make([]platform.LiveMetricSample, len(sysRows))
+	for i, r := range sysRows {
+		s := platform.LiveMetricSample{
+			Timestamp:  time.Unix(r.ts, 0).UTC(),
+			CPULoadPct: r.cpu,
+			MemLoadPct: r.mem,
+			PowerW:     r.pwr,
+		}
+		for _, idx := range gpuIndices {
+			if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
+				s.GPUs = append(s.GPUs, g)
+			}
+		}
+		for _, name := range fanNames {
+			if rpm, ok := fanData[fanKey{r.ts, name}]; ok {
+				s.Fans = append(s.Fans, platform.FanReading{Name: name, RPM: rpm})
+			}
+		}
+		for _, name := range tempNames {
+			if t, ok := tempData[tempKey{r.ts, name}]; ok {
+				s.Temps = append(s.Temps, t)
+			}
+		}
+		samples[i] = s
+	}
+	return samples, nil
+}
+
+// ExportCSV writes all sys+gpu data as CSV to w.
+//
+// The output starts with a header row, followed by one row per
+// (sample, GPU) pair ordered by timestamp and GPU index. A sample without any
+// GPU rows is written once with the seven GPU columns left empty. Rows that
+// fail to scan are skipped. An error is returned when the query, the row
+// iteration or the CSV writer fails.
+func (m *MetricsDB) ExportCSV(w io.Writer) error {
+	rows, err := m.db.Query(`
+		SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
+		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
+		       g.clock_mhz, g.mem_clock_mhz
+		FROM sys_metrics s
+		LEFT JOIN gpu_metrics g ON g.ts = s.ts
+		ORDER BY s.ts, g.gpu_index
+	`)
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	cw := csv.NewWriter(w)
+	// Individual write errors are sticky inside the csv.Writer and are
+	// reported once through cw.Error() at the end.
+	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
+	for rows.Next() {
+		var ts int64
+		var cpu, mem, pwr float64
+		// GPU columns come from a LEFT JOIN and are NULL for GPU-less samples.
+		var gpuIdx sql.NullInt64
+		var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
+		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
+			continue
+		}
+		row := []string{
+			strconv.FormatInt(ts, 10),
+			strconv.FormatFloat(cpu, 'f', 2, 64),
+			strconv.FormatFloat(mem, 'f', 2, 64),
+			strconv.FormatFloat(pwr, 'f', 1, 64),
+		}
+		if gpuIdx.Valid {
+			row = append(row,
+				strconv.FormatInt(gpuIdx.Int64, 10),
+				strconv.FormatFloat(gpuTemp.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
+			)
+		} else {
+			row = append(row, "", "", "", "", "", "", "")
+		}
+		_ = cw.Write(row)
+	}
+	cw.Flush()
+	// A failed iteration must not look like a complete export.
+	if err := rows.Err(); err != nil {
+		return err
+	}
+	return cw.Error()
+}
+
+// nullFloat wraps v in a sql.NullFloat64 that is marked valid (non-NULL).
+func nullFloat(v float64) sql.NullFloat64 {
+	var wrapped sql.NullFloat64
+	wrapped.Float64, wrapped.Valid = v, true
+	return wrapped
+}
--- a/audit/internal/webui/metricsdb_test.go
+++ b/audit/internal/webui/metricsdb_test.go
@@ -0,0 +1,174 @@
+package webui
+
+import (
+	"database/sql"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"bee/audit/internal/platform"
+	_ "modernc.org/sqlite"
+)
+
+// TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs writes three samples
+// with two GPUs each (indices 0 and 2) and checks that both LoadAll and
+// LoadRecent return every sample with its GPU rows attached, ordered by index,
+// and that LoadRecent yields ascending timestamps.
+func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
+	db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	defer db.Close()
+
+	base := time.Unix(1_700_000_000, 0).UTC()
+	for i := 0; i < 3; i++ {
+		sample := platform.LiveMetricSample{
+			Timestamp:  base.Add(time.Duration(i) * time.Second),
+			CPULoadPct: float64(10 + i),
+			MemLoadPct: float64(20 + i),
+			PowerW:     float64(300 + i),
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, PowerW: float64(100 + i)},
+				{GPUIndex: 2, PowerW: float64(200 + i)},
+			},
+		}
+		if err := db.Write(sample); err != nil {
+			t.Fatalf("Write(%d): %v", i, err)
+		}
+	}
+
+	all, err := db.LoadAll()
+	if err != nil {
+		t.Fatalf("LoadAll: %v", err)
+	}
+	if len(all) != 3 {
+		t.Fatalf("LoadAll len=%d want 3", len(all))
+	}
+	for i, sample := range all {
+		if len(sample.GPUs) != 2 {
+			t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
+		}
+		first, second := sample.GPUs[0], sample.GPUs[1]
+		if first.GPUIndex != 0 || first.PowerW != float64(100+i) {
+			t.Fatalf("LoadAll sample %d GPU0=%+v", i, first)
+		}
+		if second.GPUIndex != 2 || second.PowerW != float64(200+i) {
+			t.Fatalf("LoadAll sample %d GPU1=%+v", i, second)
+		}
+	}
+
+	recent, err := db.LoadRecent(2)
+	if err != nil {
+		t.Fatalf("LoadRecent: %v", err)
+	}
+	if len(recent) != 2 {
+		t.Fatalf("LoadRecent len=%d want 2", len(recent))
+	}
+	if older, newer := recent[0].Timestamp, recent[1].Timestamp; !older.Before(newer) {
+		t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", older, newer)
+	}
+	for i, sample := range recent {
+		if len(sample.GPUs) != 2 {
+			t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
+		}
+	}
+}
+
+// TestMetricsDBMigratesLegacyGPUSchema creates a database whose gpu_metrics
+// table predates the clock columns, reopens it through openMetricsDB and checks
+// that clock values can then be written and read back.
+func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "metrics.db")
+	legacy, err := sql.Open("sqlite", path)
+	if err != nil {
+		t.Fatalf("sql.Open: %v", err)
+	}
+	const legacySchema = `
+CREATE TABLE gpu_metrics (
+  ts            INTEGER NOT NULL,
+  gpu_index     INTEGER NOT NULL,
+  temp_c        REAL,
+  usage_pct     REAL,
+  mem_usage_pct REAL,
+  power_w       REAL,
+  PRIMARY KEY (ts, gpu_index)
+);
+CREATE TABLE sys_metrics (
+  ts           INTEGER NOT NULL,
+  cpu_load_pct REAL,
+  mem_load_pct REAL,
+  power_w      REAL,
+  PRIMARY KEY (ts)
+);
+CREATE TABLE fan_metrics (
+  ts   INTEGER NOT NULL,
+  name TEXT NOT NULL,
+  rpm  REAL,
+  PRIMARY KEY (ts, name)
+);
+CREATE TABLE temp_metrics (
+  ts      INTEGER NOT NULL,
+  name    TEXT NOT NULL,
+  grp     TEXT NOT NULL,
+  celsius REAL,
+  PRIMARY KEY (ts, name)
+);
+`
+	if _, err = legacy.Exec(legacySchema); err != nil {
+		t.Fatalf("create legacy schema: %v", err)
+	}
+	_ = legacy.Close()
+
+	db, err := openMetricsDB(path)
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	defer db.Close()
+
+	sample := platform.LiveMetricSample{
+		Timestamp: time.Unix(1_700_000_100, 0).UTC(),
+		GPUs: []platform.GPUMetricRow{
+			{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
+		},
+	}
+	if err := db.Write(sample); err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+
+	samples, err := db.LoadAll()
+	if err != nil {
+		t.Fatalf("LoadAll: %v", err)
+	}
+	if len(samples) != 1 || len(samples[0].GPUs) != 1 {
+		t.Fatalf("samples=%+v", samples)
+	}
+	gpu := samples[0].GPUs[0]
+	if gpu.ClockMHz != 1410 {
+		t.Fatalf("ClockMHz=%v want 1410", gpu.ClockMHz)
+	}
+	if gpu.MemClockMHz != 2600 {
+		t.Fatalf("MemClockMHz=%v want 2600", gpu.MemClockMHz)
+	}
+}
+
+// TestMetricsDBLoadBetweenFiltersWindow writes five samples one minute apart
+// and checks that LoadBetween over minutes 1..3 returns exactly those three,
+// with both bounds included.
+func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
+	db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	defer db.Close()
+
+	base := time.Unix(1_700_000_000, 0).UTC()
+	for i := 0; i < 5; i++ {
+		sample := platform.LiveMetricSample{
+			Timestamp:  base.Add(time.Duration(i) * time.Minute),
+			CPULoadPct: float64(i),
+		}
+		if err := db.Write(sample); err != nil {
+			t.Fatalf("Write(%d): %v", i, err)
+		}
+	}
+
+	from, to := base.Add(1*time.Minute), base.Add(3*time.Minute)
+	got, err := db.LoadBetween(from, to)
+	if err != nil {
+		t.Fatalf("LoadBetween: %v", err)
+	}
+	if len(got) != 3 {
+		t.Fatalf("LoadBetween len=%d want 3", len(got))
+	}
+	if !got[0].Timestamp.Equal(from) || !got[2].Timestamp.Equal(to) {
+		t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
+	}
+}
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
--- a/audit/internal/webui/serial_console.go
+++ b/audit/internal/webui/serial_console.go
@@ -0,0 +1,41 @@
+package webui
+
+import (
+	"fmt"
+	"os"
+	"strings"
+	"time"
+)
+
+// taskSerialWriteLine is the function taskSerialEvent calls to emit a line.
+// It defaults to writeTaskSerialLine and, being a variable, can be reassigned.
+var taskSerialWriteLine = writeTaskSerialLine
+
+// writeTaskSerialLine writes line, prefixed with a UTC timestamp, to the first
+// of /dev/ttyS0, /dev/ttyS1 and /dev/console that can be opened for writing.
+// Blank lines are skipped. The write is best-effort: if no device opens, or
+// the write itself fails, the line is silently dropped.
+func writeTaskSerialLine(line string) {
+	text := strings.TrimSpace(line)
+	if text == "" {
+		return
+	}
+	stamp := time.Now().UTC().Format("2006-01-02 15:04:05Z")
+	payload := fmt.Sprintf("%s %s\n", stamp, text)
+
+	candidates := [...]string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"}
+	for i := range candidates {
+		port, err := os.OpenFile(candidates[i], os.O_WRONLY|os.O_APPEND, 0)
+		if err == nil {
+			_, _ = port.WriteString(payload)
+			_ = port.Close()
+			return
+		}
+	}
+}
+
+// taskSerialPrefix returns the tag put in front of a task's serial lines:
+// "[task <ID> <Name>] ", or the generic "[task] " for a nil task.
+func taskSerialPrefix(t *Task) string {
+	if t != nil {
+		return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
+	}
+	return "[task] "
+}
+
+// taskSerialEvent emits event, trimmed and prefixed with the task's tag,
+// through taskSerialWriteLine. A nil task emits nothing.
+func taskSerialEvent(t *Task, event string) {
+	if t == nil {
+		return
+	}
+	message := taskSerialPrefix(t) + strings.TrimSpace(event)
+	taskSerialWriteLine(message)
+}
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
--- a/audit/internal/webui/stability.go
+++ b/audit/internal/webui/stability.go
@@ -0,0 +1,42 @@
+package webui
+
+import (
+	"fmt"
+	"log/slog"
+	"runtime/debug"
+	"time"
+)
+
+// goRecoverLoop runs fn on a new goroutine and re-runs it every time it
+// panics, waiting restartDelay (when positive) between attempts. The goroutine
+// ends the first time fn returns without panicking.
+func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
+	supervise := func() {
+		// runRecoverable reports true only when fn panicked.
+		for runRecoverable(name, fn) {
+			if restartDelay > 0 {
+				time.Sleep(restartDelay)
+			}
+		}
+	}
+	go supervise()
+}
+
+// goRecoverOnce runs fn once on a new goroutine; a panic inside fn is
+// recovered and logged rather than crashing the process.
+func goRecoverOnce(name string, fn func()) {
+	// The "panicked" result is of no interest here; a go statement drops it.
+	go runRecoverable(name, fn)
+}
+
+// runRecoverable calls fn and reports whether it panicked. A panic is
+// recovered and logged at error level with the component name, the panic value
+// and a stack trace.
+func runRecoverable(name string, fn func()) (panicked bool) {
+	defer func() {
+		rec := recover()
+		if rec == nil {
+			return
+		}
+		// Overrides the "false" set by the normal return path below.
+		panicked = true
+		slog.Error("recovered panic",
+			"component", name,
+			"panic", fmt.Sprint(rec),
+			"stack", string(debug.Stack()),
+		)
+	}()
+	fn()
+	return false
+}
--- a/audit/internal/webui/task_page.go
+++ b/audit/internal/webui/task_page.go
@@ -0,0 +1,267 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+// handleTaskPage serves the HTML detail page for the task named by the "id"
+// path value, or 404 when the queue has no such task. The page is rendered
+// from a copy of the task and sent with caching disabled.
+func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
+	task, found := globalQueue.findByID(r.PathValue("id"))
+	if !found {
+		http.NotFound(w, r)
+		return
+	}
+	page := renderTaskDetailPage(h.opts, *task)
+
+	hdr := w.Header()
+	hdr.Set("Cache-Control", "no-store")
+	hdr.Set("Content-Type", "text/html; charset=utf-8")
+	_, _ = w.Write([]byte(page))
+}
+
+// handleAPITaskChartsIndex responds with a JSON array of {title, file} entries,
+// one for each chart that renders successfully from the task's metric samples.
+// The array is empty, never null, when nothing renders. Unknown tasks get 404.
+func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
+	task, samples, _, _, found := h.taskSamplesForRequest(r)
+	if !found {
+		http.NotFound(w, r)
+		return
+	}
+	type taskChartIndexEntry struct {
+		Title string `json:"title"`
+		File  string `json:"file"`
+	}
+	// Non-nil so that "no charts" encodes as [] and not null.
+	entries := []taskChartIndexEntry{}
+	for _, spec := range taskChartSpecsForSamples(samples) {
+		// A chart is listed only if it actually renders for these samples.
+		if title, _, rendered := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task)); rendered {
+			entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
+		}
+	}
+	hdr := w.Header()
+	hdr.Set("Cache-Control", "no-store")
+	hdr.Set("Content-Type", "application/json; charset=utf-8")
+	_ = json.NewEncoder(w).Encode(entries)
+}
+
+// handleAPITaskChartSVG serves one task chart as SVG. The chart file name is
+// the part of the URL path after "/api/tasks/<id>/chart/". Unknown tasks and
+// unknown file names get 404; a chart that cannot be rendered from the
+// available samples gets 503.
+func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
+	task, samples, _, _, found := h.taskSamplesForRequest(r)
+	if !found {
+		http.NotFound(w, r)
+		return
+	}
+	chartFile := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
+	chartPath, known := taskChartPathFromFile(chartFile)
+	if !known {
+		http.NotFound(w, r)
+		return
+	}
+	title, svg, hasData := renderTaskChartSVG(chartPath, samples, taskTimelineForTask(task))
+	unusable := !hasData || len(svg) == 0 || strings.TrimSpace(title) == ""
+	if unusable {
+		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
+		return
+	}
+	w.Header().Set("Content-Type", "image/svg+xml")
+	w.Header().Set("Cache-Control", "no-store")
+	_, _ = w.Write(svg)
+}
+
+// renderTaskDetailPage builds the complete HTML document for one task.
+//
+// The page title is the task name, falling back to the task ID when the name
+// is blank. The body consists of:
+//   - a toolbar with a "Back to Tasks" link and, while the task is running or
+//     pending, a Cancel button;
+//   - the task's saved report fragment if loadTaskReportFragment finds one,
+//     otherwise a summary card with title, status badge and error message;
+//   - a "Live Charts" card while the task is running;
+//   - a "Live Logs" card plus the page script while the task is running or
+//     pending. The script streams log lines from /api/tasks/<id>/stream via
+//     EventSource and reloads the chart list and images every 2000 ms until
+//     the stream reports "done" or fails.
+//
+// NOTE(review): the report fragment is written into the page as-is (no
+// escaping), so it is trusted HTML. The task ID is HTML-escaped but not
+// JavaScript-escaped where it is placed inside script string literals —
+// confirm task IDs can never contain quote characters.
+func renderTaskDetailPage(opts HandlerOptions, task Task) string {
+	title := task.Name
+	if strings.TrimSpace(title) == "" {
+		title = task.ID
+	}
+	var body strings.Builder
+	// Toolbar: navigation, optional Cancel button, artifact hint.
+	body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
+	body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
+	if task.Status == TaskRunning || task.Status == TaskPending {
+		body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
+	}
+	body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
+	body.WriteString(`</div>`)
+
+	// Prefer the saved report; fall back to a minimal summary card.
+	if report := loadTaskReportFragment(task); report != "" {
+		body.WriteString(report)
+	} else {
+		body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
+		body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
+		body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
+		if strings.TrimSpace(task.ErrMsg) != "" {
+			body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
+		}
+		body.WriteString(`</div></div>`)
+	}
+
+	// Chart placeholder; the script below fills #task-live-charts.
+	if task.Status == TaskRunning {
+		body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
+		body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
+		body.WriteString(`</div></div>`)
+	}
+
+	// Log terminal and the script driving both the log stream and the charts.
+	if task.Status == TaskRunning || task.Status == TaskPending {
+		body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
+		body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
+		body.WriteString(`</div></div>`)
+		body.WriteString(`<script>
+function cancelTaskDetail(id) {
+  fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
+    var term = document.getElementById('task-live-log');
+    if (term) {
+      term.textContent += '\nCancel requested.\n';
+      term.scrollTop = term.scrollHeight;
+    }
+  });
+}
+function renderTaskLiveCharts(taskId, charts) {
+  const host = document.getElementById('task-live-charts');
+  if (!host) return;
+  if (!Array.isArray(charts) || charts.length === 0) {
+    host.innerHTML = 'Waiting for metric samples...';
+    return;
+  }
+  const seen = {};
+  charts.forEach(function(chart) {
+    seen[chart.file] = true;
+    let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
+    if (img) {
+      const card = img.closest('.card');
+      if (card) {
+        const title = card.querySelector('.card-head');
+        if (title) title.textContent = chart.title;
+      }
+      return;
+    }
+    const card = document.createElement('div');
+    card.className = 'card';
+    card.style.margin = '0';
+    card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
+    card.querySelector('.card-head').textContent = chart.title;
+    const body = card.querySelector('.card-body');
+    img = document.createElement('img');
+    img.setAttribute('data-task-chart', '1');
+    img.setAttribute('data-chart-file', chart.file);
+    img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
+    img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
+    img.style.width = '100%';
+    img.style.display = 'block';
+    img.style.borderRadius = '6px';
+    img.alt = chart.title;
+    body.appendChild(img);
+    host.appendChild(card);
+  });
+  Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
+    const file = img.getAttribute('data-chart-file') || '';
+    if (seen[file]) return;
+    const card = img.closest('.card');
+    if (card) card.remove();
+  });
+}
+function loadTaskLiveCharts(taskId) {
+  fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
+    renderTaskLiveCharts(taskId, charts);
+  }).catch(function(){
+    const host = document.getElementById('task-live-charts');
+    if (host) host.innerHTML = 'Task charts are unavailable.';
+  });
+}
+function refreshTaskLiveCharts() {
+  document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
+    const base = img.dataset.baseSrc;
+    if (!base) return;
+    img.src = base + '?t=' + Date.now();
+  });
+}
+var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
+var _taskDetailTerm = document.getElementById('task-live-log');
+var _taskChartTimer = null;
+var _taskChartsFrozen = false;
+_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
+_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
+_taskDetailES.addEventListener('done', function(e){
+  if (_taskChartTimer) clearInterval(_taskChartTimer);
+  _taskDetailES.close();
+  _taskDetailES = null;
+  _taskChartsFrozen = true;
+  _taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
+  _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
+  refreshTaskLiveCharts();
+});
+_taskDetailES.onerror = function(){
+  if (_taskChartTimer) clearInterval(_taskChartTimer);
+  if (_taskDetailES) {
+    _taskDetailES.close();
+    _taskDetailES = null;
+  }
+};
+loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
+_taskChartTimer = setInterval(function(){
+  if (_taskChartsFrozen) return;
+  loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
+  refreshTaskLiveCharts();
+}, 2000);
+</script>`)
+	}
+
+	// Wrap the body in the shared page chrome.
+	return layoutHead(opts.Title+" — "+title) +
+		layoutNav("tasks", opts.BuildLabel) +
+		`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
+		body.String() +
+		`</div></div></body></html>`
+}
+
+// loadTaskReportFragment returns the contents of the task's saved HTML report,
+// or "" when the task has no report path or the file is missing, unreadable
+// or empty.
+func loadTaskReportFragment(task Task) string {
+	reportPath := task.ReportHTMLPath
+	if strings.TrimSpace(reportPath) == "" {
+		return ""
+	}
+	fragment, err := os.ReadFile(reportPath)
+	if err == nil && len(fragment) > 0 {
+		return string(fragment)
+	}
+	return ""
+}
+
+// taskArtifactDownloadLink returns the /export/file download URL for the
+// artifact at absPath, or "" when absPath is blank. The path is query-escaped
+// so characters such as spaces, '&', '#' and '+' survive as part of the "path"
+// parameter. task is currently unused.
+func taskArtifactDownloadLink(task Task, absPath string) string {
+	if strings.TrimSpace(absPath) == "" {
+		return ""
+	}
+	return fmt.Sprintf(`/export/file?path=%s`, url.QueryEscape(absPath))
+}
+
+// taskSamplesForRequest resolves the task named by the request's "id" path
+// value and loads the metric samples recorded inside the task's time window.
+// It returns a copy of the task, the samples, the window bounds, and whether
+// the task exists. A failure to load samples is not fatal: the task is still
+// reported as found, with nil samples.
+func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
+	found, exists := globalQueue.findByID(r.PathValue("id"))
+	if !exists {
+		return Task{}, nil, time.Time{}, time.Time{}, false
+	}
+	task := *found
+	start, end := taskTimeWindow(&task)
+	samples, err := loadTaskMetricSamples(start, end)
+	if err != nil {
+		samples = nil
+	}
+	return task, samples, start, end, true
+}
+
+// taskTimelineForTask returns a one-segment timeline that marks the task's
+// whole time window as active.
+func taskTimelineForTask(task Task) []chartTimelineSegment {
+	var segment chartTimelineSegment
+	segment.Start, segment.End = taskTimeWindow(&task)
+	segment.Active = true
+	return []chartTimelineSegment{segment}
+}
+
+// taskChartPathFromFile maps a chart file name to its chart path. File names
+// listed in taskDashboardChartSpecs map to that spec's path; any other name of
+// the form "gpu-<id>-overview.svg" maps to "gpu/<id>-overview". The boolean is
+// false for every other name. <id> is not validated.
+func taskChartPathFromFile(file string) (string, bool) {
+	const (
+		gpuPrefix = "gpu-"
+		gpuSuffix = "-overview.svg"
+	)
+	name := strings.TrimSpace(file)
+	for i := range taskDashboardChartSpecs {
+		if taskDashboardChartSpecs[i].File == name {
+			return taskDashboardChartSpecs[i].Path, true
+		}
+	}
+	if !strings.HasPrefix(name, gpuPrefix) || !strings.HasSuffix(name, gpuSuffix) {
+		return "", false
+	}
+	id := strings.TrimSuffix(strings.TrimPrefix(name, gpuPrefix), gpuSuffix)
+	return "gpu/" + id + "-overview", true
+}
--- a/audit/internal/webui/task_report.go
+++ b/audit/internal/webui/task_report.go
@@ -0,0 +1,343 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+// taskReportMetricsDBPath is the metrics database that task reports load
+// their samples from. It is a variable so it can be redirected; the tests in
+// this package point it at a temporary database.
+var taskReportMetricsDBPath = metricsDBPath
+
+// taskReport is the JSON summary of one task run. writeTaskReportArtifacts
+// fills it from the Task, saves it to the task's ReportJSONPath and renders
+// it into the HTML report fragment.
+type taskReport struct {
+	ID          string            `json:"id"`
+	Name        string            `json:"name"`
+	Target      string            `json:"target"`
+	Status      string            `json:"status"`
+	CreatedAt   time.Time         `json:"created_at"`
+	StartedAt   *time.Time        `json:"started_at,omitempty"`
+	DoneAt      *time.Time        `json:"done_at,omitempty"`
+	// DurationSec is the elapsed run time in seconds as computed by
+	// taskElapsedSec when the report was generated.
+	DurationSec int               `json:"duration_sec,omitempty"`
+	Error       string            `json:"error,omitempty"`
+	// LogFile is the base name of the task log, not its full path.
+	LogFile     string            `json:"log_file,omitempty"`
+	// Charts lists the chart files that were successfully written.
+	Charts      []taskReportChart `json:"charts,omitempty"`
+	// GeneratedAt is the UTC time at which the report was built.
+	GeneratedAt time.Time         `json:"generated_at"`
+}
+
+// taskReportChart describes one chart saved with a task report: its display
+// title and the name of the SVG file inside the task's artifacts directory.
+type taskReportChart struct {
+	Title string `json:"title"`
+	File  string `json:"file"`
+}
+
+// taskChartSpec pairs a chart path, as passed to renderTaskChartSVG, with the
+// file name the rendered SVG is stored under.
+type taskChartSpec struct {
+	Path string
+	File string
+}
+
+// taskDashboardChartSpecs lists the charts attempted for every task report.
+// Per-GPU overview charts are not listed here; taskChartSpecsForSamples
+// appends them based on the GPU indices found in the samples.
+var taskDashboardChartSpecs = []taskChartSpec{
+	{Path: "server-load", File: "server-load.svg"},
+	{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
+	{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
+	{Path: "server-power", File: "server-power.svg"},
+	{Path: "server-fans", File: "server-fans.svg"},
+	{Path: "gpu-all-load", File: "gpu-all-load.svg"},
+	{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
+	{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
+	{Path: "gpu-all-power", File: "gpu-all-power.svg"},
+	{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
+}
+
+// taskChartSpecsForSamples returns the chart specs to render for samples: the
+// fixed dashboard charts followed by one "gpu/<idx>-overview" chart for every
+// GPU index that appears in the samples, in ascending index order.
+func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
+	// Collect the GPU indices once; the result both sizes the slice and
+	// drives the loop, so the samples are not scanned and sorted twice.
+	gpuIndices := taskGPUIndices(samples)
+	specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(gpuIndices))
+	specs = append(specs, taskDashboardChartSpecs...)
+	for _, idx := range gpuIndices {
+		specs = append(specs, taskChartSpec{
+			Path: fmt.Sprintf("gpu/%d-overview", idx),
+			File: fmt.Sprintf("gpu-%d-overview.svg", idx),
+		})
+	}
+	return specs
+}
+
+// writeTaskReportArtifacts writes the on-disk report for t into
+// t.ArtifactsDir: the per-chart SVG files, the JSON summary at
+// t.ReportJSONPath and the HTML fragment at t.ReportHTMLPath.
+//
+// It returns nil without doing anything when t is nil or when no artifacts
+// directory is set after ensureTaskReportPaths. Metric samples and the task
+// log are loaded on a best-effort basis; the report is still written without
+// them. Failures to create the directory or to write either report file are
+// returned wrapped with the step that failed.
+func writeTaskReportArtifacts(t *Task) error {
+	if t == nil {
+		return nil
+	}
+	ensureTaskReportPaths(t)
+	if strings.TrimSpace(t.ArtifactsDir) == "" {
+		return nil
+	}
+	if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
+		return fmt.Errorf("create task artifacts dir %q: %w", t.ArtifactsDir, err)
+	}
+
+	start, end := taskTimeWindow(t)
+	// Best effort: a metrics load failure must not block the report; it is
+	// rendered with whatever samples (possibly none) were returned.
+	samples, _ := loadTaskMetricSamples(start, end)
+	charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
+
+	// Best effort as well: an unreadable log leaves the Logs section empty.
+	logText := ""
+	if data, err := os.ReadFile(t.LogPath); err == nil {
+		logText = string(data)
+	}
+
+	report := taskReport{
+		ID:          t.ID,
+		Name:        t.Name,
+		Target:      t.Target,
+		Status:      t.Status,
+		CreatedAt:   t.CreatedAt,
+		StartedAt:   t.StartedAt,
+		DoneAt:      t.DoneAt,
+		DurationSec: taskElapsedSec(t, reportDoneTime(t)),
+		Error:       t.ErrMsg,
+		LogFile:     filepath.Base(t.LogPath),
+		Charts:      charts,
+		GeneratedAt: time.Now().UTC(),
+	}
+	if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
+		return fmt.Errorf("write task report json %q: %w", t.ReportJSONPath, err)
+	}
+	fragment := renderTaskReportFragment(report, inlineCharts, logText)
+	if err := os.WriteFile(t.ReportHTMLPath, []byte(fragment), 0644); err != nil {
+		return fmt.Errorf("write task report html %q: %w", t.ReportHTMLPath, err)
+	}
+	return nil
+}
+
+// reportDoneTime returns the task's completion time, falling back to the
+// current time when t is nil or has no non-zero DoneAt.
+func reportDoneTime(t *Task) time.Time {
+	if t == nil || t.DoneAt == nil || t.DoneAt.IsZero() {
+		return time.Now()
+	}
+	return *t.DoneAt
+}
+
+// taskTimeWindow returns the UTC [start, end] interval covered by t.
+//
+// The start is StartedAt when set and non-zero, otherwise CreatedAt. The end
+// is DoneAt when set and non-zero, otherwise the current time. An end that
+// precedes the start is clamped to the start. A nil task yields a zero-length
+// window at the current time.
+func taskTimeWindow(t *Task) (time.Time, time.Time) {
+	if t == nil {
+		instant := time.Now().UTC()
+		return instant, instant
+	}
+
+	from := t.CreatedAt.UTC()
+	if s := t.StartedAt; s != nil && !s.IsZero() {
+		from = s.UTC()
+	}
+
+	to := time.Now().UTC()
+	if d := t.DoneAt; d != nil && !d.IsZero() {
+		to = d.UTC()
+	}
+
+	if to.Before(from) {
+		return from, from
+	}
+	return from, to
+}
+
+// loadTaskMetricSamples opens the metrics database at taskReportMetricsDBPath
+// and returns what its LoadBetween call yields for start and end. The
+// database is closed before returning.
+func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
+	store, openErr := openMetricsDB(taskReportMetricsDBPath)
+	if openErr != nil {
+		return nil, openErr
+	}
+	defer store.Close()
+
+	samples, err := store.LoadBetween(start, end)
+	return samples, err
+}
+
+// writeTaskCharts renders every chart applicable to samples, saves each SVG
+// into dir, and returns the list of charts written together with a map from
+// file name to SVG markup for inlining into the HTML report.
+//
+// Charts that fail to render, render empty, or cannot be written are skipped.
+// With no samples both results are nil.
+func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
+	if len(samples) == 0 {
+		return nil, nil
+	}
+	var (
+		written  []taskReportChart
+		inlined  = map[string]string{}
+		timeline = []chartTimelineSegment{{Start: start, End: end, Active: true}}
+	)
+	for _, spec := range taskChartSpecsForSamples(samples) {
+		title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
+		if ok && len(svg) > 0 {
+			// Only charts that actually reached disk are listed and inlined.
+			if os.WriteFile(filepath.Join(dir, spec.File), svg, 0644) == nil {
+				written = append(written, taskReportChart{Title: title, File: spec.File})
+				inlined[spec.File] = string(svg)
+			}
+		}
+	}
+	return written, inlined
+}
+
+// renderTaskChartSVG renders the chart identified by path from samples and
+// returns its title, the SVG bytes, and whether rendering succeeded.
+//
+// GPU "overview" paths are drawn by renderGPUOverviewChartSVG; every other
+// path goes through chartDataFromSamples and renderMetricChartSVG. Any
+// failure, or a GPU overview without data, yields ("", nil, false).
+func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
+	gpuIdx, section, isGPU := parseGPUChartPath(path)
+	if isGPU && section == "overview" {
+		svg, hasData, err := renderGPUOverviewChartSVG(gpuIdx, samples, timeline)
+		if err == nil && hasData {
+			return gpuDisplayLabel(gpuIdx) + " Overview", svg, true
+		}
+		return "", nil, false
+	}
+
+	series, seriesNames, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	if !ok {
+		return "", nil, false
+	}
+	times := sampleTimes(samples)
+	height := chartCanvasHeightForPath(path, len(seriesNames))
+	svg, err := renderMetricChartSVG(title, labels, times, series, seriesNames, yMin, yMax, height, timeline)
+	if err != nil {
+		return "", nil, false
+	}
+	return title, svg, true
+}
+
+// taskGPUIndices returns the distinct GPU indices that appear anywhere in
+// samples, sorted ascending. The result is nil when no GPU is present.
+func taskGPUIndices(samples []platform.LiveMetricSample) []int {
+	var indices []int
+	known := make(map[int]struct{})
+	for i := range samples {
+		for _, gpu := range samples[i].GPUs {
+			if _, dup := known[gpu.GPUIndex]; !dup {
+				known[gpu.GPUIndex] = struct{}{}
+				indices = append(indices, gpu.GPUIndex)
+			}
+		}
+	}
+	sort.Ints(indices)
+	return indices
+}
+
+// writeJSONFile marshals v as JSON indented by two spaces and writes it to
+// path with mode 0644. A marshalling error is returned before any file is
+// touched.
+func writeJSONFile(path string, v any) error {
+	encoded, err := json.MarshalIndent(v, "", "  ")
+	if err == nil {
+		err = os.WriteFile(path, encoded, 0644)
+	}
+	return err
+}
+
+// renderTaskReportFragment builds the HTML fragment for a task report.
+//
+// The fragment consists of a summary card (name, target, status, optional
+// error, timing line), an optional benchmark results card, one card per entry
+// in report.Charts with the matching markup from charts inlined verbatim (or
+// an info alert when there are no charts), and a card with the trimmed,
+// HTML-escaped log text.
+func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
+	var out strings.Builder
+	write := func(parts ...string) {
+		for _, part := range parts {
+			out.WriteString(part)
+		}
+	}
+
+	// Summary card.
+	write(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
+	write(`<div class="grid2">`)
+	write(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">`, html.EscapeString(report.Name), `</div>`)
+	write(`<div style="font-size:13px;color:var(--muted)">`, html.EscapeString(report.Target), `</div></div>`)
+	write(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>`, renderTaskStatusBadge(report.Status), `</div>`)
+	if strings.TrimSpace(report.Error) != "" {
+		write(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">`, html.EscapeString(report.Error), `</div>`)
+	}
+	write(`</div></div>`)
+	write(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
+	write(
+		`Started: `, formatTaskTime(report.StartedAt, report.CreatedAt),
+		` | Finished: `, formatTaskTime(report.DoneAt, time.Time{}),
+		` | Duration: `, formatTaskDuration(report.DurationSec),
+	)
+	write(`</div></div></div>`)
+
+	// Benchmark card; the helper returns "" when it does not apply.
+	write(renderTaskBenchmarkResultsCard(report.Target, logText))
+
+	// Chart cards, or a notice when nothing was rendered.
+	if len(report.Charts) == 0 {
+		write(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
+	}
+	for _, chart := range report.Charts {
+		write(`<div class="card"><div class="card-head">`, html.EscapeString(chart.Title), `</div><div class="card-body" style="padding:12px">`)
+		write(charts[chart.File])
+		write(`</div></div>`)
+	}
+
+	// Log card.
+	write(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
+	write(`<div class="terminal" style="max-height:none;white-space:pre-wrap">`, html.EscapeString(strings.TrimSpace(logText)), `</div>`)
+	write(`</div></div>`)
+	return out.String()
+}
+
+// renderTaskBenchmarkResultsCard returns the benchmark results card for an
+// "nvidia-benchmark" task, using the result file located via the archive line
+// in logText. It returns "" for any other target, when no result path can be
+// derived, or when no runs could be loaded from it.
+func renderTaskBenchmarkResultsCard(target, logText string) string {
+	if strings.TrimSpace(target) != "nvidia-benchmark" {
+		return ""
+	}
+	resultFile := taskBenchmarkResultPath(logText)
+	if strings.TrimSpace(resultFile) == "" {
+		return ""
+	}
+	columns, runs := loadBenchmarkHistoryFromPaths([]string{resultFile})
+	if len(runs) > 0 {
+		return renderBenchmarkResultsCardFromRuns(
+			"Benchmark Results",
+			"Composite score for this benchmark task.",
+			"No benchmark results were saved for this task.",
+			columns,
+			runs,
+		)
+	}
+	return ""
+}
+
+func taskBenchmarkResultPath(logText string) string {
+	archivePath := taskArchivePathFromLog(logText)
+	if archivePath == "" {
+		return ""
+	}
+	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
+	if runDir == archivePath {
+		return ""
+	}
+	return filepath.Join(runDir, "result.json")
+}
+
+func taskArchivePathFromLog(logText string) string {
+	lines := strings.Split(logText, "\n")
+	for i := len(lines) - 1; i >= 0; i-- {
+		line := strings.TrimSpace(lines[i])
+		if line == "" || !strings.HasPrefix(line, "Archive:") {
+			continue
+		}
+		path := strings.TrimSpace(strings.TrimPrefix(line, "Archive:"))
+		if strings.HasPrefix(path, "Archive written to ") {
+			path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
+		}
+		if strings.HasSuffix(path, ".tar.gz") {
+			return path
+		}
+	}
+	return ""
+}
+
+func renderTaskStatusBadge(status string) string {
+	className := map[string]string{
+		TaskRunning:   "badge-ok",
+		TaskPending:   "badge-unknown",
+		TaskDone:      "badge-ok",
+		TaskFailed:    "badge-err",
+		TaskCancelled: "badge-unknown",
+	}[status]
+	if className == "" {
+		className = "badge-unknown"
+	}
+	label := strings.TrimSpace(status)
+	if label == "" {
+		label = "unknown"
+	}
+	return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
+}
+
+func formatTaskTime(ts *time.Time, fallback time.Time) string {
+	if ts != nil && !ts.IsZero() {
+		return ts.Local().Format("2006-01-02 15:04:05")
+	}
+	if !fallback.IsZero() {
+		return fallback.Local().Format("2006-01-02 15:04:05")
+	}
+	return "n/a"
+}
+
+// formatTaskDuration renders a duration given in whole seconds as "Ns",
+// "Mm SSs" or "Hh MMm SSs" depending on its size. Zero and negative values
+// yield "n/a".
+func formatTaskDuration(sec int) string {
+	switch {
+	case sec <= 0:
+		return "n/a"
+	case sec < 60:
+		return fmt.Sprintf("%ds", sec)
+	case sec < 3600:
+		return fmt.Sprintf("%dm %02ds", sec/60, sec%60)
+	}
+	hours, rest := sec/3600, sec%3600
+	return fmt.Sprintf("%dh %02dm %02ds", hours, rest/60, rest%60)
+}
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -0,0 +1,821 @@
+package webui
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+)
+
+// TestTaskQueuePersistsAndRecoversPendingTasks persists a queue holding one
+// pending and one running task, loads the state into a fresh queue, and
+// checks that the pending task stays pending with its params while the
+// running task comes back marked failed.
+func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
+	dir := t.TempDir()
+	q := &taskQueue{
+		statePath: filepath.Join(dir, "tasks-state.json"),
+		logsDir:   filepath.Join(dir, "tasks"),
+		trigger:   make(chan struct{}, 1),
+	}
+	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+
+	started := time.Now().Add(-time.Minute)
+
+	// A task that was pending (not yet started) must be re-queued on restart.
+	pendingTask := &Task{
+		ID:        "task-pending",
+		Name:      "Memory Burn-in",
+		Target:    "memory-stress",
+		Priority:  2,
+		Status:    TaskPending,
+		CreatedAt: time.Now().Add(-2 * time.Minute),
+		params:    taskParams{Duration: 300, BurnProfile: "smoke"},
+	}
+	// A task that was running when bee-web crashed must NOT be re-queued —
+	// its child processes (e.g. gpu-burn-worker) survive the restart in
+	// their own process groups and can't be cancelled retroactively.
+	runningTask := &Task{
+		ID:        "task-running",
+		Name:      "NVIDIA GPU Stress",
+		Target:    "nvidia-stress",
+		Priority:  1,
+		Status:    TaskRunning,
+		CreatedAt: time.Now().Add(-3 * time.Minute),
+		StartedAt: &started,
+		params:    taskParams{Duration: 86400},
+	}
+	for _, task := range []*Task{pendingTask, runningTask} {
+		q.tasks = append(q.tasks, task)
+		q.assignTaskLogPathLocked(task)
+	}
+	q.persistLocked()
+
+	// Simulate a restart: a new queue reading the same state file.
+	recovered := &taskQueue{
+		statePath: q.statePath,
+		logsDir:   q.logsDir,
+		trigger:   make(chan struct{}, 1),
+	}
+	recovered.loadLocked()
+
+	if len(recovered.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(recovered.tasks))
+	}
+
+	byID := map[string]*Task{}
+	for i := range recovered.tasks {
+		byID[recovered.tasks[i].ID] = recovered.tasks[i]
+	}
+
+	// Pending task must be re-queued as pending with params intact.
+	p := byID["task-pending"]
+	if p == nil {
+		t.Fatal("task-pending not found")
+	}
+	if p.Status != TaskPending {
+		t.Fatalf("pending task: status=%q want %q", p.Status, TaskPending)
+	}
+	if p.StartedAt != nil {
+		t.Fatalf("pending task: started_at=%v want nil", p.StartedAt)
+	}
+	if p.params.Duration != 300 || p.params.BurnProfile != "smoke" {
+		t.Fatalf("pending task: params=%+v", p.params)
+	}
+	if p.LogPath == "" {
+		t.Fatal("pending task: expected log path")
+	}
+
+	// Running task must be marked failed, not re-queued, to prevent
+	// launching duplicate workers (e.g. a second set of gpu-burn-workers).
+	r := byID["task-running"]
+	if r == nil {
+		t.Fatal("task-running not found")
+	}
+	if r.Status != TaskFailed {
+		t.Fatalf("running task: status=%q want %q", r.Status, TaskFailed)
+	}
+	if r.ErrMsg == "" {
+		t.Fatal("running task: expected non-empty error message")
+	}
+	if r.DoneAt == nil {
+		t.Fatal("running task: expected done_at to be set")
+	}
+}
+
+// TestNewTaskJobStateLoadsExistingLog checks that a job state created for an
+// existing log file hands the file's lines to a new subscriber and also
+// returns a non-nil channel for live updates.
+func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "task.log")
+	if err := os.WriteFile(path, []byte("line1\nline2\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	j := newTaskJobState(path)
+	existing, ch := j.subscribe()
+	if ch == nil {
+		t.Fatal("expected live subscription channel")
+	}
+	if len(existing) != 2 || existing[0] != "line1" || existing[1] != "line2" {
+		t.Fatalf("existing=%v", existing)
+	}
+}
+
+// TestTaskQueueSnapshotSortsNewestFirst checks that snapshot returns tasks
+// ordered by CreatedAt, newest first. The fixture gives the oldest task the
+// highest priority and a running status, so neither of those can account for
+// the expected order.
+func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
+	now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
+	q := &taskQueue{
+		tasks: []*Task{
+			{
+				ID:        "old-running",
+				Name:      "Old Running",
+				Status:    TaskRunning,
+				Priority:  10,
+				CreatedAt: now.Add(-3 * time.Minute),
+			},
+			{
+				ID:        "new-done",
+				Name:      "New Done",
+				Status:    TaskDone,
+				Priority:  0,
+				CreatedAt: now.Add(-1 * time.Minute),
+			},
+			{
+				ID:        "mid-pending",
+				Name:      "Mid Pending",
+				Status:    TaskPending,
+				Priority:  1,
+				CreatedAt: now.Add(-2 * time.Minute),
+			},
+		},
+	}
+
+	got := q.snapshot()
+	if len(got) != 3 {
+		t.Fatalf("snapshot len=%d want 3", len(got))
+	}
+	if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
+		t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
+	}
+}
+
+// TestNewJobIDUsesTASKPrefixAndZeroPadding checks that, with the global queue
+// emptied and the job counter reset to zero, newJobID yields "TASK-000" and
+// then "TASK-001"; the string passed in does not appear in the ID. The
+// original queue contents and counter are restored on cleanup.
+func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	origCounter := jobCounter.Load()
+	jobCounter.Store(0)
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+		jobCounter.Store(origCounter)
+	})
+
+	if got := newJobID("ignored"); got != "TASK-000" {
+		t.Fatalf("id=%q want TASK-000", got)
+	}
+	if got := newJobID("ignored"); got != "TASK-001" {
+		t.Fatalf("id=%q want TASK-001", got)
+	}
+}
+
+// TestTaskArtifactsDirStartsWithTaskNumber checks that the artifacts
+// directory name for task "TASK-007" begins with "007_".
+func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
+	root := t.TempDir()
+	task := &Task{
+		ID:   "TASK-007",
+		Name: "NVIDIA Benchmark",
+	}
+	got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
+	if !strings.HasPrefix(got, "007_") {
+		t.Fatalf("artifacts dir=%q want prefix 007_", got)
+	}
+}
+
+// TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob checks that
+// streaming a finished task that has only a log file on disk (no job
+// attached) replays each log line as an SSE data event and then emits a
+// "done" event. The global queue is swapped for the test and restored on
+// cleanup.
+func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
+	dir := t.TempDir()
+	logPath := filepath.Join(dir, "task.log")
+	if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "done-1",
+		Name:      "Done Task",
+		Status:    TaskDone,
+		CreatedAt: time.Now(),
+		LogPath:   logPath,
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+	})
+
+	req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
+	req.SetPathValue("id", "done-1")
+	rec := httptest.NewRecorder()
+
+	h := &handler{}
+	h.handleAPITasksStream(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
+		t.Fatalf("body=%q", body)
+	}
+	if !strings.Contains(body, "event: done\n") {
+		t.Fatalf("missing done event: %q", body)
+	}
+}
+
+// TestHandleAPITasksStreamPendingTaskStartsSSEImmediately checks that
+// streaming a pending task writes the "Task is queued" notice within two
+// seconds instead of blocking until a worker picks the task up. The handler
+// runs in its own goroutine and is ended by cancelling the request context.
+//
+// NOTE(review): the polling loop reads rec.Body while the handler goroutine
+// may still be writing to it. httptest.ResponseRecorder is not documented as
+// safe for concurrent use — confirm this passes under `go test -race`.
+func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "pending-1",
+		Name:      "Pending Task",
+		Status:    TaskPending,
+		CreatedAt: time.Now(),
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+	})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
+	req.SetPathValue("id", "pending-1")
+	rec := httptest.NewRecorder()
+
+	done := make(chan struct{})
+	go func() {
+		h := &handler{}
+		h.handleAPITasksStream(rec, req)
+		close(done)
+	}()
+
+	// Poll until the queued notice shows up or the deadline passes.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
+			cancel()
+			<-done
+			if rec.Code != http.StatusOK {
+				t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+			}
+			return
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	cancel()
+	<-done
+	t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
+}
+
+// TestFinalizeTaskRunCreatesReportFolderAndArtifacts seeds a temporary
+// metrics database with one sample inside the task window, finalizes a
+// running task whose job finished without error, and checks that the task is
+// marked done, that its artifacts directory name contains "_done", and that
+// report.json and report.html exist with at least one chart listed.
+func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
+	dir := t.TempDir()
+	metricsPath := filepath.Join(dir, "metrics.db")
+	prevMetricsPath := taskReportMetricsDBPath
+	taskReportMetricsDBPath = metricsPath
+	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
+
+	db, err := openMetricsDB(metricsPath)
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	// The sample is 45s old, inside the 90s-old task window set up below.
+	base := time.Now().UTC().Add(-45 * time.Second)
+	if err := db.Write(platform.LiveMetricSample{
+		Timestamp:  base,
+		CPULoadPct: 42,
+		MemLoadPct: 35,
+		PowerW:     510,
+	}); err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+	_ = db.Close()
+
+	q := &taskQueue{
+		statePath: filepath.Join(dir, "tasks-state.json"),
+		logsDir:   filepath.Join(dir, "tasks"),
+		trigger:   make(chan struct{}, 1),
+	}
+	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+
+	started := time.Now().UTC().Add(-90 * time.Second)
+	task := &Task{
+		ID:        "task-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: started.Add(-10 * time.Second),
+		StartedAt: &started,
+	}
+	q.assignTaskLogPathLocked(task)
+	appendJobLog(task.LogPath, "line-1")
+
+	job := newTaskJobState(task.LogPath)
+	job.finish("")
+	q.finalizeTaskRun(task, job)
+
+	if task.Status != TaskDone {
+		t.Fatalf("status=%q want %q", task.Status, TaskDone)
+	}
+	if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
+		t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
+	}
+	if _, err := os.Stat(task.ReportJSONPath); err != nil {
+		t.Fatalf("report json: %v", err)
+	}
+	if _, err := os.Stat(task.ReportHTMLPath); err != nil {
+		t.Fatalf("report html: %v", err)
+	}
+	var report taskReport
+	data, err := os.ReadFile(task.ReportJSONPath)
+	if err != nil {
+		t.Fatalf("ReadFile(report.json): %v", err)
+	}
+	if err := json.Unmarshal(data, &report); err != nil {
+		t.Fatalf("Unmarshal(report.json): %v", err)
+	}
+	if report.ID != task.ID || report.Status != TaskDone {
+		t.Fatalf("report=%+v", report)
+	}
+	if len(report.Charts) == 0 {
+		t.Fatalf("expected charts in report, got none")
+	}
+}
+
+// TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask writes a
+// benchmark result.json into a run directory, points the task log at the
+// matching ".tar.gz" archive path, and checks that the HTML report written
+// for an "nvidia-benchmark" task contains the benchmark results card with the
+// GPU label and composite score.
+func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
+	dir := t.TempDir()
+	metricsPath := filepath.Join(dir, "metrics.db")
+	prevMetricsPath := taskReportMetricsDBPath
+	taskReportMetricsDBPath = metricsPath
+	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
+
+	benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	result := platform.NvidiaBenchmarkResult{
+		GeneratedAt:      time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
+		BenchmarkProfile: "standard",
+		OverallStatus:    "OK",
+		GPUs: []platform.BenchmarkGPUResult{
+			{
+				Index: 0,
+				Name:  "NVIDIA H100 PCIe",
+				Scores: platform.BenchmarkScorecard{
+					CompositeScore: 1176.25,
+				},
+			},
+		},
+	}
+	raw, err := json.Marshal(result)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
+	if err := os.MkdirAll(artifactsDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	task := &Task{
+		ID:           "task-bench",
+		Name:         "NVIDIA Benchmark",
+		Target:       "nvidia-benchmark",
+		Status:       TaskDone,
+		CreatedAt:    time.Now().UTC().Add(-time.Minute),
+		ArtifactsDir: artifactsDir,
+	}
+	ensureTaskReportPaths(task)
+	// The report locates result.json by stripping ".tar.gz" from this path.
+	logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
+	if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := writeTaskReportArtifacts(task); err != nil {
+		t.Fatalf("writeTaskReportArtifacts: %v", err)
+	}
+
+	body, err := os.ReadFile(task.ReportHTMLPath)
+	if err != nil {
+		t.Fatalf("ReadFile(report.html): %v", err)
+	}
+	html := string(body)
+	for _, needle := range []string{
+		`Benchmark Results`,
+		`Composite score for this benchmark task.`,
+		`GPU 0`,
+		`1176.25`,
+	} {
+		if !strings.Contains(html, needle) {
+			t.Fatalf("report missing %q: %s", needle, html)
+		}
+	}
+}
+
+// TestTaskLifecycleMirrorsToSerialConsole replaces taskSerialWriteLine with a
+// collector and checks that enqueueing a task, appending job log lines, and
+// finalizing the run each produce the expected text on the serial mirror.
+func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
+	var lines []string
+	prev := taskSerialWriteLine
+	taskSerialWriteLine = func(line string) { lines = append(lines, line) }
+	t.Cleanup(func() { taskSerialWriteLine = prev })
+
+	dir := t.TempDir()
+	q := &taskQueue{
+		statePath: filepath.Join(dir, "tasks-state.json"),
+		logsDir:   filepath.Join(dir, "tasks"),
+		trigger:   make(chan struct{}, 1),
+	}
+	task := &Task{
+		ID:        "task-serial-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskPending,
+		CreatedAt: time.Now().UTC(),
+	}
+
+	q.enqueue(task)
+	started := time.Now().UTC()
+	task.Status = TaskRunning
+	task.StartedAt = &started
+	job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
+	job.append("Starting CPU SAT...")
+	job.append("CPU stress duration: 60s")
+	job.finish("")
+	q.finalizeTaskRun(task, job)
+
+	joined := strings.Join(lines, "\n")
+	for _, needle := range []string{
+		"queued",
+		"Starting CPU SAT...",
+		"CPU stress duration: 60s",
+		"finished with status=done",
+	} {
+		if !strings.Contains(joined, needle) {
+			t.Fatalf("serial mirror missing %q in %q", needle, joined)
+		}
+	}
+}
+
+// TestResolveBurnPreset checks the duration resolved for each burn profile:
+// 5 minutes for "smoke", 1 hour for "acceptance", 8 hours for "overnight",
+// and the 5-minute duration again for an empty profile name.
+func TestResolveBurnPreset(t *testing.T) {
+	tests := []struct {
+		profile string
+		want    burnPreset
+	}{
+		{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
+		{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
+		{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
+		{profile: "", want: burnPreset{DurationSec: 5 * 60}},
+	}
+	for _, tc := range tests {
+		if got := resolveBurnPreset(tc.profile); got != tc.want {
+			t.Fatalf("resolveBurnPreset(%q)=%+v want %+v", tc.profile, got, tc.want)
+		}
+	}
+}
+
+// TestResolveNvidiaRampPlan is a table test for resolveNvidiaRampPlan. It
+// covers the disabled case, the per-profile stagger steps, the overnight
+// profile's behaviour as the GPU count grows, and the two error cases (too
+// many GPUs for overnight, and ramp enabled without a GPU selection).
+func TestResolveNvidiaRampPlan(t *testing.T) {
+	tests := []struct {
+		name     string
+		profile  string
+		enabled  bool
+		selected []int
+		want     nvidiaRampSpec
+		wantErr  string
+	}{
+		{
+			name:     "disabled uses base preset",
+			profile:  "acceptance",
+			selected: []int{0, 1},
+			want:     nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60},
+		},
+		{
+			name:     "smoke ramp uses two minute steps",
+			profile:  "smoke",
+			enabled:  true,
+			selected: []int{0, 1, 2},
+			want:     nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60},
+		},
+		{
+			name:     "acceptance ramp uses ten minute steps",
+			profile:  "acceptance",
+			enabled:  true,
+			selected: []int{0, 1, 2},
+			want:     nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60},
+		},
+		{
+			name:     "overnight stays at eight hours when possible",
+			profile:  "overnight",
+			enabled:  true,
+			selected: []int{0, 1, 2},
+			want:     nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60},
+		},
+		{
+			name:     "overnight extends to keep one hour after final gpu",
+			profile:  "overnight",
+			enabled:  true,
+			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
+			want:     nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60},
+		},
+		{
+			name:     "overnight rejects impossible gpu count",
+			profile:  "overnight",
+			enabled:  true,
+			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+			wantErr:  "at most 10 GPUs",
+		},
+		{
+			name:    "enabled requires explicit selection",
+			profile: "smoke",
+			enabled: true,
+			wantErr: "requires explicit GPU selection",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
+			if tc.wantErr != "" {
+				// Error cases only assert on a substring of the message.
+				if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
+					t.Fatalf("err=%v want substring %q", err, tc.wantErr)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("resolveNvidiaRampPlan error: %v", err)
+			}
+			if got != tc.want {
+				t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestTaskDisplayNameUsesNvidiaStressLoader checks the display name produced
+// for an "nvidia-stress" task under each loader value; an empty loader gives
+// the same name as "builtin".
+func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
+	tests := []struct {
+		loader string
+		want   string
+	}{
+		{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
+		{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
+		{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
+		{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
+	}
+	for _, tc := range tests {
+		if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
+			t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
+		}
+	}
+}
+
+// TestRunTaskHonorsCancel stubs the CPU acceptance runner with a function
+// that blocks until its context is cancelled, aborts the job once the runner
+// has started, and checks that both the runner and runTask return within two
+// seconds.
+func TestRunTaskHonorsCancel(t *testing.T) {
+	// blocked is closed when the stub starts; released when it returns.
+	blocked := make(chan struct{})
+	released := make(chan struct{})
+	aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
+		close(blocked)
+		select {
+		case <-ctx.Done():
+			close(released)
+			return "", ctx.Err()
+		case <-time.After(5 * time.Second):
+			close(released)
+			return "unexpected", nil
+		}
+	}
+
+	q := &taskQueue{
+		opts: &HandlerOptions{App: &app.App{}},
+	}
+	tk := &Task{
+		ID:        "cpu-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{Duration: 60},
+	}
+	j := &jobState{}
+	ctx, cancel := context.WithCancel(context.Background())
+	j.cancel = cancel
+	tk.job = j
+
+	orig := runCPUAcceptancePackCtx
+	runCPUAcceptancePackCtx = func(_ *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
+		return aRun(nil, ctx, baseDir, durationSec, logFunc)
+	}
+	defer func() { runCPUAcceptancePackCtx = orig }()
+
+	done := make(chan struct{})
+	go func() {
+		q.runTask(tk, j, ctx)
+		close(done)
+	}()
+
+	<-blocked
+	j.abort()
+
+	select {
+	case <-released:
+	case <-time.After(2 * time.Second):
+		t.Fatal("task did not observe cancel")
+	}
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("runTask did not return after cancel")
+	}
+}
+
+// TestRunTaskUsesBurnProfileDurationForCPU checks that a "cpu" task whose
+// params carry only BurnProfile "smoke" (no explicit Duration) invokes the
+// CPU acceptance runner with a 5-minute duration.
+func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
+	var gotDuration int
+	q := &taskQueue{
+		opts: &HandlerOptions{App: &app.App{}},
+	}
+	tk := &Task{
+		ID:        "cpu-burn-1",
+		Name:      "CPU Burn-in",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{BurnProfile: "smoke"},
+	}
+	j := &jobState{}
+
+	orig := runCPUAcceptancePackCtx
+	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
+		gotDuration = durationSec
+		return "/tmp/cpu-burn.tar.gz", nil
+	}
+	defer func() { runCPUAcceptancePackCtx = orig }()
+
+	q.runTask(tk, j, context.Background())
+
+	if gotDuration != 5*60 {
+		t.Fatalf("duration=%d want %d", gotDuration, 5*60)
+	}
+}
+
+// TestRunTaskBuildsSupportBundleWithoutApp checks that a "support-bundle"
+// task runs when HandlerOptions has no App set: the stubbed builder receives
+// the configured export directory, no job error is recorded, and the job log
+// contains an "Archive: <path>" line for the returned bundle.
+func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
+	dir := t.TempDir()
+	q := &taskQueue{
+		opts: &HandlerOptions{ExportDir: dir},
+	}
+	tk := &Task{
+		ID:        "support-bundle-1",
+		Name:      "Support Bundle",
+		Target:    "support-bundle",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+	}
+	j := &jobState{}
+
+	var gotExportDir string
+	orig := buildSupportBundle
+	buildSupportBundle = func(exportDir string) (string, error) {
+		gotExportDir = exportDir
+		return filepath.Join(exportDir, "bundle.tar.gz"), nil
+	}
+	defer func() { buildSupportBundle = orig }()
+
+	q.runTask(tk, j, context.Background())
+
+	if gotExportDir != dir {
+		t.Fatalf("exportDir=%q want %q", gotExportDir, dir)
+	}
+	if j.err != "" {
+		t.Fatalf("unexpected error: %q", j.err)
+	}
+	if !strings.Contains(strings.Join(j.lines, "\n"), "Archive: "+filepath.Join(dir, "bundle.tar.gz")) {
+		t.Fatalf("lines=%v", j.lines)
+	}
+}
+
+// TestTaskElapsedSecClampsInvalidStartedAt checks two implausible StartedAt
+// values on a running task: a zero time must give 0 elapsed seconds, and a
+// start that predates CreatedAt must give the time elapsed since CreatedAt.
+func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
+	now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
+	created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
+	started := time.Time{}
+	task := &Task{
+		Status:    TaskRunning,
+		CreatedAt: created,
+		StartedAt: &started,
+	}
+	if got := taskElapsedSec(task, now); got != 0 {
+		t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
+	}
+
+	// A start 24h before creation is treated as stale.
+	stale := created.Add(-24 * time.Hour)
+	task.StartedAt = &stale
+	if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
+		t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
+	}
+}
+
+// TestRunTaskInstallUsesSharedCommandStreaming stubs installCommand with a
+// shell command that prints two lines and checks that an "install" task
+// passes the target device and a non-empty log path to it, logs an
+// "Install log: " line, streams the command output into the job log, and
+// records no error.
+func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
+	q := &taskQueue{
+		opts: &HandlerOptions{},
+	}
+	tk := &Task{
+		ID:        "install-1",
+		Name:      "Install to Disk",
+		Target:    "install",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{Device: "/dev/sda"},
+	}
+	j := &jobState{}
+
+	var gotDevice string
+	var gotLogPath string
+	orig := installCommand
+	installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
+		gotDevice = device
+		gotLogPath = logPath
+		return exec.CommandContext(ctx, "sh", "-c", "printf 'line1\nline2\n'")
+	}
+	defer func() { installCommand = orig }()
+
+	q.runTask(tk, j, context.Background())
+
+	if gotDevice != "/dev/sda" {
+		t.Fatalf("device=%q want /dev/sda", gotDevice)
+	}
+	if gotLogPath == "" {
+		t.Fatal("expected install log path")
+	}
+	logs := strings.Join(j.lines, "\n")
+	if !strings.Contains(logs, "Install log: ") {
+		t.Fatalf("missing install log line: %v", j.lines)
+	}
+	if !strings.Contains(logs, "line1") || !strings.Contains(logs, "line2") {
+		t.Fatalf("missing streamed output: %v", j.lines)
+	}
+	if j.err != "" {
+		t.Fatalf("unexpected error: %q", j.err)
+	}
+}
+
+// TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow stubs the CPU
+// acceptance runner with a function that panics and checks that executeTask
+// marks the task failed with DoneAt set, records "task panic: boom" on both
+// the task and the job, and leaves the kmsg watcher with no active window.
+func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
+	dir := t.TempDir()
+	q := &taskQueue{
+		opts:        &HandlerOptions{App: &app.App{}},
+		statePath:   filepath.Join(dir, "tasks-state.json"),
+		logsDir:     filepath.Join(dir, "tasks"),
+		kmsgWatcher: newKmsgWatcher(nil),
+	}
+	tk := &Task{
+		ID:        "cpu-panic-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+	}
+	j := &jobState{}
+
+	orig := runCPUAcceptancePackCtx
+	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
+		panic("boom")
+	}
+	defer func() { runCPUAcceptancePackCtx = orig }()
+
+	q.executeTask(tk, j, context.Background())
+
+	if tk.Status != TaskFailed {
+		t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
+	}
+	if tk.DoneAt == nil {
+		t.Fatal("expected done_at to be set")
+	}
+	if !strings.Contains(tk.ErrMsg, "task panic: boom") {
+		t.Fatalf("task error=%q", tk.ErrMsg)
+	}
+	if !strings.Contains(j.err, "task panic: boom") {
+		t.Fatalf("job error=%q", j.err)
+	}
+	// Read the watcher state under its own lock.
+	q.kmsgWatcher.mu.Lock()
+	activeCount := q.kmsgWatcher.activeCount
+	window := q.kmsgWatcher.window
+	q.kmsgWatcher.mu.Unlock()
+	if activeCount != 0 {
+		t.Fatalf("activeCount=%d want 0", activeCount)
+	}
+	if window != nil {
+		t.Fatalf("expected kmsg window to be cleared, got %+v", window)
+	}
+}
--- a/audit/scripts/resolve-version.sh
+++ b/audit/scripts/resolve-version.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+set -eu # abort on command failure and on use of unset variables
+
+tag="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)" # left empty when git describe fails; its stderr is discarded
+
+case "${tag}" in
+	v*) # describe output starting with "v": print it without that leading "v"
+		printf '%s\n' "${tag#v}"
+		;;
+	"") # no describe output: fall back to the literal version "dev"
+		printf 'dev\n'
+		;;
+	*) # any other non-empty value is printed unchanged
+		printf '%s\n' "${tag}"
+		;;
+esac
--- a/2
+++ b/2
--- a/bible-local/architecture/charting.md
+++ b/bible-local/architecture/charting.md
@@ -0,0 +1,67 @@
+# Charting architecture
+
+## Decision: one chart engine for all live metrics
+
+**Engine:** `github.com/go-analyze/charts` (pure Go, no CGO, SVG output)
+**Theme:** `grafana` (dark background, coloured lines)
+
+All live metrics charts in the web UI are server-side SVG images served by Go
+and polled by the browser every 2 seconds via `<img src="...?t=now">`.
+There is no client-side canvas or JS chart library.
+
+## Rule: live charts must be visually uniform
+
+Live charts are a single UI family, not a set of one-off widgets. New charts and
+changes to existing charts must keep the same rendering model and presentation
+rules unless there is an explicit architectural decision to diverge.
+
+Default expectations:
+
+- same server-side SVG pipeline for all live metrics charts
+- same refresh behaviour and failure handling in the browser
+- same canvas size class and card layout
+- same legend placement policy across charts
+- same axis, title, and summary conventions
+- no chart-specific visual exceptions added as a quick fix
+
+Current default for live charts:
+
+- legend below the plot area when a chart has 8 series or fewer
+- legend hidden when a chart has more than 8 series
+- 10 equal Y-axis steps across the chart height
+- 1400 x 360 SVG canvas with legend
+- 1400 x 288 SVG canvas without legend
+- full-width card rendering in a single-column stack
+
+If one chart needs a different layout or legend behaviour, treat that as a
+design-level decision affecting the whole chart family, not as a local tweak to
+just one endpoint.
+
+### Why go-analyze/charts
+
+- Pure Go, no CGO — builds cleanly inside the live-build container
+- SVG output — crisp at any display resolution, full-width without pixelation
+- Grafana theme matches the dark web UI colour scheme
+- Active fork of the archived wcharczuk/go-chart
+
+### SAT stress-test charts
+
+The `drawGPUChartSVG` function in `platform/gpu_metrics.go` is a separate
+self-contained SVG renderer used **only** for completed SAT run reports
+(HTML export, burn-in summaries). It is not used for live metrics.
+
+### Live metrics chart endpoints
+
+| Path | Content |
+|------|---------|
+| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
+| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
+
+Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
+the legend is hidden. The page renders them at `width: 100%` in a
+single-column layout so they always fill the viewport width.
+
+### Ring buffers
+
+Each metric is stored in a 120-sample ring buffer (2 minutes of history at 1 Hz).
+Buffers are per-server or per-GPU; each buffer has a fixed size, and new per-GPU buffers are added dynamically as new GPUs appear.
--- a/bible-local/architecture/runtime-flows.md
+++ b/bible-local/architecture/runtime-flows.md
@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.

 ## Boot sequence (single ISO)

+The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
+
 `systemd` boot order:

 ```
@@ -20,11 +22,12 @@ local-fs.target
  │                           creates /dev/nvidia* nodes)
  ├── bee-audit.service      (runs `bee audit` → /var/log/bee-audit.json,
  │                            never blocks boot on partial collector failures)
-  └── bee-web.service        (runs `bee web` on :80,
-                               reads the latest audit snapshot on each request)
+  ├── bee-web.service        (runs `bee web` on :80 — full interactive web UI)
+  └── bee-desktop.service    (startx → openbox + chromium http://localhost/)
 ```

 **Critical invariants:**
+- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
 - OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
 - `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
 - `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
@@ -41,18 +44,24 @@ Local-console behavior:
 ```text
 tty1
  └── live-config autologin → bee
-        └── /home/bee/.profile
-              └── exec menu
-                    └── /usr/local/bin/bee-tui
-                          └── sudo -n /usr/local/bin/bee tui --runtime livecd
+        └── /home/bee/.profile (prints web UI URLs)
+
+display :0
+  └── bee-desktop.service (User=bee)
+        └── startx /usr/local/bin/bee-openbox-session -- :0
+              ├── tint2 (taskbar)
+              ├── chromium http://localhost/
+              └── openbox (WM)
 ```

 Rules:
 - local `tty1` lands in user `bee`, not directly in `root`
- `menu` must work without typing `sudo`
- TUI actions still run as `root` via `sudo -n`
- SSH is independent from the tty1 path
+- `bee-desktop.service` starts X11 + openbox + Chromium automatically after `bee-web.service`
+- Chromium opens `http://localhost/` — the full interactive web UI
+- SSH is independent from the desktop path
 - serial console support is enabled for VM boot debugging
+- Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
+- Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`

 ## ISO build sequence

@@ -71,24 +80,39 @@ build-in-container.sh [--authorized-keys /path/to/keys]
       d. build kernel modules against Debian headers
       e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
       f. cache in `dist/nvidia-<version>-<kver>/`
-  7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
-  8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
-  9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/`
-  10. write staged `/etc/bee-release` (versions + git commit)
-  11. patch staged `motd` with build metadata
-  12. copy `iso/builder/` into a temporary live-build workdir under `dist/`
-  13. sync staged overlay into workdir `config/includes.chroot/`
-  14. run `lb config && lb build` inside the privileged builder container
+  7. `build-cublas.sh`:
+       a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
+       b. verify packages against repo `Packages.gz`
+       c. extract headers for `bee-gpu-burn` worker build
+       d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
+  8. build `bee-gpu-burn` worker against extracted cuBLASLt/cudart headers
+  9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
+  10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
+  11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
+  12. write staged `/etc/bee-release` (versions + git commit)
+  13. patch staged `motd` with build metadata
+  14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
+  15. sync staged overlay into workdir `config/includes.chroot/`
+  16. run `lb config && lb build` inside the privileged builder container
 ```

+Build host notes:
+- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
+- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
+- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
+
 **Critical invariants:**
 - `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
  1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
  2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
 - NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
+- `bee-gpu-burn` worker must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
+- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
 - The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
 - The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
 - Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
+- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
+- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.

 ## Post-boot smoke test

@@ -104,7 +128,7 @@ Key checks: NVIDIA modules loaded, `nvidia-smi` sees all GPUs, lib symlinks pres
 systemd services running, audit completed with NVIDIA enrichment, LAN reachability.

 Current validation state:
- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and TUI startup
+- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and Web UI startup
 - real hardware validation is still required before treating the ISO as release-ready

 ## Overlay mechanism
@@ -131,43 +155,31 @@ Current validation state:
 Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.

 Acceptance flows:
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
+- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-burn`
+- NVIDIA GPU burn-in can use either `bee-gpu-burn` or `bee-john-gpu-stress` (John the Ripper jumbo via OpenCL)
 - `bee sat memory` → `memtester` archive
 - `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
 - SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
+- `bee-gpu-burn` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
+  - Ampere: `fp16` + `fp32`/TF32 tensor-core load
+  - Ada / Hopper: add `fp8`
+  - Blackwell+: add `fp4`
+  - PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
 - Runtime overrides:
-  - `BEE_GPU_STRESS_SECONDS`
-  - `BEE_GPU_STRESS_SIZE_MB`
  - `BEE_MEMTESTER_SIZE_MB`
  - `BEE_MEMTESTER_PASSES`

-## NVIDIA SAT TUI flow (v1.0.0+)
+## NVIDIA SAT Web UI flow

 ```
-TUI: Acceptance tests → NVIDIA command pack
-  1. screenNvidiaSATSetup
-       a. enumerate GPUs via `nvidia-smi --query-gpu=index,name,memory.total`
-       b. user selects duration preset: 10 min / 1 h / 8 h / 24 h
-       c. user selects GPUs via checkboxes (all selected by default)
-       d. memory size = max(selected GPU memory) — auto-detected, not exposed to user
-  2. Start → screenNvidiaSATRunning
-       a. CUDA_VISIBLE_DEVICES set to selected GPU indices
-       b. tea.Batch: SAT goroutine + tea.ExecProcess(nvtop) launched concurrently
-       c. nvtop occupies full terminal; SAT result queues in background
-       d. [o] reopen nvtop at any time; [a] abort (cancels context → kills bee-gpu-stress)
-  3. GPU metrics collection (during bee-gpu-stress)
-       - background goroutine polls `nvidia-smi` every second
-       - per-second rows: elapsed, GPU index, temp°C, usage%, power W, clock MHz
-       - outputs: gpu-metrics.csv, gpu-metrics.html (offline SVG chart), gpu-metrics-term.txt
-  4. After SAT completes
-       - result shown in screenOutput with terminal line-chart (gpu-metrics-term.txt)
-       - chart is asciigraph-style: box-drawing chars (╭╮╰╯─│), 4 series per GPU,
-         Y axis with ticks, ANSI colours (red=temp, blue=usage, green=power, yellow=clock)
+Web UI: Acceptance Tests page → Run Test button
+  1. POST /api/sat/nvidia/run → returns job_id
+  2. GET  /api/sat/stream?job_id=... (SSE) — streams stdout/stderr lines live
+  3. After completion — archive written to /appdata/bee/export/bee-sat/
+     summary.txt contains overall_status (OK / FAILED) and per-job status values
 ```

 **Critical invariants:**
- `nvtop` must be in `iso/builder/config/package-lists/bee.list.chroot` (baked into ISO).
- `bee-gpu-stress` uses `exec.CommandContext` — aborted on cancel.
+- `bee-gpu-burn` / `bee-john-gpu-stress` use `exec.CommandContext` — killed on job context cancel.
 - Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
- If `nvtop` is not found on PATH, SAT still runs without it (graceful degradation).
 - SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
--- a/bible-local/architecture/system-overview.md
+++ b/bible-local/architecture/system-overview.md
@@ -21,13 +21,14 @@ Fills gaps where Redfish/logpile is blind:
 - Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
 - Machine-readable health summary derived from collector verdicts
 - Operator-triggered acceptance tests for NVIDIA, memory, and storage
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress`
+- NVIDIA SAT includes diagnostic collection plus a lightweight in-image GPU stress step via `bee-gpu-burn`
+- `bee-gpu-burn` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
 - Automatic boot audit with operator-facing local console and SSH access
 - NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
 - SSH access (OpenSSH) always available for inspection and debugging
- Interactive Go TUI via `bee tui` for network setup, service management, and acceptance tests
- Read-only web viewer via `bee web`, rendering the latest audit snapshot through the embedded Reanimator Chart
- Local `tty1` operator UX: `bee` autologin, `menu` auto-start, privileged actions via `sudo -n`
+- Full web UI via `bee web` on port 80: interactive control panel with live metrics, SAT tests, network config, service management, export, and tools
+- Local operator desktop: openbox + Xorg + Chromium auto-opening `http://localhost/`
+- Local `tty1` operator UX: `bee` autologin, openbox desktop auto-starts with Chromium on `http://localhost/`

 ## Network isolation — CRITICAL

@@ -69,15 +70,18 @@ Fills gaps where Redfish/logpile is blind:
 | SSH | OpenSSH server |
 | NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
 | NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
+| GPU stress backend | `bee-gpu-burn` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
 | Builder | Debian 12 host/VM or Debian 12 container image |

 ## Operator UX

 - On the live ISO, `tty1` autologins as `bee`
- The login profile auto-runs `menu`, which enters the Go TUI
- The TUI itself executes privileged actions as `root` via `sudo -n`
+- `bee-desktop.service` starts X11 + openbox + Chromium on display `:0`
+- Chromium opens `http://localhost/` — the full web UI
 - SSH remains available independently of the local console path
+- Remote operators can open `http://<ip>/` in any browser on the same LAN
 - VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
+- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries

 ## Runtime split

@@ -85,6 +89,7 @@ Fills gaps where Redfish/logpile is blind:
 - Live-ISO-only responsibilities stay in `iso/` integration code
 - Live ISO launches the Go CLI with `--runtime livecd`
 - Local/manual runs use `--runtime auto` or `--runtime local`
+- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup

 ## Key paths

@@ -99,7 +104,10 @@ Fills gaps where Redfish/logpile is blind:
 | `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
 | `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
 | `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
-| `iso/overlay/etc/profile.d/bee.sh` | `menu` helper + tty1 auto-start policy |
-| `iso/overlay/home/bee/.profile` | `bee` shell profile for local console startup |
+| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
+| `iso/overlay/home/bee/.profile` | `bee` shell profile (PATH only) |
+| `iso/overlay/etc/systemd/system/bee-desktop.service` | starts X11 + openbox + chromium |
+| `iso/overlay/usr/local/bin/bee-desktop` | startx wrapper for bee-desktop.service |
+| `iso/overlay/usr/local/bin/bee-openbox-session` | xinitrc: tint2 + chromium + openbox |
 | `dist/` | Build outputs (gitignored) |
 | `iso/out/` | Downloaded ISO files (gitignored) |
--- a/bible-local/decisions/2026-03-05-nvidia-proprietary-driver.md
+++ b/bible-local/decisions/2026-03-05-nvidia-proprietary-driver.md
@@ -18,6 +18,8 @@ Use the official proprietary NVIDIA `.run` installer for both kernel modules and
 - Kernel modules and nvidia-smi come from a single verified source.
 - NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
 - Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
+- DCGM must track the CUDA user-mode driver major version exposed by `nvidia-smi`.
+- For NVIDIA driver branch `590` with CUDA `13.x`, use DCGM 4 package family `datacenter-gpu-manager-4-cuda13`; legacy `datacenter-gpu-manager` 3.x does not provide a working path for this stack.
 - Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
 - Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
 - ISO size increases by ~50MB for .ko files + nvidia-smi.
--- a/bible-local/decisions/2026-04-01-memtest-build-strategy.md
+++ b/bible-local/decisions/2026-04-01-memtest-build-strategy.md
@@ -0,0 +1,224 @@
+# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
+
+**Date:** 2026-04-01
+**Status:** resolved
+
+## Context
+
+We have already iterated on `memtest` multiple times and kept cycling between the same ideas.
+The commit history shows several distinct attempts:
+
+- `f91bce8` — fixed Bookworm memtest file names to `memtest86+x64.bin` / `memtest86+x64.efi`
+- `5857805` — added a binary hook to copy memtest files from the build tree into the ISO root
+- `f96b149` — added fallback extraction from the cached `.deb` when `chroot/boot/` stayed empty
+- `d43a9ae` — removed the custom hook and switched back to live-build built-in memtest integration
+- `60cb8f8` — restored explicit memtest menu entries and added ISO validation
+- `3dbc218` / `3869788` — added archived build logs and better memtest diagnostics
+
+Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026-04-01:
+
+- `lb binary_memtest` does run and installs `memtest86+`
+- but the final ISO still does **not** contain `boot/memtest86+x64.bin`
+- the final ISO also does **not** contain memtest menu entries in `boot/grub/grub.cfg` or `isolinux/live.cfg`
+
+So the assumption "live-build built-in memtest integration is enough on this stack" is currently false for this project until proven otherwise by a real built ISO.
+
+Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
+
+- the build now completes successfully because memtest is non-blocking by default
+- `lb binary_memtest` still runs and installs `memtest86+`
+- the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
+- but it executes too early for its current target paths:
+  - `binary/boot/grub/grub.cfg` is still missing at hook time
+  - `binary/isolinux/live.cfg` is still missing at hook time
+  - memtest binaries are also still absent in `binary/boot/`
+- later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
+- but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
+
+So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
+
+Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
+artifact dated 2026-04-01:
+
+- the final ISO does contain `boot/memtest86+x64.bin`
+- the final ISO does contain `boot/memtest86+x64.efi`
+- the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
+  and `isolinux/live.cfg`
+- so `v3.20-5-g76a9100` was **not** another real memtest regression in the
+  shipped ISO
+- the regression was in the build-time validator/debug path in `build.sh`
+
+Root cause of the false alarm:
+
+- `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
+  successfully listed/extracted members"
+- `iso_list_files` / `iso_extract_file` failures were collapsed into the same
+  observable output as "memtest content missing"
+- this made a reader failure look identical to a missing memtest payload
+- as a result, we re-entered the same memtest investigation loop even though
+  the real ISO was already correct
+
+Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
+
+- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
+  still carried live-build's default memtest layout (`live/memtest.bin`,
+  `live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
+- that mismatch is expected to trigger project recovery, because `bee` requires
+  `boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
+- however, `build.sh` exited before recovery because `set -e` treated a direct
+  `iso_memtest_present` return code of `1` as fatal
+- so the next repeated loop was caused by shell control flow, not by proof that
+  the recovery design itself was wrong
+
+## Known Failed Attempts
+
+These approaches were already tried and should not be repeated blindly:
+
+1. Built-in live-build memtest only.
+Reason it failed:
+- `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
+
+2. Fixing only the memtest file names for Debian Bookworm.
+Reason it failed:
+- correct file names alone do not make the files appear in the final ISO.
+
+3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
+Reason it failed:
+- in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
+
+4. Fallback extraction from cached `memtest86+` `.deb`.
+Reason it failed:
+- this was explored already and was not enough to stabilize the final ISO path end-to-end.
+
+5. Restoring explicit memtest menu entries in source bootloader templates only.
+Reason it failed:
+- memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
+
+6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
+Reason it failed:
+- the hook runs before those files exist, so the hook cannot patch them there.
+
+## What This Means
+
+When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
+
+- do not assume the built-in memtest stage is sufficient
+- do not assume `chroot/boot/` will contain memtest payloads
+- do not assume source bootloader templates are the last writer of final ISO configs
+- do not assume the current normal binary hook timing is late enough for final patching
+
+Any future memtest fix must explicitly identify:
+
+- where the memtest binaries are reliably available at build time
+- which exact build stage writes the final bootloader configs that land in the ISO
+- what post-build proof from a real ISO (not only from intermediate workdir files) confirms the fix
+- whether the ISO inspection step itself succeeded, rather than merely whether
+  the validator printed a memtest warning
+- whether a non-zero probe is intentionally handled inside an `if` / `case`
+  context rather than accidentally tripping `set -e`
+
+## Decision
+
+For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
+
+Project rules from now on:
+
+- Do **not** trust `--memtest memtest86+` by itself.
+- A memtest implementation is considered valid only if the produced ISO actually contains:
+  - `boot/memtest86+x64.bin`
+  - `boot/memtest86+x64.efi`
+  - a GRUB menu entry
+  - an isolinux menu entry
+- If live-build built-in integration does not produce those artifacts, use an explicit project-owned mechanism such as:
+  - a binary hook copying files into `binary/boot/`
+  - extraction from the cached `memtest86+` `.deb`
+  - another deterministic build-time copy step
+- Do **not** remove such explicit logic later unless a fresh real ISO build proves that built-in integration alone produces all required files and menu entries.
+
+Current implementation direction:
+
+- keep the live-build memtest stage enabled if it helps package acquisition
+- do not rely on the current early `binary_hooks` timing for final patching
+- prefer a post-`lb build` recovery step in `build.sh` that:
+  - patches the fully materialized `LB_DIR/binary` tree
+  - injects memtest binaries there
+  - ensures final bootloader entries there
+  - reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
+- also treat ISO validation tooling as part of the critical path:
+  - install a stable ISO reader in the builder image
+  - fail with an explicit reader error if ISO listing/extraction fails
+  - do not treat reader failure as evidence that memtest is missing
+  - do not call a probe that may return "needs recovery" as a bare command under
+    `set -e`; wrap it in explicit control flow
+
+## Consequences
+
+- Future memtest changes must begin by reading this ADR and the commits listed above.
+- Future memtest changes must also begin by reading the failed-attempt list above.
+- We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
+- Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
+- But validation output is only trustworthy if ISO reading itself succeeded. A
+  "missing memtest" warning without a successful ISO read is not evidence.
+- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
+
+## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
+
+This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
+and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
+
+### Components
+
+**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
+
+Runs inside the live-build binary phase. Does not rely on patching bootloader files at
+hook time, because those files may not exist yet. Instead:
+
+- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
+- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
+- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
+  If they do not exist, the hook warns and continues (does not fail).
+
+Controlled by `BEE_REQUIRE_MEMTEST=1` env var to turn warnings into hard errors when needed.
+
+**2. Post-`lb build` recovery step in `build.sh`**
+
+After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
+contains all required memtest artifacts. If not:
+
+- Copies/extracts memtest binaries into `binary/boot/`.
+- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
+- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
+  the ISO with the patched tree.
+
+This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
+step handles the final `binary/` tree after live-build has written all bootloader configs.
+
+**3. ISO validation hardening**
+
+The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
+as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
+handled — it does not abort the build prematurely.
+
+ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
+If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
+This prevents the false-negative loop that burned through builds v3.14–v3.19 on 2026-04-01.
+
+### Why this works when earlier attempts did not
+
+The earlier patterns all shared a single flaw: they assumed a single build-time point
+(hook or source template) would be the last writer of bootloader configs and memtest payloads.
+In live-build on Debian Bookworm that assumption is false — live-build continues writing
+bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
+
+The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
+`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
+There is no ordering dependency to get wrong.
+
+### Do not revert
+
+Do not remove the recovery step or the hook without a fresh real ISO build proving
+live-build alone produces all four required artifacts:
+- `boot/memtest86+x64.bin`
+- `boot/memtest86+x64.efi`
+- memtest entry in `boot/grub/grub.cfg`
+- memtest entry in `isolinux/live.cfg`
--- a/bible-local/decisions/README.md
+++ b/bible-local/decisions/README.md
@@ -5,3 +5,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
 | Date | Decision | Status |
 |---|---|---|
 | 2026-03-05 | Use NVIDIA proprietary driver | active |
+| 2026-04-01 | Treat memtest as explicit ISO content | active |
--- a/bible-local/docs/benchmark-clock-calibration.md
+++ b/bible-local/docs/benchmark-clock-calibration.md
@@ -0,0 +1,248 @@
+# Benchmark clock calibration research
+
+## Status
+In progress. Baseline data from production servers pending.
+
+## Background
+
+The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
+before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
+`avg_steady_clock < locked_target * 0.90`.
+
+Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
+even a healthy GPU in a non-ideal server will sustain clocks well below boost.
+The 90% threshold has no empirical basis.
+
+## Key observations (2026-04-06)
+
+### H100 PCIe — new card, server not designed for it
+- avg clock 1384 MHz, P95 1560 MHz (unstable; boost probably 1755 MHz)
+- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
+- Stability: 70.0 — clocks erratic, no equilibrium found
+- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
+
+### H200 NVL — new card, server not designed for it
+- avg clock = P95 = 1635 MHz (perfectly stable)
+- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
+- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
+- Degradation: power_capped, thermal_limited
+- Compute: 989 TOPS — card is computing correctly for its frequency
+
+### Key insight
+The meaningful distinction is not *whether* the card throttles but *how stably*
+it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
+H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
+instability may reflect a more severe thermal mismatch or a card issue.
+
+`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
+`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
+
+## Hypothesis for baseline
+
+After testing on servers designed for their GPUs (proper cooling):
+- Healthy GPU under sustained load will run at a stable fraction of boost
+- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
+- Base clock (`clocks.base.gr`) may be a better reference than boost:
+  a healthy card under real workload should comfortably exceed base clock
+
+## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
+
+Source: external stress test tool, ~90s runs, designed server, adequate power.
+
+### Healthy fingerprint
+
+- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
+- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
+  - Avg steady (visual): **~1580–1620 MHz**
+  - vs boost 1755 MHz: **~91–92%**
+  - Oscillation is NORMAL — this is the boost algorithm balancing under power cap
+  - Stable power + oscillating clocks = healthy power-cap behavior
+- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
+- **Consistency**: all 10 samples within ±20 MHz — very repeatable
+
+### Characteristic pattern
+Flat power line + oscillating/declining clock line = GPU correctly managed by
+power cap algorithm. Do NOT flag this as instability.
+
+### Clock CV implication
+The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
+The current `variance_too_high` threshold (StabilityScore < 85) may fire on
+healthy HBM2e PCIe cards. Needs recalibration.
+
+---
+
+## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
+
+Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
+Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
+
+### GPU clock reference (from nvidia-smi, idle):
+- base_clock_mhz: **1095**
+- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
+- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
+- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
+
+### Observed under 700W sustained load (both samples nearly identical):
+- Power: ~700W flat — SXM slot, adequate power confirmed
+- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
+- vs 1980 MHz (lock target): **72–74%** — severely below
+- vs 1755 MHz (nvidia-smi boost): **81–83%**
+- vs 1095 MHz (base): 130% — above base but far below expected for SXM
+- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
+- Temperature: 38°C → 79–80°C (same rate as HBM2e)
+- Oscillation: present, similar character to HBM2e but at much lower frequency
+
+### Diagnosis
+These restored cards are degraded. A healthy H100 SXM in a designed server
+(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
+The 72–74% result is a clear signal of silicon or VRM degradation from the
+refurbishment process.
+
+### Clock pattern note
+Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
+to images 19/20. Both sample sets show the same degraded pattern — same batch.
+
+---
+
+## Baseline matrix (filled where data available)
+
+| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
+|---|---|---|---|---|---|
+| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
+| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
+| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
+| H200 NVL | designed | TBD | TBD | TBD | need baseline |
+
+---
+
+## H100 official spec (from NVIDIA datasheet)
+
+Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
+All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
+
+| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
+|---|---|---|---|---|---|
+| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
+| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
+| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
+| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
+
+Notes:
+- SXM boards do NOT list FP8 peak in this table (field empty)
+- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
+- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
+
+## Observed efficiency (H100 80GB PCIe, throttled server)
+
+From the report in this session (power+thermal throttle throughout steady):
+
+| Precision | Measured | Spec (dense) | % of spec |
+|---|---|---|---|
+| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
+| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
+| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
+
+30–44% of spec is expected given sustained power+thermal throttle (avg clock
+1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
+actual frequency — the low TOPS comes from throttle, not silicon defect.
+
+## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
+
+Format: without sparsity / with sparsity.
+
+| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
+|---|---|---|---|---|---|
+| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
+| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
+
+## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
+
+Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
+
+| Precision | Measured | Spec (dense) | % of spec |
+|---|---|---|---|
+| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
+| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
+| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
+
+Comparable to H100 PCIe efficiency (30–44%) despite different architecture —
+both are throttle-limited. Confirms that % of spec is not a quality signal,
+it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
+
+## Real-world GEMM efficiency reference (2026-04-06, web research)
+
+Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
+worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
+
+### What healthy systems actually achieve:
+- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
+- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
+- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
+
+### Our results vs expectation:
+| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
+|---|---|---|---|---|
+| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
+| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
+
+Our results are roughly **half** of what a healthy system achieves even under throttle.
+This is NOT normal — 30-44% is not the industry baseline.
+
+### Likely causes of the gap (in order of probability):
+1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
+2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
+   Previous user may have set a lower limit via nvidia-smi -pl and it was not
+   reset. Our normalization sets clock locks but does NOT reset power limit.
+   Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
+3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
+   8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
+
+### Power limit gap analysis (H100 PCIe):
+- Avg clock 1384 MHz = 79% of boost 1755 MHz
+- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
+- Actually measured: 329 TOPS = 55% of that estimate
+- Remaining gap after accounting for clock throttle: ~45%
+- Most likely explanation: enforced power limit < 350W TDP, further reducing
+  sustainable clock beyond what sw_thermal alone would cause.
+
+### Action item:
+Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
+so result.json shows if the card was pre-configured with a non-default limit.
+If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
+
+### CPU/RAM impact on GPU FLOPS:
+None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
+CPU core count and host RAM are irrelevant.
+
+## Compute efficiency metric (proposed, no hardcode)
+
+Instead of comparing TOPS to a hardcoded spec, compute:
+  tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
+
+This is model-agnostic. A GPU computing correctly at its actual frequency
+will show a consistent tops_per_sm_per_ghz regardless of throttle level.
+A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
+normal clocks.
+
+SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
+(needs to be added to queryBenchmarkGPUInfo).
+
+Reference values to establish after baseline runs:
+- H100 PCIe fp16_tensor: TBD tops/SM/GHz
+- H100 SXM fp16_tensor: TBD tops/SM/GHz
+
+## Proposed threshold changes (pending more data)
+
+1. **`low_sm_clock_vs_target`**: lower threshold from 90% to 85% based on observed
+   91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
+   capture the root cause.
+
+2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
+   under power cap. Consider suppressing this flag when power is flat and usage
+   is 100% (oscillation is expected). Or lower threshold to 70.
+
+3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
+   ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
+   would have been caught by this).
+
+Decision deferred until baseline on SXM designed servers collected.
--- a/bible-local/docs/gpu-model-propagation.md
+++ b/bible-local/docs/gpu-model-propagation.md
@@ -0,0 +1,117 @@
+# GPU Model Name Propagation
+
+How GPU model names are detected, stored, and displayed throughout the project.
+
+---
+
+## Detection Sources
+
+There are **three separate pipelines** for GPU model names — they use different structs and don't share state.
+
+### Pipeline A — Live / SAT (nvidia-smi query at runtime)
+
+**File:** `audit/internal/platform/sat.go`
+
+- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
+- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
+- Used by: GPU selection UI, live metrics labels, burn/stress test logic
+
+### Pipeline B — Benchmark results
+
+**File:** `audit/internal/platform/benchmark.go`, line 124
+
+- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
+- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
+- Used by: benchmark history table, benchmark report
+
+### Pipeline C — Hardware audit JSON (PCIe schema)
+
+**File:** `audit/internal/schema/hardware.go`
+
+- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
+- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
+- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
+- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
+
+---
+
+## Key Inconsistency: NVIDIA PCIe Model is Never Set
+
+`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
+
+This means:
+- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
+- AMD GPUs do have their model populated
+
+The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
+
+---
+
+## Benchmark History "Unknown GPU" Issue
+
+**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
+
+**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
+
+This happens for:
+- Older result files saved before the `Name` field was added
+- Runs where nvidia-smi query failed before the benchmark started
+
+---
+
+## Fallback Strings — Current State
+
+| Location | File | Fallback string |
+|---|---|---|
+| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
+| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
+| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
+| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
+| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
+| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
+| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
+| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
+
+**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
+
+---
+
+## GPU Selection UI
+
+**File:** `audit/internal/webui/pages.go`
+
+- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
+- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
+- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
+
+This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
+
+---
+
+## Data Flow Summary
+
+```
+nvidia-smi (live)
+  └─ ListNvidiaGPUs() → NvidiaGPU.Name
+       ├─ GPU selection UI (always correct)
+       ├─ Live metrics labels (charts_svg.go)
+       └─ SAT/burn status file (sat.go)
+
+nvidia-smi (at benchmark start)
+  └─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
+       └─ BenchmarkGPUResult.Name (json:"name,omitempty")
+            ├─ Benchmark report
+            └─ Benchmark history table columns
+
+nvidia-smi / lspci (audit collection)
+  └─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
+       └─ Hardware summary page hwDescribeGPU()
+```
+
+---
+
+## What Needs Fixing
+
+1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
+2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
+3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
--- a/bible-local/docs/iso-build-rules.md
+++ b/bible-local/docs/iso-build-rules.md
@@ -0,0 +1,62 @@
+# ISO Build Rules
+
+## Verify package names before use
+
+ISO builds take 30–60 minutes. A wrong package name wastes an entire build cycle.
+
+**Rule: before adding any Debian package name to the ISO config, verify it exists and check its file list.**
+
+Use one of:
+- `https://packages.debian.org/bookworm/<package-name>` — existence + description
+- `https://packages.debian.org/bookworm/amd64/<package-name>/filelist` — exact files installed
+- `apt-cache show <package>` inside a Debian bookworm container
+
+This applies to:
+- `iso/builder/config/package-lists/*.list.chroot`
+- Any package referenced in bootloader configs, hooks, or overlay scripts
+
+## Memtest rule
+
+Do not assume live-build's built-in memtest integration is sufficient for `bee`.
+We already tried that path and regressed again on 2026-04-01: `lb binary_memtest`
+ran, but the final ISO still lacked memtest binaries and menu entries.
+
+For this project, memtest is accepted only when the produced ISO actually
+contains all of the following:
+
+- `boot/memtest86+x64.bin`
+- `boot/memtest86+x64.efi`
+- a memtest entry in `boot/grub/grub.cfg`
+- a memtest entry in `isolinux/live.cfg`
+
+Rules:
+
+- Keep explicit post-build memtest validation in `build.sh`.
+- Treat ISO reader success as a separate prerequisite from memtest content.
+  If the reader cannot list or extract from the ISO, that is a validator
+  failure, not proof that memtest is missing.
+- If built-in integration does not produce the artifacts above, use a
+  deterministic project-owned copy/extract step instead of hoping live-build
+  will "start working".
+- Do not switch back to built-in-only memtest without fresh build evidence from
+  a real ISO.
+- If you reference memtest files manually, verify the exact package file list
+  first for the target Debian release.
+
+Known bad loops for this repository:
+
+- Do not retry built-in-only memtest without new evidence. We already proved
+  that `lb binary_memtest` can run while the final ISO still has no memtest.
+- Do not assume fixing memtest file names is enough. Correct names did not fix
+  the final artifact path.
+- Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
+- Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
+  bootloader configs.
+- Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
+  timing is late enough to patch final `binary/boot/grub/grub.cfg` or
+  `binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
+  present yet when the hook executed.
+- Do not treat a validator warning as ground truth until you have confirmed the
+  ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
+  regression because the final ISO was correct but the validator produced a
+  false negative.
--- a/Show More
+++ b/Show More