Warn on PCIe link speed degradation and collect lspci -vvv in techdump

- collector/pcie: add applyPCIeLinkSpeedWarning that sets status=Warning and ErrorDescription when the current link speed is below the maximum negotiated speed (e.g. Gen1 running in a Gen5 slot)
- collector/pcie: add pcieLinkSpeedRank helper for Gen string comparison
- collector/pcie_filter_test: cover degraded and healthy link speed cases
- platform/techdump: collect lspci -vvv → lspci-vvv.txt for LnkCap/LnkSta

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Add USB export drive and LiveCD-in-RAM checks to Runtime Health
2026-04-12 12:42:17 +03:00 · 2026-04-11 10:05:27 +03:00 · 2026-04-11 10:05:27 +03:00 · 2026-04-10 13:57:26 +03:00 · 2026-04-10 13:30:32 +03:00 · 2026-04-09 23:41:23 +03:00
65 changed files with 5886 additions and 1074 deletions
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 			archive, err = application.RunNvidiaAcceptancePack("", logLine)
 		}
 	case "memory":
-		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
+		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
 	case "storage":
-		archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
+		archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
 	case "cpu":
 		dur := *duration
 		if dur <= 0 {
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -117,13 +117,15 @@ type satRunner interface {
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
-	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
-	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
-	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
+	ResetNvidiaGPU(index int) (string, error)
+	RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
+	RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
 	RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
 	DetectGPUVendor() string
@@ -188,6 +190,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 	}
 	result := collector.Run(runtimeMode)
 	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
+	writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
@@ -521,6 +524,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return a.sat.ListNvidiaGPUs()
 }

+func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
+	return a.sat.ListNvidiaGPUStatuses()
+}
+
+func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
+	out, err := a.sat.ResetNvidiaGPU(index)
+	return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
+}
+
 func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -555,11 +567,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }

-func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
 }

 func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -591,14 +603,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
 }

 func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
-	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
+	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
 }

-func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
+	return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
 }

 func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
@@ -623,14 +635,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
 }

 func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
-	return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
+	return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
 }

-func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
+	return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
 }

 func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
@@ -915,6 +927,41 @@ func bodyOr(body, fallback string) string {
 	return body
 }

+// writePSUStatusesToDB records PSU statuses collected during audit into the
+// component-status DB so they are visible in the Hardware Summary card.
+// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
+func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
+	if db == nil || len(psus) == 0 {
+		return
+	}
+	const source = "audit:ipmi"
+	worstStatus := "OK"
+	for _, psu := range psus {
+		if psu.Status == nil {
+			continue
+		}
+		slot := "?"
+		if psu.Slot != nil {
+			slot = *psu.Slot
+		}
+		st := *psu.Status
+		detail := ""
+		if psu.ErrorDescription != nil {
+			detail = *psu.ErrorDescription
+		}
+		db.Record("psu:"+slot, source, st, detail)
+		switch st {
+		case "Critical":
+			worstStatus = "Critical"
+		case "Warning":
+			if worstStatus != "Critical" {
+				worstStatus = "Warning"
+			}
+		}
+	}
+	db.Record("psu:all", source, worstStatus, "")
+}
+
 func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
 	raw, err := os.ReadFile(path)
 	if err != nil {
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -135,6 +135,8 @@ type fakeSAT struct {
 	listAMDGPUsFn             func() ([]platform.AMDGPUInfo, error)
 	runAMDPackFn              func(string) (string, error)
 	listNvidiaGPUsFn          func() ([]platform.NvidiaGPU, error)
+	listNvidiaGPUStatusesFn   func() ([]platform.NvidiaGPUStatus, error)
+	resetNvidiaGPUFn          func(int) (string, error)
 }

 func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -159,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
 	return f.runNvidiaFn(baseDir)
 }

-func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
 	if f.runNvidiaComputeFn != nil {
 		return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
 	}
@@ -201,11 +203,25 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return nil, nil
 }

-func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
+func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
+	if f.listNvidiaGPUStatusesFn != nil {
+		return f.listNvidiaGPUStatusesFn()
+	}
+	return nil, nil
+}
+
+func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
+	if f.resetNvidiaGPUFn != nil {
+		return f.resetNvidiaGPUFn(index)
+	}
+	return "", nil
+}
+
+func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
 	return f.runMemoryFn(baseDir)
 }

-func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
+func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
 	return f.runStorageFn(baseDir)
 }

@@ -526,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
 }

 func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
-	t.Parallel()
-
 	tmp := t.TempDir()
 	oldExportDir := DefaultExportDir
 	DefaultExportDir = tmp
@@ -564,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
 }

 func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
-	t.Parallel()
-
 	tmp := t.TempDir()
 	oldExportDir := DefaultExportDir
 	DefaultExportDir = tmp
@@ -627,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
 }

 func TestRunSATDefaultsToExportDir(t *testing.T) {
-	t.Parallel()
-
 	oldSATBaseDir := DefaultSATBaseDir
 	DefaultSATBaseDir = "/tmp/export/bee-sat"
 	t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
@@ -805,6 +815,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	for _, want := range []string{
 		"/system/ip-link.txt",
 		"/system/ip-link-stats.txt",
+		"/system/kernel-aer-nvidia.txt",
+		"/system/lspci-nvidia-bridges-vv.txt",
+		"/system/pcie-aer-sysfs.txt",
 		"/system/ethtool-info.txt",
 		"/system/ethtool-link.txt",
 		"/system/ethtool-module.txt",
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -3,6 +3,7 @@ package app
 import (
 	"os"
 	"path/filepath"
+	"strconv"
 	"sort"
 	"strings"

@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
 	}
 	if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
 		applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
+		applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
 	}
 	if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
 		applyMemorySAT(snap.Memory, summary)
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
 	applyComponentStatusDB(snap, db)
 }

+type nvidiaPerGPUStatus struct {
+	runStatus string
+	reason    string
+}
+
+func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
+	statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
+	if !ok {
+		return
+	}
+	for i := range devs {
+		if devs[i].Telemetry == nil {
+			continue
+		}
+		rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
+		if !ok {
+			continue
+		}
+		idx, ok := telemetryInt(rawIdx)
+		if !ok {
+			continue
+		}
+		st, ok := statusByIndex[idx]
+		if !ok {
+			continue
+		}
+		status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
+		if !ok {
+			continue
+		}
+		mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
+	}
+}
+
+func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
+	matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
+	if err != nil || len(matches) == 0 {
+		return nil, "", false
+	}
+	sort.Strings(matches)
+	runDir := matches[len(matches)-1]
+	summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
+	if err != nil {
+		return nil, "", false
+	}
+	summaryKV := parseKeyValueSummary(string(summaryRaw))
+	runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
+	files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
+	if err != nil || len(files) == 0 {
+		return nil, "", false
+	}
+	out := make(map[int]nvidiaPerGPUStatus, len(files))
+	for _, file := range files {
+		raw, err := os.ReadFile(file)
+		if err != nil {
+			continue
+		}
+		kv := parseKeyValueSummary(string(raw))
+		idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
+		if err != nil {
+			continue
+		}
+		out[idx] = nvidiaPerGPUStatus{
+			runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
+			reason:    strings.TrimSpace(kv["reason"]),
+		}
+	}
+	if len(out) == 0 {
+		return nil, "", false
+	}
+	return out, runAtUTC, true
+}
+
+func telemetryInt(v any) (int, bool) {
+	switch value := v.(type) {
+	case int:
+		return value, true
+	case int32:
+		return int(value), true
+	case int64:
+		return int(value), true
+	case float64:
+		return int(value), true
+	case string:
+		n, err := strconv.Atoi(strings.TrimSpace(value))
+		if err != nil {
+			return 0, false
+		}
+		return n, true
+	default:
+		return 0, false
+	}
+}
+
 type satSummary struct {
 	runAtUTC string
 	overall  string
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
 	}
 }

+func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
+	if component == nil || satStatus == "" {
+		return
+	}
+	current := strings.TrimSpace(ptrString(component.Status))
+	newSeverity := statusSeverity(satStatus)
+	currentSeverity := statusSeverity(current)
+	if current == "" || current == "Unknown" || newSeverity > currentSeverity {
+		mergeComponentStatus(component, changedAt, satStatus, description)
+		return
+	}
+	if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
+		component.Status = appStringPtr(satStatus)
+		component.ErrorDescription = appStringPtr(description)
+		if strings.TrimSpace(changedAt) != "" {
+			component.StatusChangedAt = appStringPtr(changedAt)
+			component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
+				Status:    satStatus,
+				ChangedAt: changedAt,
+				Details:   appStringPtr(description),
+			})
+		}
+	}
+}
+
 func statusSeverity(status string) int {
 	switch strings.TrimSpace(status) {
 	case "Critical":
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
 		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
 	}
 }
+
+func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
+	baseDir := t.TempDir()
+	runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	class := "VideoController"
+	manufacturer := "NVIDIA Corporation"
+	bdf0 := "0000:4b:00.0"
+	bdf1 := "0000:4f:00.0"
+	snap := schema.HardwareSnapshot{
+		PCIeDevices: []schema.HardwarePCIeDevice{
+			{
+				DeviceClass:  &class,
+				Manufacturer: &manufacturer,
+				BDF:          &bdf0,
+				Telemetry:    map[string]any{"nvidia_gpu_index": 0},
+			},
+			{
+				DeviceClass:  &class,
+				Manufacturer: &manufacturer,
+				BDF:          &bdf1,
+				Telemetry:    map[string]any{"nvidia_gpu_index": 1},
+			},
+		},
+	}
+
+	applyLatestSATStatuses(&snap, baseDir, nil)
+
+	if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
+		t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
+	}
+	if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
+		got := "<nil>"
+		if snap.PCIeDevices[1].ErrorDescription != nil {
+			got = *snap.PCIeDevices[1].ErrorDescription
+		}
+		t.Fatalf("gpu1 error=%q want per-gpu reason", got)
+	}
+}
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -40,17 +40,75 @@ var supportBundleCommands = []struct {
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
 	{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
+	{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
+if command -v dmesg >/dev/null 2>&1; then
+  dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
+else
+  echo "dmesg not found"
+fi
+`}},
 	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
+	{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
+if ! command -v lspci >/dev/null 2>&1; then
+  echo "lspci not found"
+  exit 0
+fi
+found=0
+	for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
+  found=1
+  echo "=== GPU $gpu ==="
+  lspci -s "$gpu" -vv 2>&1 || true
+  bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
+  if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
+    echo
+    echo "=== UPSTREAM $bridge for $gpu ==="
+    lspci -s "$bridge" -vv 2>&1 || true
+  fi
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no NVIDIA PCI devices found"
+fi
+`}},
 	{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
 for d in /sys/bus/pci/devices/*/; do
  vendor=$(cat "$d/vendor" 2>/dev/null)
-  [ "$vendor" = "0x10de" ] || continue
-  dev=$(basename "$d")
+	  [ "$vendor" = "0x10de" ] || continue
+	  class=$(cat "$d/class" 2>/dev/null)
+	  case "$class" in
+	    0x030000|0x030200) ;;
+	    *) continue ;;
+	  esac
+	  dev=$(basename "$d")
  echo "=== $dev ==="
  for f in current_link_speed current_link_width max_link_speed max_link_width; do
    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
  done
 done
+`}},
+	{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
+found=0
+for dev in /sys/bus/pci/devices/*; do
+  [ -e "$dev" ] || continue
+  bdf=$(basename "$dev")
+  block=""
+  for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
+    if [ -r "$dev/$f" ]; then
+      if [ -z "$block" ]; then
+        block=1
+        found=1
+        echo "=== $bdf ==="
+      fi
+      printf "  %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
+    fi
+  done
+  if [ -n "$block" ]; then
+    echo
+  fi
+done
+if [ "$found" -eq 0 ]; then
+  echo "no PCIe AER sysfs counters found"
+fi
 `}},
 	{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
@@ -139,7 +197,7 @@ var supportBundleOptionalFiles = []struct {
 	{name: "system/syslog.txt", src: "/var/log/syslog"},
 }

-const supportBundleGlob = "bee-support-*.tar.gz"
+const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"

 func BuildSupportBundle(exportDir string) (string, error) {
 	exportDir = strings.TrimSpace(exportDir)
@@ -153,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}

-	host := sanitizeFilename(hostnameOr("unknown"))
-	ts := time.Now().UTC().Format("20060102-150405")
-	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
+	now := time.Now().UTC()
+	date := now.Format("2006-01-02")
+	tod := now.Format("150405")
+	ver := bundleVersion()
+	model := serverModelForBundle()
+	sn := serverSerialForBundle()
+
+	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
 	if err := os.MkdirAll(stageRoot, 0755); err != nil {
 		return "", err
 	}
@@ -187,7 +250,8 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}

-	archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
+	archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
+	archivePath := filepath.Join(os.TempDir(), archiveName)
 	if err := createSupportTarGz(archivePath, stageRoot); err != nil {
 		return "", err
 	}
@@ -344,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
 	return os.WriteFile(dst, []byte(body.String()), 0644)
 }

+func bundleVersion() string {
+	v := buildVersion()
+	v = strings.TrimPrefix(v, "v")
+	v = strings.TrimPrefix(v, "V")
+	if v == "" || v == "unknown" {
+		return "0.0"
+	}
+	return v
+}
+
+func serverModelForBundle() string {
+	raw, err := exec.Command("dmidecode", "-t", "1").Output()
+	if err != nil {
+		return "unknown"
+	}
+	for _, line := range strings.Split(string(raw), "\n") {
+		line = strings.TrimSpace(line)
+		key, val, ok := strings.Cut(line, ": ")
+		if !ok {
+			continue
+		}
+		if strings.TrimSpace(key) == "Product Name" {
+			val = strings.TrimSpace(val)
+			if val == "" {
+				return "unknown"
+			}
+			return strings.ReplaceAll(val, " ", "_")
+		}
+	}
+	return "unknown"
+}
+
+func serverSerialForBundle() string {
+	raw, err := exec.Command("dmidecode", "-t", "1").Output()
+	if err != nil {
+		return "unknown"
+	}
+	for _, line := range strings.Split(string(raw), "\n") {
+		line = strings.TrimSpace(line)
+		key, val, ok := strings.Cut(line, ": ")
+		if !ok {
+			continue
+		}
+		if strings.TrimSpace(key) == "Serial Number" {
+			val = strings.TrimSpace(val)
+			if val == "" {
+				return "unknown"
+			}
+			return val
+		}
+	}
+	return "unknown"
+}
+
 func buildVersion() string {
 	raw, err := exec.Command("bee", "version").CombinedOutput()
 	if err != nil {
--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
 	defer cancel()
 	return exec.CommandContext(ctx, name, args...).Output()
 }
-
-func interfaceHasCarrier(iface string) bool {
-	raw, err := readNetCarrierFile(iface)
-	if err != nil {
-		return false
-	}
-	return strings.TrimSpace(raw) == "1"
-}
--- a/audit/internal/collector/nic_telemetry.go
+++ b/audit/internal/collector/nic_telemetry.go
@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
 			}
 		}

-		if interfaceHasCarrier(iface) {
-			if out, err := ethtoolModuleQuery(iface); err == nil {
-				if injectSFPDOMTelemetry(&devs[i], out) {
-					enriched++
-					continue
-				}
+		if out, err := ethtoolModuleQuery(iface); err == nil {
+			if injectSFPDOMTelemetry(&devs[i], out) {
+				enriched++
+				continue
 			}
 		}
 		if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
 		}
 		key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
 		val := strings.TrimSpace(trimmed[idx+1:])
+		if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
+			continue
+		}

 		switch {
+		case key == "identifier":
+			s := parseSFPIdentifier(val)
+			dev.SFPIdentifier = &s
+			t := true
+			dev.SFPPresent = &t
+			changed = true
+		case key == "connector":
+			s := parseSFPConnector(val)
+			dev.SFPConnector = &s
+			changed = true
+		case key == "vendor name":
+			s := strings.TrimSpace(val)
+			dev.SFPVendor = &s
+			changed = true
+		case key == "vendor pn":
+			s := strings.TrimSpace(val)
+			dev.SFPPartNumber = &s
+			changed = true
+		case key == "vendor sn":
+			s := strings.TrimSpace(val)
+			dev.SFPSerialNumber = &s
+			changed = true
+		case strings.Contains(key, "laser wavelength"):
+			if f, ok := firstFloat(val); ok {
+				dev.SFPWavelengthNM = &f
+				changed = true
+			}
 		case strings.Contains(key, "module temperature"):
 			if f, ok := firstFloat(val); ok {
 				dev.SFPTemperatureC = &f
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
 	return changed
 }

+// parseSFPIdentifier extracts the human-readable transceiver type from the
+// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
+func parseSFPIdentifier(val string) string {
+	if s := extractParens(val); s != "" {
+		return s
+	}
+	return val
+}
+
+// parseSFPConnector extracts the connector type from the raw ethtool line,
+// e.g. "0x07 (LC)" → "LC".
+func parseSFPConnector(val string) string {
+	if s := extractParens(val); s != "" {
+		return s
+	}
+	return val
+}
+
+var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
+
+func extractParens(s string) string {
+	m := parenRe.FindStringSubmatch(s)
+	if len(m) < 2 {
+		return ""
+	}
+	return strings.TrimSpace(m[1])
+}
+
 func parseSFPDOM(raw string) map[string]any {
 	dev := schema.HardwarePCIeDevice{}
 	if !injectSFPDOMTelemetry(&dev, raw) {
 		return map[string]any{}
 	}
 	out := map[string]any{}
+	if dev.SFPPresent != nil {
+		out["sfp_present"] = *dev.SFPPresent
+	}
+	if dev.SFPIdentifier != nil {
+		out["sfp_identifier"] = *dev.SFPIdentifier
+	}
+	if dev.SFPConnector != nil {
+		out["sfp_connector"] = *dev.SFPConnector
+	}
+	if dev.SFPVendor != nil {
+		out["sfp_vendor"] = *dev.SFPVendor
+	}
+	if dev.SFPPartNumber != nil {
+		out["sfp_part_number"] = *dev.SFPPartNumber
+	}
+	if dev.SFPSerialNumber != nil {
+		out["sfp_serial_number"] = *dev.SFPSerialNumber
+	}
+	if dev.SFPWavelengthNM != nil {
+		out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
+	}
 	if dev.SFPTemperatureC != nil {
 		out["sfp_temperature_c"] = *dev.SFPTemperatureC
 	}
--- a/audit/internal/collector/nic_telemetry_test.go
+++ b/audit/internal/collector/nic_telemetry_test.go
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
 	readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
 	readNetCarrierFile = func(string) (string, error) { return "0", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
-	ethtoolModuleQuery = func(string) (string, error) {
-		t.Fatal("ethtool -m should not be called without carrier")
-		return "", nil
-	}
+	ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }

 	class := "EthernetController"
 	bdf := "0000:18:00.0"
--- a/audit/internal/collector/nvidia.go
+++ b/audit/internal/collector/nvidia.go
@@ -13,7 +13,9 @@ import (
 const nvidiaVendorID = 0x10de

 type nvidiaGPUInfo struct {
+	Index              int
 	BDF                string
+	Name               string
 	Serial             string
 	VBIOS              string
 	TemperatureC       *float64
@@ -72,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 			continue
 		}

+		if v := strings.TrimSpace(info.Name); v != "" {
+			devs[i].Model = &v
+		}
 		if v := strings.TrimSpace(info.Serial); v != "" {
 			devs[i].SerialNumber = &v
 		}
@@ -98,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
 	out, err := exec.Command(
 		"nvidia-smi",
-		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
+		"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
 		"--format=csv,noheader,nounits",
 	).Output()
 	if err != nil {
@@ -122,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		if len(rec) == 0 {
 			continue
 		}
-		if len(rec) < 13 {
-			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
+		if len(rec) < 14 {
+			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
 		}

 		bdf := normalizePCIeBDF(rec[1])
@@ -132,18 +137,20 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		}

 		info := nvidiaGPUInfo{
+			Index:              parseRequiredInt(rec[0]),
 			BDF:                bdf,
-			Serial:             strings.TrimSpace(rec[2]),
-			VBIOS:              strings.TrimSpace(rec[3]),
-			TemperatureC:       parseMaybeFloat(rec[4]),
-			PowerW:             parseMaybeFloat(rec[5]),
-			ECCUncorrected:     parseMaybeInt64(rec[6]),
-			ECCCorrected:       parseMaybeInt64(rec[7]),
-			HWSlowdown:         parseMaybeBool(rec[8]),
-			PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
-			PCIeLinkGenMax:     parseMaybeInt(rec[10]),
-			PCIeLinkWidthCur:   parseMaybeInt(rec[11]),
-			PCIeLinkWidthMax:   parseMaybeInt(rec[12]),
+			Name:               strings.TrimSpace(rec[2]),
+			Serial:             strings.TrimSpace(rec[3]),
+			VBIOS:              strings.TrimSpace(rec[4]),
+			TemperatureC:       parseMaybeFloat(rec[5]),
+			PowerW:             parseMaybeFloat(rec[6]),
+			ECCUncorrected:     parseMaybeInt64(rec[7]),
+			ECCCorrected:       parseMaybeInt64(rec[8]),
+			HWSlowdown:         parseMaybeBool(rec[9]),
+			PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
+			PCIeLinkGenMax:     parseMaybeInt(rec[11]),
+			PCIeLinkWidthCur:   parseMaybeInt(rec[12]),
+			PCIeLinkWidthMax:   parseMaybeInt(rec[13]),
 		}
 		result[bdf] = info
 	}
@@ -187,6 +194,14 @@ func parseMaybeInt(v string) *int {
 	return &n
 }

+func parseRequiredInt(v string) int {
+	n, err := strconv.Atoi(strings.TrimSpace(v))
+	if err != nil {
+		return 0
+	}
+	return n
+}
+
 func pcieLinkGenLabel(gen int) string {
 	return fmt.Sprintf("Gen%d", gen)
 }
@@ -240,6 +255,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
 }

 func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
+	if dev.Telemetry == nil {
+		dev.Telemetry = map[string]any{}
+	}
+	dev.Telemetry["nvidia_gpu_index"] = info.Index
 	if info.TemperatureC != nil {
 		dev.TemperatureC = info.TemperatureC
 	}
--- a/audit/internal/collector/nvidia_test.go
+++ b/audit/internal/collector/nvidia_test.go
@@ -6,7 +6,7 @@ import (
 )

 func TestParseNVIDIASMIQuery(t *testing.T) {
-	raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
+	raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
 	byBDF, err := parseNVIDIASMIQuery(raw)
 	if err != nil {
 		t.Fatalf("parse failed: %v", err)
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
 	if !ok {
 		t.Fatalf("gpu by normalized bdf not found")
 	}
+	if gpu.Name != "NVIDIA H100 80GB HBM3" {
+		t.Fatalf("name: got %q", gpu.Name)
+	}
 	if gpu.Serial != "GPU-SERIAL-1" {
 		t.Fatalf("serial: got %q", gpu.Serial)
 	}
@@ -86,6 +89,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
 	if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
 		t.Fatalf("firmware: got %v", out[0].Firmware)
 	}
+	if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
+		t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
+	}
 	if out[0].Status == nil || *out[0].Status != statusWarning {
 		t.Fatalf("status: got %v", out[0].Status)
 	}
--- a/audit/internal/collector/pcie.go
+++ b/audit/internal/collector/pcie.go
@@ -2,6 +2,7 @@ package collector

 import (
 	"bee/audit/internal/schema"
+	"fmt"
 	"log/slog"
 	"os/exec"
 	"strconv"
@@ -79,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
 		}
 	}

+	// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
+	// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
+	if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
+		bmcPatterns := []string{
+			"management system chip",
+			"management controller",
+			"ibmc",
+			"idrac",
+			"ilo vga",
+			"aspeed",
+			"matrox",
+		}
+		for _, bad := range bmcPatterns {
+			if strings.Contains(d, bad) {
+				return false
+			}
+		}
+	}
+
 	if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
 		internalAMDPatterns := []string{
 			"dummy function",
@@ -153,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {

 	// SVendor/SDevice available but not in schema — skip

+	// Warn if PCIe link is running below its maximum negotiated speed.
+	applyPCIeLinkSpeedWarning(&dev)
+
 	return dev
 }

@@ -222,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
 	return value, true
 }

+// applyPCIeLinkSpeedWarning flags dev as degraded when its current PCIe link
+// speed ranks below its maximum link speed.
+//
+// On degradation it sets dev.Status to statusWarning and dev.ErrorDescription
+// to a message naming both speeds. The device is left untouched when either
+// speed is missing, when either value is not a recognised "GenN" label, or when
+// the link already runs at its maximum.
+func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
+	if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
+		return
+	}
+	current := pcieLinkSpeedRank(*dev.LinkSpeed)
+	maximum := pcieLinkSpeedRank(*dev.MaxLinkSpeed)
+	// Rank 0 means "unrecognised". Without this guard an unparsed current speed
+	// would compare below any valid maximum and raise a false warning.
+	if current == 0 || maximum == 0 || current >= maximum {
+		return
+	}
+	warn := statusWarning
+	dev.Status = &warn
+	desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
+	dev.ErrorDescription = &desc
+}
+
+// pcieLinkSpeedRank maps a normalized PCIe generation label to its number
+// ("Gen1" → 1 … "Gen6" → 6). Any other string, including different casing or
+// surrounding whitespace, yields 0; callers should treat 0 as "unknown".
+func pcieLinkSpeedRank(gen string) int {
+	const prefix, generations = "Gen", "123456"
+	if len(gen) != len(prefix)+1 || gen[:len(prefix)] != prefix {
+		return 0
+	}
+	// IndexByte is -1 for a digit outside 1–6, which lands on the 0 fallback.
+	return strings.IndexByte(generations, gen[len(prefix)]) + 1
+}
+
 func normalizePCILinkSpeed(raw string) string {
 	raw = strings.TrimSpace(strings.ToLower(raw))
 	switch {
--- a/audit/internal/collector/pcie_filter_test.go
+++ b/audit/internal/collector/pcie_filter_test.go
@@ -1,6 +1,7 @@
 package collector

 import (
+	"bee/audit/internal/schema"
 	"encoding/json"
 	"strings"
 	"testing"
@@ -29,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
 		{name: "raid", class: "RAID bus controller", want: true},
 		{name: "nvme", class: "Non-Volatile memory controller", want: true},
 		{name: "vga", class: "VGA compatible controller", want: true},
+		{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
+		{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
 		{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
 	}

@@ -139,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
 		}
 	}
 }
+
+// TestApplyPCIeLinkSpeedWarning checks that a device starting in statusOK is
+// moved to statusWarning, with an ErrorDescription naming the current speed,
+// only when its current link speed is below its maximum. A link at maximum or
+// a missing speed must leave both Status and ErrorDescription alone.
+func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
+	strPtr := func(v string) *string { return &v }
+
+	cases := []struct {
+		name        string
+		linkSpeed   *string
+		maxSpeed    *string
+		wantWarning bool
+		wantGenIn   string // substring expected in ErrorDescription when warning
+	}{
+		{name: "degraded Gen1 vs Gen5", linkSpeed: strPtr("Gen1"), maxSpeed: strPtr("Gen5"), wantWarning: true, wantGenIn: "Gen1"},
+		{name: "at max Gen5", linkSpeed: strPtr("Gen5"), maxSpeed: strPtr("Gen5")},
+		{name: "degraded Gen4 vs Gen5", linkSpeed: strPtr("Gen4"), maxSpeed: strPtr("Gen5"), wantWarning: true, wantGenIn: "Gen4"},
+		{name: "missing current speed — no warning", maxSpeed: strPtr("Gen5")},
+		{name: "missing max speed — no warning", linkSpeed: strPtr("Gen1")},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Every device starts healthy so any Warning comes from the call under test.
+			initial := statusOK
+			dev := schema.HardwarePCIeDevice{}
+			dev.Status = &initial
+			dev.LinkSpeed = tc.linkSpeed
+			dev.MaxLinkSpeed = tc.maxSpeed
+
+			applyPCIeLinkSpeedWarning(&dev)
+
+			gotWarn := dev.Status != nil && *dev.Status == statusWarning
+			if gotWarn != tc.wantWarning {
+				t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tc.wantWarning, gotWarn, dev.Status)
+			}
+
+			switch {
+			case !tc.wantWarning:
+				if dev.ErrorDescription != nil {
+					t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
+				}
+			case dev.ErrorDescription == nil:
+				t.Fatal("expected ErrorDescription to be set")
+			case !strings.Contains(*dev.ErrorDescription, tc.wantGenIn):
+				t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tc.wantGenIn)
+			}
+		})
+	}
+}
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -27,14 +27,17 @@ type benchmarkProfileSpec struct {
 }

 type benchmarkGPUInfo struct {
-	Index               int
-	UUID                string
-	Name                string
-	BusID               string
-	VBIOS               string
-	PowerLimitW         float64
-	MaxGraphicsClockMHz float64
-	MaxMemoryClockMHz   float64
+	Index                int
+	UUID                 string
+	Name                 string
+	BusID                string
+	VBIOS                string
+	PowerLimitW          float64
+	DefaultPowerLimitW   float64
+	MaxGraphicsClockMHz  float64
+	MaxMemoryClockMHz    float64
+	BaseGraphicsClockMHz float64
+	MultiprocessorCount  int
 }

 type benchmarkBurnProfile struct {
@@ -102,7 +105,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		BenchmarkVersion:   benchmarkVersion,
 		GeneratedAt:        time.Now().UTC(),
 		Hostname:           hostname,
+		ServerModel:        readServerModel(),
 		BenchmarkProfile:   spec.Name,
+		ParallelGPUs:       opts.ParallelGPUs,
 		SelectedGPUIndices: append([]int(nil), selected...),
 		Normalization: BenchmarkNormalization{
 			Status: "full",
@@ -111,6 +116,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv

 	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))

+	// Server power characterization state — populated during per-GPU phases.
+	var serverIdleW, serverLoadedWSum float64
+	var serverIdleOK, serverLoadedOK bool
+	var serverLoadedSamples int
+
 	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
 	if infoErr != nil {
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
@@ -135,6 +145,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}()

+	if opts.ParallelGPUs {
+		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
+	} else {
+
 	for _, idx := range selected {
 		gpuResult := BenchmarkGPUResult{
 			Index:  idx,
@@ -146,7 +160,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			gpuResult.BusID = info.BusID
 			gpuResult.VBIOS = info.VBIOS
 			gpuResult.PowerLimitW = info.PowerLimitW
+			gpuResult.MultiprocessorCount = info.MultiprocessorCount
+			gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
 			gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
+			gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
@@ -161,6 +178,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
 		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)

+		// Sample server idle power once (first GPU only — server state is global).
+		if !serverIdleOK {
+			if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
+				serverIdleW = w
+				serverIdleOK = true
+				logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+			}
+		}
+
 		warmupCmd := []string{
 			"bee-gpu-burn",
 			"--seconds", strconv.Itoa(spec.WarmupSec),
@@ -184,7 +210,50 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			"--devices", strconv.Itoa(idx),
 		}
 		logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
+
+		// Sample server power via IPMI in parallel with the steady phase.
+		// We collect readings every 5s and average them.
+		ipmiStopCh := make(chan struct{})
+		ipmiResultCh := make(chan float64, 1)
+		go func() {
+			defer close(ipmiResultCh)
+			var samples []float64
+			ticker := time.NewTicker(5 * time.Second)
+			defer ticker.Stop()
+			// First sample after a short warmup delay.
+			select {
+			case <-ipmiStopCh:
+				return
+			case <-time.After(15 * time.Second):
+			}
+			for {
+				if w, err := queryIPMIServerPowerW(); err == nil {
+					samples = append(samples, w)
+				}
+				select {
+				case <-ipmiStopCh:
+					if len(samples) > 0 {
+						var sum float64
+						for _, w := range samples {
+							sum += w
+						}
+						ipmiResultCh <- sum / float64(len(samples))
+					}
+					return
+				case <-ticker.C:
+				}
+			}
+		}()
+
 		steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
+		close(ipmiStopCh)
+		if loadedW, ok := <-ipmiResultCh; ok {
+			serverLoadedWSum += loadedW
+			serverLoadedSamples++
+			serverLoadedOK = true
+			logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
+		}
+
 		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
 		afterThrottle, _ := queryThrottleCounters(idx)
 		if steadyErr != nil {
@@ -222,6 +291,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
 	}

+	} // end sequential path
+
 	if len(selected) > 1 && opts.RunNCCL {
 		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
 		if result.Interconnect != nil && result.Interconnect.Supported {
@@ -232,6 +303,17 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}

+	// Compute server power characterization from accumulated IPMI samples.
+	var gpuReportedSumW float64
+	for _, gpu := range result.GPUs {
+		gpuReportedSumW += gpu.Steady.AvgPowerW
+	}
+	var serverLoadedW float64
+	if serverLoadedSamples > 0 {
+		serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
+	}
+	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
+
 	result.Findings = buildBenchmarkFindings(result)
 	result.OverallStatus = benchmarkOverallStatus(result)

@@ -243,9 +325,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		return "", fmt.Errorf("write result.json: %w", err)
 	}

-	report := renderBenchmarkReport(result)
-	if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
-		return "", fmt.Errorf("write report.txt: %w", err)
+	report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
+	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
+		return "", fmt.Errorf("write report.md: %w", err)
 	}

 	summary := renderBenchmarkSummary(result)
@@ -253,11 +335,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		return "", fmt.Errorf("write summary.txt: %w", err)
 	}

-	archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", fmt.Errorf("pack benchmark archive: %w", err)
-	}
-	return archive, nil
+	return runDir, nil
 }

 func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
@@ -288,50 +366,87 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 	}
 }

-func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
-	args := []string{
-		"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
-		"--format=csv,noheader,nounits",
-	}
-	if len(gpuIndices) > 0 {
-		args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
-	}
-	out, err := satExecCommand("nvidia-smi", args...).Output()
-	if err != nil {
-		return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
-	}
-
-	r := csv.NewReader(strings.NewReader(string(out)))
-	r.TrimLeadingSpace = true
-	r.FieldsPerRecord = -1
-	rows, err := r.ReadAll()
-	if err != nil {
-		return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
-	}
-
-	infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
-	for _, row := range rows {
-		if len(row) < 8 {
-			continue
-		}
-		idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
-		if err != nil {
-			continue
-		}
-		infoByIndex[idx] = benchmarkGPUInfo{
-			Index:               idx,
-			UUID:                strings.TrimSpace(row[1]),
-			Name:                strings.TrimSpace(row[2]),
-			BusID:               strings.TrimSpace(row[3]),
-			VBIOS:               strings.TrimSpace(row[4]),
-			PowerLimitW:         parseBenchmarkFloat(row[5]),
-			MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
-			MaxMemoryClockMHz:   parseBenchmarkFloat(row[7]),
-		}
-	}
-	return infoByIndex, nil
+// benchmarkGPUInfoQueries lists the nvidia-smi --query-gpu field sets that
+// queryBenchmarkGPUInfo tries, in order; the first successful query wins. Extended fields
+// (attribute.multiprocessor_count, power.default_limit) are not supported on
+// all driver versions, so we fall back to the base set if the full query fails.
+// Field order matters: the output rows are read by column position.
+var benchmarkGPUInfoQueries = []struct {
+	fields   string
+	extended bool // whether this query includes optional extended fields
+}{
+	{
+		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
+		extended: true,
+	},
+	{
+		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
+		extended: false,
+	},
 }

+// queryBenchmarkGPUInfo collects static per-GPU attributes from nvidia-smi,
+// keyed by GPU index. When gpuIndices is non-empty the query is restricted to
+// those GPUs via --id.
+//
+// Each entry of benchmarkGPUInfoQueries is attempted in turn. A query whose
+// command fails or whose CSV output cannot be parsed is skipped in favour of
+// the next one; the first usable query determines the result. Rows with fewer
+// than nine columns or a non-numeric index are dropped silently. If every
+// query fails, the error from the last attempt is returned.
+func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
+	var lastErr error
+	for _, query := range benchmarkGPUInfoQueries {
+		// Argument order: optional --id first, then the field list and format.
+		args := make([]string, 0, 3)
+		if len(gpuIndices) > 0 {
+			args = append(args, "--id="+joinIndexList(gpuIndices))
+		}
+		args = append(args, "--query-gpu="+query.fields, "--format=csv,noheader,nounits")
+
+		out, err := satExecCommand("nvidia-smi", args...).Output()
+		if err != nil {
+			// Only the first 40 characters of the field list go into the message.
+			lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", query.fields[:min(len(query.fields), 40)], err)
+			continue
+		}
+
+		reader := csv.NewReader(strings.NewReader(string(out)))
+		reader.TrimLeadingSpace = true
+		reader.FieldsPerRecord = -1 // accept rows with differing column counts
+		records, err := reader.ReadAll()
+		if err != nil {
+			lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
+			continue
+		}
+
+		infoByIndex := make(map[int]benchmarkGPUInfo, len(records))
+		for _, rec := range records {
+			if len(rec) < 9 {
+				continue
+			}
+			gpuIndex, convErr := strconv.Atoi(strings.TrimSpace(rec[0]))
+			if convErr != nil {
+				continue
+			}
+			info := benchmarkGPUInfo{
+				Index:                gpuIndex,
+				UUID:                 strings.TrimSpace(rec[1]),
+				Name:                 strings.TrimSpace(rec[2]),
+				BusID:                strings.TrimSpace(rec[3]),
+				VBIOS:                strings.TrimSpace(rec[4]),
+				PowerLimitW:          parseBenchmarkFloat(rec[5]),
+				MaxGraphicsClockMHz:  parseBenchmarkFloat(rec[6]),
+				MaxMemoryClockMHz:    parseBenchmarkFloat(rec[7]),
+				BaseGraphicsClockMHz: parseBenchmarkFloat(rec[8]),
+			}
+			// Columns 9 and 10 exist only in the extended query.
+			if query.extended {
+				if len(rec) > 9 {
+					info.MultiprocessorCount = int(parseBenchmarkFloat(rec[9]))
+				}
+				if len(rec) > 10 {
+					info.DefaultPowerLimitW = parseBenchmarkFloat(rec[10])
+				}
+			}
+			infoByIndex[gpuIndex] = info
+		}
+		return infoByIndex, nil
+	}
+	return nil, lastErr
+}
+
+
 func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
 	if os.Geteuid() != 0 {
 		result.Normalization.Status = "partial"
@@ -370,6 +485,10 @@ func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndi
 					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
 				}})
 			}
+		} else {
+			rec.GPUClockLockStatus = "skipped"
+			rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
+			result.Normalization.Status = "partial"
 		}

 		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
@@ -551,6 +670,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
 	}
 	category := "other"
 	switch {
+	case strings.HasPrefix(name, "fp64"):
+		category = "fp64"
 	case strings.HasPrefix(name, "fp32"):
 		category = "fp32_tf32"
 	case strings.HasPrefix(name, "fp16"):
@@ -619,14 +740,23 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 			score.ComputeScore += precision.TeraOpsPerSec
 		}
 	}
-	if gpu.PowerLimitW > 0 {
-		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100)
+	// Use default power limit for sustain score so a manually reduced limit
+	// does not inflate the score. Fall back to enforced limit if default unknown.
+	referencePowerW := gpu.DefaultPowerLimitW
+	if referencePowerW <= 0 {
+		referencePowerW = gpu.PowerLimitW
+	}
+	if referencePowerW > 0 {
+		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100)
 	}
 	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
 	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
 	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
 	score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
 	score.CompositeScore = compositeBenchmarkScore(score)
+	if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
+		score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
+	}
 	return score
 }

@@ -679,7 +809,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
 		"-g", strconv.Itoa(len(gpuIndices)),
 		"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
 	}
-	env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
+	env := []string{
+		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
+		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
+	}
 	logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
 	out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
 	_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
@@ -795,10 +928,30 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {

 func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 	var findings []string
+
+	passed := 0
+	for _, gpu := range result.GPUs {
+		if gpu.Status == "OK" {
+			passed++
+		}
+	}
+	total := len(result.GPUs)
+	if total > 0 {
+		if passed == total {
+			findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
+		} else {
+			findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
+		}
+	}
+
 	if result.Normalization.Status != "full" {
 		findings = append(findings, "Environment normalization was partial; compare results with caution.")
 	}
 	for _, gpu := range result.GPUs {
+		if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
+			findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
+			continue
+		}
 		if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
 			findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
 			continue
@@ -822,10 +975,24 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 		if gpu.Backend == "driver-ptx" {
 			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
 		}
+		if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
+			findings = append(findings, fmt.Sprintf(
+				"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
+				gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
+			))
+		}
 	}
 	if result.Interconnect != nil && result.Interconnect.Supported {
 		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
 	}
+	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
+		if sp.ReportingRatio < 0.75 {
+			findings = append(findings, fmt.Sprintf(
+				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
+				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
+			))
+		}
+	}
 	return dedupeStrings(findings)
 }

@@ -1004,3 +1171,309 @@ func maxInt(a, b int) int {
 	}
 	return b
 }
+
+// queryIPMIServerPowerW runs `ipmitool dcmi power reading` and returns the
+// value that parseDCMIPowerReading extracts from its output.
+// It returns 0 and an error when the command fails or when no positive reading
+// can be extracted.
+func queryIPMIServerPowerW() (float64, error) {
+	raw, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
+	if err != nil {
+		return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
+	}
+	watts := parseDCMIPowerReading(string(raw))
+	// Written as a negated "> 0" so that NaN is rejected along with 0 and negatives.
+	if !(watts > 0) {
+		return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
+	}
+	return watts, nil
+}
+
+// sampleIPMIPowerSeries reads server power via queryIPMIServerPowerW about
+// every 2 seconds until durationSec seconds have elapsed or ctx is cancelled,
+// whichever comes first. Failed readings are skipped.
+//
+// It returns the mean of the successful samples and true. It returns 0, false
+// when durationSec is not positive or when no reading succeeded (for example
+// because IPMI is unavailable). At least one reading is always attempted.
+func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
+	if durationSec <= 0 {
+		return 0, false
+	}
+	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
+	var samples []float64
+sampling:
+	for {
+		if w, err := queryIPMIServerPowerW(); err == nil {
+			samples = append(samples, w)
+		}
+		if time.Now().After(deadline) {
+			break
+		}
+		select {
+		case <-ctx.Done():
+			// A bare break would only leave the select and the loop would spin,
+			// re-running ipmitool until the deadline. Name the loop so that
+			// cancellation really stops sampling.
+			break sampling
+		case <-time.After(2 * time.Second):
+		}
+	}
+	if len(samples) == 0 {
+		return 0, false
+	}
+	var sum float64
+	for _, w := range samples {
+		sum += w
+	}
+	return sum / float64(len(samples)), true
+}
+
+// characterizeServerPower builds the BenchmarkServerPower summary from the
+// idle and loaded server power figures and the summed GPU-reported power.
+//
+// The result is never nil. When ipmiAvailable is false it carries only
+// Available=false and an explanatory note. Otherwise DeltaW is loadedW-idleW,
+// and ReportingRatio is DeltaW/gpuReportedSumW when both are positive; it is
+// left at zero in every other case.
+func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
+	if !ipmiAvailable {
+		skipped := &BenchmarkServerPower{Available: false}
+		skipped.Notes = append(skipped.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
+		return skipped
+	}
+
+	delta := loadedW - idleW
+	power := &BenchmarkServerPower{Available: true}
+	power.IdleW, power.LoadedW, power.DeltaW = idleW, loadedW, delta
+	power.GPUReportedSumW = gpuReportedSumW
+	if delta > 0 && gpuReportedSumW > 0 {
+		power.ReportingRatio = delta / gpuReportedSumW
+	}
+	return power
+}
+
+// readServerModel reads /sys/class/dmi/id/product_name and returns its
+// contents with surrounding whitespace removed (e.g. "SuperMicro SYS-421GE-TNRT").
+// Any read error, such as the file not existing, yields an empty string.
+func readServerModel() string {
+	raw, err := os.ReadFile("/sys/class/dmi/id/product_name")
+	if err == nil {
+		return strings.TrimSpace(string(raw))
+	}
+	return ""
+}
+
+// filterRowsByGPU returns the rows whose GPUIndex equals gpuIndex, in their
+// original order. The result is nil when no row matches.
+func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
+	var matched []GPUMetricRow
+	for i := range rows {
+		if rows[i].GPUIndex != gpuIndex {
+			continue
+		}
+		matched = append(matched, rows[i])
+	}
+	return matched
+}
+
+// parseBenchmarkBurnLogByGPU groups multi-GPU bee-gpu-burn output by its
+// "[gpu N] " line prefix and parses each GPU's lines with parseBenchmarkBurnLog.
+// Lines without that prefix, or whose N is not an integer, are ignored. The
+// returned map is keyed by N and only holds GPUs that produced at least one line.
+func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
+	const marker = "[gpu "
+	byGPU := make(map[int][]string)
+	normalized := strings.ReplaceAll(raw, "\r\n", "\n")
+	for _, entry := range strings.Split(normalized, "\n") {
+		entry = strings.TrimSpace(entry)
+		if !strings.HasPrefix(entry, marker) {
+			continue
+		}
+		// head is everything before the first "] ", i.e. "[gpu N".
+		head, payload, found := strings.Cut(entry, "] ")
+		if !found {
+			continue
+		}
+		gpuIdx, err := strconv.Atoi(strings.TrimSpace(head[len(marker):]))
+		if err != nil {
+			continue
+		}
+		byGPU[gpuIdx] = append(byGPU[gpuIdx], payload)
+	}
+
+	parsed := make(map[int]benchmarkBurnParseResult, len(byGPU))
+	for gpuIdx, payloads := range byGPU {
+		// The [gpu N] prefix is already gone at this point.
+		// NOTE(review): the original author states parseBenchmarkBurnLog strips
+		// that prefix itself and that doing so is a no-op here — confirm.
+		parsed[gpuIdx] = parseBenchmarkBurnLog(strings.Join(payloads, "\n"))
+	}
+	return parsed
+}
+
+// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs
+// simultaneously using a single bee-gpu-burn invocation per phase.
+//
+// Phases, in order: baseline sampling, one-time server idle power sampling
+// (skipped when *serverIdleOK is already true), warmup, steady compute with
+// concurrent IPMI power sampling, cooldown sampling, then scoring. Per-GPU
+// metrics files are written under runDir for each phase, alongside the combined
+// gpu-all-warmup.log and gpu-all-steady.log.
+//
+// Results are delivered through the pointer arguments rather than a return
+// value: one finalized BenchmarkGPUResult per entry of selected is appended to
+// result.GPUs (in selected order), *serverIdleW / *serverIdleOK receive the idle
+// power reading, and the mean loaded power of the steady phase is added to
+// *serverLoadedWSum with *serverLoadedSamples incremented and *serverLoadedOK set.
+// Phase failures are recorded as per-GPU Notes and do not abort the run; only a
+// steady-phase error or a PTX fallback changes the final Status from "OK".
+func runNvidiaBenchmarkParallel(
+	ctx context.Context,
+	verboseLog, runDir string,
+	selected []int,
+	infoByIndex map[int]benchmarkGPUInfo,
+	opts NvidiaBenchmarkOptions,
+	spec benchmarkProfileSpec,
+	logFunc func(string),
+	result *NvidiaBenchmarkResult,
+	serverIdleW *float64, serverLoadedWSum *float64,
+	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
+) {
+	allDevices := joinIndexList(selected)
+
+	// Build per-GPU result stubs.
+	// Status starts as "FAILED" and is overwritten in the final scoring loop.
+	gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
+	for _, idx := range selected {
+		r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
+		if info, ok := infoByIndex[idx]; ok {
+			r.UUID = info.UUID
+			r.Name = info.Name
+			r.BusID = info.BusID
+			r.VBIOS = info.VBIOS
+			r.PowerLimitW = info.PowerLimitW
+			r.MultiprocessorCount = info.MultiprocessorCount
+			r.DefaultPowerLimitW = info.DefaultPowerLimitW
+			r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
+			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
+			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
+		}
+		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
+			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
+			r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
+		}
+		gpuResults[idx] = r
+	}
+
+	// Baseline: sample all GPUs together.
+	// A context.Canceled error is not reported as a sampling failure.
+	baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
+	if err != nil && err != context.Canceled {
+		for _, idx := range selected {
+			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
+		}
+	}
+	for _, idx := range selected {
+		perGPU := filterRowsByGPU(baselineRows, idx)
+		gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
+		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
+	}
+
+	// Sample server idle power once.
+	if !*serverIdleOK {
+		if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
+			*serverIdleW = w
+			*serverIdleOK = true
+			logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+		}
+	}
+
+	// Warmup: all GPUs simultaneously.
+	warmupCmd := []string{
+		"bee-gpu-burn",
+		"--seconds", strconv.Itoa(spec.WarmupSec),
+		"--size-mb", strconv.Itoa(opts.SizeMB),
+		"--devices", allDevices,
+	}
+	logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
+	warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc)
+	_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644)
+	for _, idx := range selected {
+		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
+	}
+	// A warmup failure is only noted; the steady phase still runs.
+	if warmupErr != nil {
+		for _, idx := range selected {
+			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
+		}
+	}
+
+	// Snapshot throttle counters before steady.
+	beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
+	for _, idx := range selected {
+		beforeThrottle[idx], _ = queryThrottleCounters(idx)
+	}
+
+	// Steady: all GPUs simultaneously.
+	steadyCmd := []string{
+		"bee-gpu-burn",
+		"--seconds", strconv.Itoa(spec.SteadySec),
+		"--size-mb", strconv.Itoa(opts.SizeMB),
+		"--devices", allDevices,
+	}
+	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
+
+	// Sample server power via IPMI in parallel with steady phase.
+	// The sampler waits 15s, then reads every 5s. Closing ipmiStopCh makes it
+	// send the mean of its samples (if any) and exit; ipmiResultCh is closed
+	// on exit, so a receive yields ok=false when no sample was taken.
+	ipmiStopCh := make(chan struct{})
+	ipmiResultCh := make(chan float64, 1)
+	go func() {
+		defer close(ipmiResultCh)
+		var samples []float64
+		ticker := time.NewTicker(5 * time.Second)
+		defer ticker.Stop()
+		select {
+		case <-ipmiStopCh:
+			return
+		case <-time.After(15 * time.Second):
+		}
+		for {
+			if w, err := queryIPMIServerPowerW(); err == nil {
+				samples = append(samples, w)
+			}
+			select {
+			case <-ipmiStopCh:
+				if len(samples) > 0 {
+					var sum float64
+					for _, w := range samples {
+						sum += w
+					}
+					ipmiResultCh <- sum / float64(len(samples))
+				}
+				return
+			case <-ticker.C:
+			}
+		}
+	}()
+
+	steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc)
+	close(ipmiStopCh)
+	// Blocks until the sampler goroutine has either sent a mean or closed the channel.
+	if loadedW, ok := <-ipmiResultCh; ok {
+		*serverLoadedWSum += loadedW
+		(*serverLoadedSamples)++
+		*serverLoadedOK = true
+		logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
+	}
+	_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
+
+	afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
+	for _, idx := range selected {
+		afterThrottle[idx], _ = queryThrottleCounters(idx)
+	}
+
+	// Split the combined steady output into one parse result per GPU.
+	parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
+
+	for _, idx := range selected {
+		perGPU := filterRowsByGPU(steadyRows, idx)
+		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
+		gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
+		gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
+
+		if pr, ok := parseResults[idx]; ok {
+			gpuResults[idx].ComputeCapability = pr.ComputeCapability
+			gpuResults[idx].Backend = pr.Backend
+			gpuResults[idx].PrecisionResults = pr.Profiles
+			if pr.Fallback {
+				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
+			}
+		}
+		if steadyErr != nil {
+			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
+		}
+	}
+
+	// Cooldown: all GPUs together.
+	cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
+	if err != nil && err != context.Canceled {
+		for _, idx := range selected {
+			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
+		}
+	}
+	for _, idx := range selected {
+		perGPU := filterRowsByGPU(cooldownRows, idx)
+		gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
+		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
+	}
+
+	// Score and finalize each GPU.
+	for _, idx := range selected {
+		r := gpuResults[idx]
+		r.Scores = scoreBenchmarkGPUResult(*r)
+		r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
+		// pr is the zero value when this GPU had no parsable steady output.
+		pr := parseResults[idx]
+		// The steady command is shared, so a steady error sets the status of every GPU.
+		switch {
+		case steadyErr != nil:
+			r.Status = classifySATErrorStatus(steadyOut, steadyErr)
+		case pr.Fallback:
+			r.Status = "PARTIAL"
+		default:
+			r.Status = "OK"
+		}
+		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
+	}
+}
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -2,24 +2,73 @@ package platform

 import (
 	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
 	"strings"
 	"time"
 )

 func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
-	var b strings.Builder
-	fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
-	fmt.Fprintf(&b, "===========================\n\n")
-	fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
-	fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
-	fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
-	fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
-	fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
-	fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
+	return renderBenchmarkReportWithCharts(result, nil) // nil charts: the "Steady-State Charts" section is omitted
+}

+type benchmarkReportChart struct {
+	Title   string // rendered as a "### " heading above the chart
+	Content string // chart text; ANSI colour codes are stripped and whitespace trimmed at render time
+}
+
+var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`) // ESC "[" digits/semicolons "m", e.g. "\x1b[31m" or "\x1b[0m"
+
+func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
+	var b strings.Builder
+
+	// ── Header ────────────────────────────────────────────────────────────────
+	b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
+
+	// System identity block
+	if result.ServerModel != "" {
+		fmt.Fprintf(&b, "**Server:** %s  \n", result.ServerModel)
+	}
+	if result.Hostname != "" {
+		fmt.Fprintf(&b, "**Host:** %s  \n", result.Hostname)
+	}
+	// GPU models summary
+	if len(result.GPUs) > 0 {
+		modelCount := make(map[string]int)
+		var modelOrder []string
+		for _, g := range result.GPUs {
+			m := strings.TrimSpace(g.Name)
+			if m == "" {
+				m = "Unknown GPU"
+			}
+			if modelCount[m] == 0 {
+				modelOrder = append(modelOrder, m)
+			}
+			modelCount[m]++
+		}
+		var parts []string
+		for _, m := range modelOrder {
+			if modelCount[m] == 1 {
+				parts = append(parts, m)
+			} else {
+				parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
+			}
+		}
+		fmt.Fprintf(&b, "**GPU(s):** %s  \n", strings.Join(parts, ", "))
+	}
+	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "**App version:** %s  \n", result.BenchmarkVersion)
+	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
+	if result.ParallelGPUs {
+		fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously)  \n")
+	}
+	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
+	b.WriteString("\n")
+
+	// ── Executive Summary ─────────────────────────────────────────────────────
 	if len(result.Findings) > 0 {
-		fmt.Fprintf(&b, "Executive Summary\n")
-		fmt.Fprintf(&b, "-----------------\n")
+		b.WriteString("## Executive Summary\n\n")
 		for _, finding := range result.Findings {
 			fmt.Fprintf(&b, "- %s\n", finding)
 		}
@@ -27,96 +76,250 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
 	}

 	if len(result.Warnings) > 0 {
-		fmt.Fprintf(&b, "Warnings\n")
-		fmt.Fprintf(&b, "--------\n")
+		b.WriteString("## Warnings\n\n")
 		for _, warning := range result.Warnings {
 			fmt.Fprintf(&b, "- %s\n", warning)
 		}
 		b.WriteString("\n")
 	}

-	fmt.Fprintf(&b, "Per GPU Scorecard\n")
-	fmt.Fprintf(&b, "-----------------\n")
+	// ── Scorecard table ───────────────────────────────────────────────────────
+	b.WriteString("## Scorecard\n\n")
+	b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
+	b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
 	for _, gpu := range result.GPUs {
-		fmt.Fprintf(&b, "GPU %d  %s\n", gpu.Index, gpu.Name)
-		fmt.Fprintf(&b, "  Status: %s\n", gpu.Status)
-		fmt.Fprintf(&b, "  Composite score: %.2f\n", gpu.Scores.CompositeScore)
-		fmt.Fprintf(&b, "  Compute score: %.2f\n", gpu.Scores.ComputeScore)
-		fmt.Fprintf(&b, "  Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
-		fmt.Fprintf(&b, "  Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
-		fmt.Fprintf(&b, "  Stability: %.1f\n", gpu.Scores.StabilityScore)
+		name := strings.TrimSpace(gpu.Name)
+		if name == "" {
+			name = "Unknown GPU"
+		}
+		interconnect := "-"
 		if gpu.Scores.InterconnectScore > 0 {
-			fmt.Fprintf(&b, "  Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
+			interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
 		}
-		if len(gpu.DegradationReasons) > 0 {
-			fmt.Fprintf(&b, "  Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
+		topsPerSM := "-"
+		if gpu.Scores.TOPSPerSMPerGHz > 0 {
+			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
 		}
-		fmt.Fprintf(&b, "  Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
-		fmt.Fprintf(&b, "  P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
-		if len(gpu.PrecisionResults) > 0 {
-			fmt.Fprintf(&b, "  Precision results:\n")
-			for _, precision := range gpu.PrecisionResults {
-				if precision.Supported {
-					fmt.Fprintf(&b, "    - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
-				} else {
-					fmt.Fprintf(&b, "    - %s: unsupported (%s)\n", precision.Name, precision.Notes)
-				}
-			}
-		}
-		fmt.Fprintf(&b, "  Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
-			gpu.Throttle.SWPowerCapUS,
-			gpu.Throttle.SWThermalSlowdownUS,
-			gpu.Throttle.SyncBoostUS,
-			gpu.Throttle.HWThermalSlowdownUS,
-			gpu.Throttle.HWPowerBrakeSlowdownUS,
+		fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
+			gpu.Index, name,
+			gpu.Status,
+			gpu.Scores.CompositeScore,
+			gpu.Scores.ComputeScore,
+			topsPerSM,
+			gpu.Scores.PowerSustainScore,
+			gpu.Scores.ThermalSustainScore,
+			gpu.Scores.StabilityScore,
+			interconnect,
 		)
-		if len(gpu.Notes) > 0 {
-			fmt.Fprintf(&b, "  Notes:\n")
-			for _, note := range gpu.Notes {
-				fmt.Fprintf(&b, "    - %s\n", note)
-			}
+	}
+	b.WriteString("\n")
+
+	// ── Per GPU detail ────────────────────────────────────────────────────────
+	b.WriteString("## Per-GPU Details\n\n")
+	for _, gpu := range result.GPUs {
+		name := strings.TrimSpace(gpu.Name)
+		if name == "" {
+			name = "Unknown GPU"
+		}
+		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
+
+		// Identity
+		if gpu.BusID != "" {
+			fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
+		}
+		if gpu.VBIOS != "" {
+			fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
+		}
+		if gpu.ComputeCapability != "" {
+			fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
+		}
+		if gpu.MultiprocessorCount > 0 {
+			fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
+		}
+		if gpu.PowerLimitW > 0 {
+			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
+		}
+		if gpu.LockedGraphicsClockMHz > 0 {
+			fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
 		}
 		b.WriteString("\n")
+
+		// Steady-state telemetry
+		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
+		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
+		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
+		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
+		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
+		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
+		b.WriteString("\n")
+
+		// Throttle
+		throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
+		if throttle != "none" {
+			fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
+		}
+
+		// Precision results
+		if len(gpu.PrecisionResults) > 0 {
+			b.WriteString("**Precision results:**\n\n")
+			b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
+			for _, p := range gpu.PrecisionResults {
+				if p.Supported {
+					fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
+				} else {
+					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
+				}
+			}
+			b.WriteString("\n")
+		}
+
+		// Degradation / Notes
+		if len(gpu.DegradationReasons) > 0 {
+			fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
+		}
+		if len(gpu.Notes) > 0 {
+			b.WriteString("**Notes:**\n\n")
+			for _, note := range gpu.Notes {
+				fmt.Fprintf(&b, "- %s\n", note)
+			}
+			b.WriteString("\n")
+		}
 	}

+	// ── Interconnect ──────────────────────────────────────────────────────────
 	if result.Interconnect != nil {
-		fmt.Fprintf(&b, "Interconnect\n")
-		fmt.Fprintf(&b, "------------\n")
-		fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
+		b.WriteString("## Interconnect (NCCL)\n\n")
+		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
 		if result.Interconnect.Supported {
-			fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
-			fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
+			fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
+			fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString("\n")
 		}
 		for _, note := range result.Interconnect.Notes {
 			fmt.Fprintf(&b, "- %s\n", note)
 		}
-		b.WriteString("\n")
+		if len(result.Interconnect.Notes) > 0 {
+			b.WriteString("\n")
+		}
 	}

-	fmt.Fprintf(&b, "Methodology\n")
-	fmt.Fprintf(&b, "-----------\n")
-	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
-	fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
-	fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
-	fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
+	// ── Server Power (IPMI) ───────────────────────────────────────────────────
+	if sp := result.ServerPower; sp != nil {
+		b.WriteString("## Server Power (IPMI)\n\n")
+		if !sp.Available {
+			b.WriteString("IPMI power measurement unavailable.\n\n")
+		} else {
+			b.WriteString("| | Value |\n|---|---|\n")
+			fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
+			fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
+			fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
+			fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
+			if sp.ReportingRatio > 0 {
+				fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
+			}
+			b.WriteString("\n")
+		}
+		for _, note := range sp.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		if len(sp.Notes) > 0 {
+			b.WriteString("\n")
+		}
+	}

-	fmt.Fprintf(&b, "Raw Files\n")
-	fmt.Fprintf(&b, "---------\n")
-	fmt.Fprintf(&b, "- result.json\n")
-	fmt.Fprintf(&b, "- report.txt\n")
-	fmt.Fprintf(&b, "- summary.txt\n")
-	fmt.Fprintf(&b, "- verbose.log\n")
-	fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
-	fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
-	fmt.Fprintf(&b, "- gpu-*-steady.log\n")
-	fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
-	fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
+	// ── Terminal charts (steady-state only) ───────────────────────────────────
+	if len(charts) > 0 {
+		b.WriteString("## Steady-State Charts\n\n")
+		for _, chart := range charts {
+			content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
+			if content == "" {
+				continue
+			}
+			fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
+		}
+	}
+
+	// ── Methodology ───────────────────────────────────────────────────────────
+	b.WriteString("## Methodology\n\n")
+	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
+	b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
+	b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
+	b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
+
+	// ── Raw files ─────────────────────────────────────────────────────────────
+	b.WriteString("## Raw Files\n\n")
+	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
+	b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
+	b.WriteString("- `gpu-*-warmup.log`\n")
+	b.WriteString("- `gpu-*-steady.log`\n")
+	b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
+	b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
 	if result.Interconnect != nil {
-		fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
+		b.WriteString("- `nccl-all-reduce.log`\n")
 	}
 	return b.String()
 }

+// loadBenchmarkReportCharts reads each selected GPU's steady-state terminal chart
+// from runDir (baseline and cooldown charts are not useful for human review).
+func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
+	var charts []benchmarkReportChart
+	for _, idx := range gpuIndices {
+		path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
+		raw, err := os.ReadFile(path)
+		if err != nil || len(raw) == 0 {
+			continue // unreadable or empty chart file: leave this GPU out of the result
+		}
+		charts = append(charts, benchmarkReportChart{
+			Title:   fmt.Sprintf("GPU %d — Steady State", idx),
+			Content: string(raw),
+		})
+	}
+	return charts // nil when no chart file could be read
+}
+
+func stripANSIEscapeSequences(raw string) string {
+	return ansiEscapePattern.ReplaceAllString(raw, "") // deletes every ansiEscapePattern match; all other bytes are kept
+}
+
+// formatThrottleLine renders the non-zero throttle counters as "label=pct% (Ns)"
+// of the steady-state window, joined by two spaces, or "none" if all are zero.
+// With an unknown steady duration (<= 0) it prints ms below 1 s, else seconds.
+func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
+	type counter struct {
+		label string
+		us    uint64
+	}
+	counters := []counter{
+		{"sw_power", t.SWPowerCapUS},
+		{"sw_thermal", t.SWThermalSlowdownUS},
+		{"sync_boost", t.SyncBoostUS},
+		{"hw_thermal", t.HWThermalSlowdownUS},
+		{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
+	}
+	var parts []string
+	for _, c := range counters {
+		if c.us == 0 {
+			continue // only non-zero counters are reported
+		}
+		sec := float64(c.us) / 1e6 // microseconds → seconds
+		if steadyDurationSec > 0 {
+			pct := sec / steadyDurationSec * 100
+			parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
+		} else if sec < 1 {
+			parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
+		} else {
+			parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
+		}
+	}
+	if len(parts) == 0 {
+		return "none" // sentinel the report renderer compares against to skip the Throttle line
+	}
+	return strings.Join(parts, "  ")
+}
+
 func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
 	var b strings.Builder
 	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -137,11 +137,44 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
 	for _, needle := range []string{
 		"Executive Summary",
 		"GPU 0 spent measurable time under SW power cap.",
-		"Composite score: 1176.00",
-		"fp16_tensor: 700.00 TOPS",
+		"1176.00",
+		"fp16_tensor",
+		"700.00",
 	} {
 		if !strings.Contains(report, needle) {
 			t.Fatalf("report missing %q\n%s", needle, report)
 		}
 	}
 }
+
+func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
+	t.Parallel()
+
+	report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
+		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
+		OverallStatus:      "OK",
+		SelectedGPUIndices: []int{0},
+		Normalization: BenchmarkNormalization{
+			Status: "full",
+		},
+	}, []benchmarkReportChart{
+		{
+			Title:   "GPU 0 Steady State",
+			Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───", // one colour-on and one reset sequence
+		},
+	})
+
+	for _, needle := range []string{
+		"Steady-State Charts",
+		"GPU 0 Steady State",
+		"GPU 0 chart",
+		"42┤───",
+	} {
+		if !strings.Contains(report, needle) {
+			t.Fatalf("report missing %q\n%s", needle, report)
+		}
+	}
+	if strings.Contains(report, "\x1b") { // any ESC byte fails, so a leftover "\x1b[0m" reset is caught too
+		t.Fatalf("report should not contain ANSI escapes\n%s", report)
+	}
+}
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -14,13 +14,17 @@ type NvidiaBenchmarkOptions struct {
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	RunNCCL           bool
+	ParallelGPUs      bool // run all selected GPUs simultaneously instead of sequentially
 }

+
 type NvidiaBenchmarkResult struct {
 	BenchmarkVersion   string                       `json:"benchmark_version"`
 	GeneratedAt        time.Time                    `json:"generated_at"`
 	Hostname           string                       `json:"hostname,omitempty"`
+	ServerModel        string                       `json:"server_model,omitempty"`
 	BenchmarkProfile   string                       `json:"benchmark_profile"`
+	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
 	OverallStatus      string                       `json:"overall_status"`
 	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
 	Findings           []string                     `json:"findings,omitempty"`
@@ -28,6 +32,7 @@ type NvidiaBenchmarkResult struct {
 	Normalization      BenchmarkNormalization       `json:"normalization"`
 	GPUs               []BenchmarkGPUResult         `json:"gpus"`
 	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
 }

 type BenchmarkNormalization struct {
@@ -56,7 +61,10 @@ type BenchmarkGPUResult struct {
 	Backend                string                     `json:"backend,omitempty"`
 	Status                 string                     `json:"status"`
 	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
+	MultiprocessorCount    int                        `json:"multiprocessor_count,omitempty"`
+	DefaultPowerLimitW     float64                    `json:"default_power_limit_w,omitempty"`
 	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
+	BaseGraphicsClockMHz   float64                    `json:"base_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
 	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
 	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
@@ -117,6 +125,24 @@ type BenchmarkScorecard struct {
 	StabilityScore      float64 `json:"stability_score"`
 	InterconnectScore   float64 `json:"interconnect_score"`
 	CompositeScore      float64 `json:"composite_score"`
+	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
+	// Comparable across throttle levels and GPU generations. Low value at normal
+	// clocks indicates silicon degradation.
+	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
+}
+
+// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
+// power. reporting_ratio is delta_w / gpu_reported_sum_w: a value near 1.0 means
+// GPU power telemetry is accurate; a value well below 1.0 (e.g. 0.5) means the
+// GPUs are over-reporting their power consumption.
+type BenchmarkServerPower struct {
+	Available       bool     `json:"available"` // false: the report prints "IPMI power measurement unavailable." instead of the table
+	IdleW           float64  `json:"idle_w,omitempty"`
+	LoadedW         float64  `json:"loaded_w,omitempty"`
+	DeltaW          float64  `json:"delta_w,omitempty"`
+	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
+	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
+	Notes           []string `json:"notes,omitempty"`
 }

 type BenchmarkInterconnectResult struct {
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -383,10 +383,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 }

 const (
-	ansiRed    = "\033[31m"
-	ansiBlue   = "\033[34m"
-	ansiGreen  = "\033[32m"
-	ansiYellow = "\033[33m"
+	ansiAmber  = "\033[38;5;214m"
 	ansiReset  = "\033[0m"
 )

@@ -415,10 +412,10 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
 		fn      func(GPUMetricRow) float64
 	}
 	defs := []seriesDef{
-		{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
-		{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
-		{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
-		{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
+		{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
+		{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
+		{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
+		{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
 	}

 	var b strings.Builder
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -116,25 +116,47 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
 	if err := ctx.Err(); err != nil {
 		return err
 	}
-	if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
-		log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
+
+	mediumRebound := false
+	if err := bindMount(dstDir, "/run/live/medium"); err != nil {
+		log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
+	} else {
+		mediumRebound = true
 	}

 	log("Verifying live medium now served from RAM...")
 	status := s.LiveBootSource()
-	if err := verifyInstallToRAMStatus(status); err != nil {
+	if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
 		return err
 	}
-	log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
-	log("Done. Installation media can be safely disconnected.")
+	if status.InRAM {
+		log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
+	}
+	log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
 	return nil
 }

-func verifyInstallToRAMStatus(status LiveBootSource) error {
+func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
 	if status.InRAM {
 		return nil
 	}
-	return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
+
+	// The live medium mount was not redirected to RAM. This is expected when
+	// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
+	// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
+	// because the CD-ROM mount is in use. Check whether files were at least
+	// copied to the tmpfs directory — that is sufficient for safe disconnection
+	// once the kernel has paged in all actively-used data.
+	files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs")) // a Glob error (malformed pattern) counts as "no squashfs found"
+	if len(files) > 0 {
+		if !mediumRebound && log != nil { // log may be nil (the unit test passes nil): skip the advisory instead of panicking
+			log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
+			log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
+		}
+		return nil
+	}
+
+	return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
 }

 func describeLiveBootSource(status LiveBootSource) string {
@@ -247,7 +269,31 @@ func findLoopForFile(backingFile string) (string, error) {
 	return "", fmt.Errorf("no loop device found for %s", backingFile)
 }

+// loopDeviceOffset returns the byte offset configured for the loop device,
+// or -1 if it cannot be determined.
+func loopDeviceOffset(loopDev string) int64 {
+	out, err := exec.Command("losetup", "--list", "--json", loopDev).Output() // --json only formats --list output, so request it explicitly
+	if err != nil {
+		return -1
+	}
+	var result struct {
+		Loopdevices []struct {
+			Offset int64 `json:"offset"`
+		} `json:"loopdevices"`
+	}
+	if err := json.Unmarshal(out, &result); err != nil || len(result.Loopdevices) == 0 {
+		return -1 // non-JSON output, a non-numeric offset, or no device listed
+	}
+	return result.Loopdevices[0].Offset
+}
+
 func reassociateLoopDevice(loopDev, newFile string) error {
+	// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
+	// typically set up with a non-zero offset (squashfs lives inside the ISO),
+	// so the ioctl returns EINVAL. Detect this early for a clear error message.
+	if off := loopDeviceOffset(loopDev); off > 0 {
+		return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
+	}
 	if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
 		return nil
 	}
--- a/audit/internal/platform/install_to_ram_linux.go
+++ b/audit/internal/platform/install_to_ram_linux.go
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
 	}
 	return nil
 }
+
+// bindMount bind-mounts src onto dst via the mount syscall with MS_BIND (avoids exec PATH issues).
+func bindMount(src, dst string) error {
+	return syscall.Mount(src, dst, "", syscall.MS_BIND, "") // fstype and data are left empty; only MS_BIND is set
+}
--- a/audit/internal/platform/install_to_ram_other.go
+++ b/audit/internal/platform/install_to_ram_other.go
@@ -7,3 +7,7 @@ import "errors"
 func loopChangeFD(loopDev, newFile string) error {
 	return errors.New("LOOP_CHANGE_FD not available on this platform")
 }
+
+func bindMount(src, dst string) error {
+	return errors.New("bind mount not available on this platform") // stub: always fails; src and dst are unused
+}
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -33,14 +33,17 @@ func TestInferLiveBootKind(t *testing.T) {
 func TestVerifyInstallToRAMStatus(t *testing.T) {
 	t.Parallel()

-	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
+	dstDir := t.TempDir() // fresh empty directory, so the *.squashfs fallback cannot match
+
+	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
 		t.Fatalf("expected success for RAM-backed status, got %v", err)
 	}
-	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
+
+	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
 	if err == nil {
 		t.Fatal("expected verification failure when media is still on USB")
 	}
-	if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
+	if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
 		t.Fatalf("error=%q", got)
 	}
 }
--- a/audit/internal/platform/kill_workers.go
+++ b/audit/internal/platform/kill_workers.go
@@ -15,6 +15,10 @@ var workerPatterns = []string{
 	"stress-ng",
 	"stressapptest",
 	"memtester",
+	// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
+	// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
+	"nvvs",
+	"dcgmi",
 }

 // KilledProcess describes a process that was sent SIGKILL.
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
 		return "", err
 	}

-	return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
+	return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
 		job,
-		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func nvidiaStressArchivePrefix(loader string) string {
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 			"--seconds", strconv.Itoa(opts.DurationSec),
 			"--size-mb", strconv.Itoa(opts.SizeMB),
 		}
+		if opts.StaggerSeconds > 0 && len(selected) > 1 {
+			cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
+		}
 		if len(selected) > 0 {
 			cmd = append(cmd, "--devices", joinIndexList(selected))
 		}
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 			"bee-john-gpu-stress",
 			"--seconds", strconv.Itoa(opts.DurationSec),
 		}
+		if opts.StaggerSeconds > 0 && len(selected) > 1 {
+			cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
+		}
 		if len(selected) > 0 {
 			cmd = append(cmd, "--devices", joinIndexList(selected))
 		}
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
 			wg.Add(1)
 			go func() {
 				defer wg.Done()
-				gpuCmd := buildGPUStressCmd(loadCtx, vendor)
+				gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
 				if gpuCmd == nil {
 					return
 				}
@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
 	}
 	_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)

-	// Pack tar.gz
-	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
-	if err := packPlatformDir(runDir, archivePath); err != nil {
-		return "", fmt.Errorf("pack archive: %w", err)
-	}
-	_ = os.RemoveAll(runDir)
-	return archivePath, nil
+	return runDir, nil
 }

 // collectPhase samples live metrics every second until ctx is done.
@@ -392,6 +386,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
 		cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
 	}
 	cmd := exec.CommandContext(ctx, path, cmdArgs...)
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	if err := startLowPriorityCmd(cmd, 15); err != nil {
@@ -402,28 +403,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {

 // buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
 // Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
-func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
+func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
 	switch strings.ToLower(vendor) {
 	case "amd":
-		return buildAMDGPUStressCmd(ctx)
+		return buildAMDGPUStressCmd(ctx, durSec)
 	case "nvidia":
-		return buildNvidiaGPUStressCmd(ctx)
+		return buildNvidiaGPUStressCmd(ctx, durSec)
 	}
 	return nil
 }

-func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	rvsArgs, err := resolveRVSCommand()
 	if err != nil {
 		return nil
 	}
 	rvsPath := rvsArgs[0]
-	cfg := `actions:
+	cfg := fmt.Sprintf(`actions:
 - name: gst_platform
  device: all
  module: gst
  parallel: true
-  duration: 86400000
+  duration: %d`, durSec*1000) + `
  copy_matrix: false
  target_stress: 90
  matrix_size_a: 8640
@@ -433,13 +434,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
 	cfgFile := "/tmp/bee-platform-gst.conf"
 	_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
 	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	_ = startLowPriorityCmd(cmd, 10)
 	return cmd
 }

-func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	path, err := satLookPath("bee-gpu-burn")
 	if err != nil {
 		path, err = satLookPath("bee-gpu-stress")
@@ -447,7 +455,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
 	if err != nil {
 		return nil
 	}
-	cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
+	// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
+	// Process group kill via Setpgid+Cancel is kept as a safety net for cases
+	// where the context is cancelled early (user stop, parent timeout).
+	cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	_ = startLowPriorityCmd(cmd, 10)
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -1,6 +1,7 @@
 package platform

 import (
+	"bufio"
 	"os"
 	"os/exec"
 	"strings"
@@ -114,6 +115,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
 	}

 	s.collectGPURuntimeHealth(vendor, &health)
+	s.collectToRAMHealth(&health)
+	s.collectUSBExportHealth(&health)

 	if health.Status != "FAILED" && len(health.Issues) > 0 {
 		health.Status = "PARTIAL"
@@ -168,11 +171,111 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
 	return ToolStatus{Name: display}
 }

+// collectToRAMHealth fills health.ToRAMStatus from two probes, s.IsLiveMediaInRAM
+// and toramActive:
+//   - "ok": the live medium is reported to be in RAM;
+//   - "failed": toram is active but the medium is not in RAM (copy failed or still
+//     in progress); a warning-severity "toram_copy_failed" issue is appended too;
+//   - "warning": toram is not active, so no copy was attempted.
+func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
+	mediumInRAM := s.IsLiveMediaInRAM()
+	toramRequested := toramActive()
+
+	if mediumInRAM {
+		health.ToRAMStatus = "ok"
+		return
+	}
+	if !toramRequested {
+		health.ToRAMStatus = "warning"
+		return
+	}
+
+	// toram was requested, yet the medium is not (or no longer) served from RAM.
+	health.ToRAMStatus = "failed"
+	issue := schema.RuntimeIssue{
+		Code:        "toram_copy_failed",
+		Severity:    "warning",
+		Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
+	}
+	health.Issues = append(health.Issues, issue)
+}
+
+// collectUSBExportHealth stores the result of findUSBExportMount in
+// health.USBExportPath: the mount point of the first writable USB-backed
+// filesystem listed in /proc/mounts, or "" when there is none.
+func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
+	health.USBExportPath = findUSBExportMount()
+}
+
+// findUSBExportMount returns the mount point of the first writable USB filesystem
+// found in /proc/mounts. A mount qualifies when its type is vfat, exfat, ext2/3/4,
+// ntfs, ntfs3 or fuseblk, its options do not include "ro", its device is a /dev/
+// node and blockDeviceTransport reports "usb" for that device. Returns "" if none
+// is found or /proc/mounts cannot be opened.
+func findUSBExportMount() string {
+	f, err := os.Open("/proc/mounts")
+	if err != nil {
+		return ""
+	}
+	defer f.Close()
+
+	// The kernel writes space, tab, newline and backslash inside a mount path as
+	// octal escapes (\040, \011, \012, \134). Decode them so that a drive mounted
+	// at e.g. "/media/MY DRIVE" is returned as a path callers can actually use.
+	unescape := strings.NewReplacer(`\040`, " ", `\011`, "\t", `\012`, "\n", `\134`, `\`)
+
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		// fields: device mountpoint fstype options dump pass
+		fields := strings.Fields(scanner.Text())
+		if len(fields) < 4 {
+			continue
+		}
+		device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
+
+		// Only filesystem types that are expected on USB export drives.
+		switch strings.ToLower(fsType) {
+		case "vfat", "exfat", "ext2", "ext3", "ext4", "ntfs", "ntfs3", "fuseblk":
+		default:
+			continue
+		}
+
+		// Skip read-only mounts.
+		readOnly := false
+		for _, opt := range strings.Split(options, ",") {
+			if strings.TrimSpace(opt) == "ro" {
+				readOnly = true
+				break
+			}
+		}
+		if readOnly {
+			continue
+		}
+
+		// Only /dev/ nodes are passed on to the transport lookup.
+		if !strings.HasPrefix(device, "/dev/") {
+			continue
+		}
+		if blockDeviceTransport(device) == "usb" {
+			return unescape.Replace(mountPoint)
+		}
+	}
+	// A scanner error only ends the search early; the answer is still "not found".
+	return ""
+}
+
 func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
 	lsmodText := commandText("lsmod")

 	switch vendor {
 	case "nvidia":
+		if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
+			health.NvidiaGSPMode = strings.TrimSpace(string(raw))
+			if health.NvidiaGSPMode == "gsp-stuck" {
+				health.Issues = append(health.Issues, schema.RuntimeIssue{
+					Code:        "nvidia_gsp_stuck",
+					Severity:    "critical",
+					Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
+				})
+			} else if health.NvidiaGSPMode == "gsp-off" {
+				health.Issues = append(health.Issues, schema.RuntimeIssue{
+					Code:        "nvidia_gsp_disabled",
+					Severity:    "warning",
+					Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
+				})
+			}
+		}
 		health.DriverReady = strings.Contains(lsmodText, "nvidia ")
 		if !health.DriverReady {
 			health.Issues = append(health.Issues, schema.RuntimeIssue{
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -88,6 +88,37 @@ type NvidiaGPU struct {
 	MemoryMB int    `json:"memory_mb"`
 }

+// NvidiaGPUStatus describes one GPU row produced by ListNvidiaGPUStatuses.
+type NvidiaGPUStatus struct {
+	// Index, Name, BDF and Serial come from the first four CSV columns of the
+	// nvidia-smi query (index, name, pci.bus_id, serial); BDF is passed through
+	// normalizeNvidiaBusID.
+	Index        int    `json:"index"`
+	Name         string `json:"name"`
+	BDF          string `json:"bdf,omitempty"`
+	Serial       string `json:"serial,omitempty"`
+	// Status is "OK", "RESET_REQUIRED" or, for a row that could not be parsed, "UNKNOWN".
+	Status       string `json:"status"`
+	// RawLine is the trimmed nvidia-smi output line the entry was built from.
+	RawLine      string `json:"raw_line,omitempty"`
+	// NeedsReset is true when the line contains "GPU REQUIRES RESET" (case-insensitive).
+	NeedsReset   bool   `json:"needs_reset"`
+	// ParseFailure marks rows with fewer than four columns or a non-numeric index.
+	ParseFailure bool   `json:"parse_failure,omitempty"`
+}
+
+// nvidiaGPUHealth is one row of the nvidia-smi health query as parsed by
+// parseNvidiaGPUHealth.
+type nvidiaGPUHealth struct {
+	Index        int    // GPU index from the first CSV column
+	Name         string // GPU name from the second CSV column
+	NeedsReset   bool   // line contains "GPU REQUIRES RESET" (case-insensitive)
+	RawLine      string // trimmed source line
+	ParseFailure bool   // line had fewer than two columns or a non-numeric index
+}
+
+// nvidiaGPUStatusFile accumulates the per-GPU result of an acceptance pack; one
+// gpu-<index>-status.txt file is rendered from each entry by
+// writeNvidiaGPUStatusFiles.
+type nvidiaGPUStatusFile struct {
+	Index       int
+	Name        string
+	// RunStatus is the most severe job status recorded for the GPU; FailingJob and
+	// Reason name the job that set it and the first line of that job's output.
+	RunStatus   string
+	Reason      string
+	// Health is "OK", "RESET_REQUIRED" or "UNKNOWN"; HealthRaw is the nvidia-smi
+	// line it was derived from.
+	Health      string
+	HealthRaw   string
+	// Observed: the GPU appeared in the final nvidia-smi health query.
+	// Selected: the GPU was listed in a job's gpuIndices.
+	Observed    bool
+	Selected    bool
+	FailingJob  string
+}
+
 // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
 type AMDGPUInfo struct {
 	Index int    `json:"index"`
@@ -269,6 +300,72 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
 	return gpus, nil
 }

+// ListNvidiaGPUStatuses runs an nvidia-smi CSV query and returns one entry per
+// non-empty output line, sorted by GPU index.
+//
+// A line with at least four columns and a numeric index yields an entry with
+// Status "OK", or "RESET_REQUIRED" when the line contains "GPU REQUIRES RESET"
+// (case-insensitive). Any other line yields an entry holding only the raw line,
+// Status "UNKNOWN" and ParseFailure set. An error is returned only when the
+// nvidia-smi command itself fails.
+func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
+	const query = "--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total"
+	raw, err := satExecCommand("nvidia-smi", query, "--format=csv,noheader,nounits").Output()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia-smi: %w", err)
+	}
+
+	var statuses []NvidiaGPUStatus
+	for _, row := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
+		row = strings.TrimSpace(row)
+		if row == "" {
+			continue
+		}
+		// Start from the parse-failure shape and upgrade it once the row parses.
+		entry := NvidiaGPUStatus{RawLine: row, Status: "UNKNOWN", ParseFailure: true}
+		if cols := strings.Split(row, ","); len(cols) >= 4 {
+			if index, convErr := strconv.Atoi(strings.TrimSpace(cols[0])); convErr == nil {
+				resetNeeded := strings.Contains(strings.ToUpper(row), "GPU REQUIRES RESET")
+				entry = NvidiaGPUStatus{
+					Index:      index,
+					Name:       strings.TrimSpace(cols[1]),
+					BDF:        normalizeNvidiaBusID(strings.TrimSpace(cols[2])),
+					Serial:     strings.TrimSpace(cols[3]),
+					Status:     "OK",
+					RawLine:    row,
+					NeedsReset: resetNeeded,
+				}
+				if resetNeeded {
+					entry.Status = "RESET_REQUIRED"
+				}
+			}
+		}
+		statuses = append(statuses, entry)
+	}
+	sort.Slice(statuses, func(a, b int) bool { return statuses[a].Index < statuses[b].Index })
+	return statuses, nil
+}
+
+// normalizeNvidiaBusID lower-cases and trims a PCI bus ID. When the ID has three
+// colon-separated fields and the first one is longer than four characters, only
+// the last four characters of that field are kept (e.g. "00000000:17:00.0"
+// becomes "0000:17:00.0"). Any other input is returned lower-cased and trimmed.
+func normalizeNvidiaBusID(v string) string {
+	id := strings.TrimSpace(strings.ToLower(v))
+	fields := strings.Split(id, ":")
+	if len(fields) != 3 {
+		return id
+	}
+	domain := fields[0]
+	if len(domain) <= 4 {
+		return id
+	}
+	return domain[len(domain)-4:] + ":" + fields[1] + ":" + fields[2]
+}
+
+// ResetNvidiaGPU runs "nvidia-smi -r -i <index>" and returns its combined output
+// together with the command's error. A negative index is rejected without running
+// anything. When the command succeeds silently, a fixed confirmation line is
+// returned instead of an empty string.
+func (s *System) ResetNvidiaGPU(index int) (string, error) {
+	if index < 0 {
+		return "", fmt.Errorf("gpu index must be >= 0")
+	}
+	out, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
+	if err == nil && len(out) == 0 {
+		return "GPU reset completed.\n", nil
+	}
+	return string(out), err
+}
+
 // RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
 func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
@@ -278,36 +375,50 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-all-reduce-perf.log", cmd: []string{
+	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
 		}},
-	}, logFunc)
+	), logFunc)
 }

-func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	selected, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
-	profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
-	if err != nil {
-		return "", err
+	var (
+		profCmd []string
+		profEnv []string
+	)
+	if staggerSec > 0 && len(selected) > 1 {
+		profCmd = []string{
+			"bee-dcgmproftester-staggered",
+			"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
+			"--stagger-seconds", strconv.Itoa(staggerSec),
+			"--devices", joinIndexList(selected),
+		}
+	} else {
+		profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
+		if err != nil {
+			return "", err
+		}
+		profEnv = nvidiaVisibleDevicesEnv(selected)
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
-		{
-			name:       "03-dcgmproftester.log",
-			cmd:        profCmd,
-			env:        nvidiaVisibleDevicesEnv(selected),
-			collectGPU: true,
-			gpuIndices: selected,
-		},
-		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
+			satJob{
+				name:       "03-dcgmproftester.log",
+				cmd:        profCmd,
+				env:        profEnv,
+				collectGPU: true,
+				gpuIndices: selected,
+			},
+		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -315,16 +426,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
 	if err != nil {
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{
 			name:       "02-dcgmi-targeted-power.log",
 			cmd:        nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
 			collectGPU: true,
 			gpuIndices: selected,
 		},
-		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -332,16 +443,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
 	if err != nil {
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{
 			name:       "02-dcgmi-pulse-test.log",
 			cmd:        nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
 			collectGPU: true,
 			gpuIndices: selected,
 		},
-		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -349,16 +460,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
 	if err != nil {
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{
 			name:       "02-dcgmi-nvbandwidth.log",
 			cmd:        nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
 			collectGPU: true,
 			gpuIndices: selected,
 		},
-		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
@@ -382,16 +493,23 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
 	if err != nil {
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{
 			name:       "02-dcgmi-targeted-stress.log",
 			cmd:        nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
 			collectGPU: true,
 			gpuIndices: selected,
 		},
-		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
@@ -427,9 +545,13 @@ func memoryStressSizeArg() string {
 	return fmt.Sprintf("%dM", targetMB)
 }

-func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-	sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
-	passes := envInt("BEE_MEMTESTER_PASSES", 1)
+func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
+	if sizeMB <= 0 {
+		sizeMB = 256
+	}
+	if passes <= 0 {
+		passes = 1
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
@@ -486,7 +608,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
 	}, logFunc)
 }

-func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
 	if baseDir == "" {
 		baseDir = "/var/log/bee-sat"
 	}
@@ -518,7 +640,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
 			break
 		}
 		prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
-		commands := storageSATCommands(devPath)
+		commands := storageSATCommands(devPath, extended)
 		for cmdIndex, job := range commands {
 			if ctx.Err() != nil {
 				break
@@ -540,11 +662,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
 	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
 		return "", err
 	}
-	archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", err
-	}
-	return archive, nil
+	return runDir, nil
 }

 type satJob struct {
@@ -561,14 +679,24 @@ type satStats struct {
 	Unsupported int
 }

+// withNvidiaPersistenceMode returns a new job list that runs "nvidia-smi -pm 1"
+// (logged as 00-nvidia-smi-persistence-mode.log) ahead of the given jobs. The
+// argument slice is not modified.
+func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
+	persistence := satJob{
+		name: "00-nvidia-smi-persistence-mode.log",
+		cmd:  []string{"nvidia-smi", "-pm", "1"},
+	}
+	combined := make([]satJob, 1, len(jobs)+1)
+	combined[0] = persistence
+	return append(combined, jobs...)
+}
+
 func nvidiaSATJobs() []satJob {
-	return []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
-		{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
-		{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
-		{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
-	}
+	return withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
+		satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
+		satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
+		satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
+	)
 }

 func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
@@ -583,12 +711,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
 		}
 		diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
 	}
-	return []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
-		{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
-		{name: "04-dcgmi-diag.log", cmd: diagArgs},
-	}
+	return withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
+		satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
+		satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
+	)
 }

 func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
@@ -613,7 +741,10 @@ func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
 	if len(gpuIndices) == 0 {
 		return nil
 	}
-	return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
+	return []string{
+		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
+		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
+	}
 }

 func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
@@ -632,11 +763,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa

 	var summary strings.Builder
 	stats := satStats{}
+	nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
+	perGPU := map[int]*nvidiaGPUStatusFile{}
+	selectedGPUIndices := map[int]struct{}{}
 	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
 	for _, job := range jobs {
 		if ctx.Err() != nil {
 			break
 		}
+		for _, idx := range job.gpuIndices {
+			selectedGPUIndices[idx] = struct{}{}
+			status := perGPU[idx]
+			if status == nil {
+				status = &nvidiaGPUStatusFile{Index: idx}
+				perGPU[idx] = status
+			}
+			status.Selected = true
+		}
 		cmd := make([]string, 0, len(job.cmd))
 		for _, arg := range job.cmd {
 			cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
@@ -645,17 +788,52 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		var out []byte
 		var err error

-		if job.collectGPU {
-			out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
-		} else {
-			out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
+		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
+			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
+				if logFunc != nil {
+					logFunc(msg)
+				}
+				out = []byte(msg + "\n")
+				err = healthErr
+			}
+		}
+
+		if err == nil {
+			if job.collectGPU {
+				out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
+			} else {
+				out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
+			}
+		}
+
+		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
+			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
+				if logFunc != nil {
+					logFunc(msg)
+				}
+				if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
+					out = append(out, '\n')
+				}
+				out = append(out, []byte(msg+"\n")...)
+				if err == nil {
+					err = healthErr
+				}
+			}
 		}

 		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
 			return "", writeErr
 		}
+		if ctx.Err() != nil {
+			return "", ctx.Err()
+		}
 		status, rc := classifySATResult(job.name, out, err)
 		stats.Add(status)
+		if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
+			for _, idx := range job.gpuIndices {
+				updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
+			}
+		}
 		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
 		fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
 		fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
@@ -664,12 +842,204 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
 		return "", err
 	}
-
-	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", err
+	if nvidiaPack {
+		if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
+			return "", err
+		}
 	}
-	return archive, nil
+
+	return runDir, nil
+}
+
+// updateNvidiaGPUStatus folds one job result into the entry for GPU idx, creating
+// the entry if it does not exist yet. RunStatus, FailingJob and Reason (the first
+// line of detail) are replaced whenever the new status is at least as severe as
+// the recorded one, so among equally severe jobs the latest one wins.
+func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
+	current := perGPU[idx]
+	if current == nil {
+		current = &nvidiaGPUStatusFile{Index: idx}
+		perGPU[idx] = current
+	}
+	if nvidiaSATStatusSeverity(status) < nvidiaSATStatusSeverity(current.RunStatus) {
+		return
+	}
+	current.RunStatus = status
+	current.FailingJob = jobName
+	current.Reason = firstLine(detail)
+}
+
+// writeNvidiaGPUStatusFiles writes one gpu-<index>-status.txt file into runDir for
+// every GPU that was selected by a job, received a job result, or shows up in a
+// final nvidia-smi health query.
+//
+// perGPU holds the per-GPU results gathered while the jobs ran and is completed in
+// place; selected lists the GPU indices the jobs were asked to use; overall is the
+// run status given to GPUs that have no status of their own. The first file write
+// error is returned.
+func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
+	// The health query is best effort: if nvidia-smi fails, the files are still
+	// written and report health_status=UNKNOWN.
+	if health, err := readNvidiaGPUHealth(); err == nil {
+		for _, gpu := range health {
+			// An unparseable row has no real index (it carries the zero value);
+			// merging it would overwrite GPU 0's name and mark GPU 0 as healthy.
+			if gpu.ParseFailure {
+				continue
+			}
+			entry := perGPU[gpu.Index]
+			if entry == nil {
+				entry = &nvidiaGPUStatusFile{Index: gpu.Index}
+				perGPU[gpu.Index] = entry
+			}
+			entry.Name = gpu.Name
+			entry.Observed = true
+			entry.HealthRaw = gpu.RawLine
+			if !gpu.NeedsReset {
+				entry.Health = "OK"
+				continue
+			}
+			// A GPU that needs a reset fails the run regardless of job results;
+			// FAILED is the most severe status, so it always replaces RunStatus.
+			entry.Health = "RESET_REQUIRED"
+			entry.RunStatus = "FAILED"
+			if strings.TrimSpace(entry.Reason) == "" {
+				entry.Reason = "GPU requires reset"
+			}
+		}
+	}
+
+	// Make sure every selected GPU gets a file, even if no job reported on it.
+	for idx := range selected {
+		entry := perGPU[idx]
+		if entry == nil {
+			entry = &nvidiaGPUStatusFile{Index: idx}
+			perGPU[idx] = entry
+		}
+		entry.Selected = true
+	}
+
+	// Write the files in ascending index order so output is deterministic.
+	indices := make([]int, 0, len(perGPU))
+	for idx := range perGPU {
+		indices = append(indices, idx)
+	}
+	sort.Ints(indices)
+	for _, idx := range indices {
+		entry := perGPU[idx]
+		if entry.RunStatus == "" {
+			entry.RunStatus = overall
+		}
+		if entry.Health == "" {
+			entry.Health = "UNKNOWN"
+		}
+		if entry.Name == "" {
+			entry.Name = "Unknown GPU"
+		}
+		var body strings.Builder
+		fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
+		fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
+		fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
+		fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
+		fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
+		fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
+		// Optional keys are emitted only when they carry a value.
+		if strings.TrimSpace(entry.FailingJob) != "" {
+			fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
+		}
+		if strings.TrimSpace(entry.Reason) != "" {
+			fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
+		}
+		if strings.TrimSpace(entry.HealthRaw) != "" {
+			fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
+		}
+		statusPath := filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx))
+		if err := os.WriteFile(statusPath, []byte(body.String()), 0644); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// nvidiaSATStatusSeverity ranks a SAT status string so that statuses can be
+// compared: FAILED is 3, PARTIAL and UNSUPPORTED are 2, OK is 1, anything else
+// (including the empty string) is 0. Case and surrounding whitespace are ignored.
+func nvidiaSATStatusSeverity(status string) int {
+	normalized := strings.ToUpper(strings.TrimSpace(status))
+	if normalized == "FAILED" {
+		return 3
+	}
+	if normalized == "PARTIAL" || normalized == "UNSUPPORTED" {
+		return 2
+	}
+	if normalized == "OK" {
+		return 1
+	}
+	return 0
+}
+
+// firstLine returns the first line of s with surrounding whitespace removed.
+// Leading blank lines are skipped because s is trimmed before it is split; an
+// all-whitespace input yields "".
+func firstLine(s string) string {
+	head, _, _ := strings.Cut(strings.TrimSpace(s), "\n")
+	return strings.TrimSpace(head)
+}
+
+// nvidiaJobNeedsHealthCheck reports whether a job should be bracketed by GPU
+// health checks: either it collects GPU metrics, or its (trimmed, lower-cased)
+// name mentions one of the GPU load tools.
+func nvidiaJobNeedsHealthCheck(job satJob) bool {
+	if job.collectGPU {
+		return true
+	}
+	jobName := strings.ToLower(strings.TrimSpace(job.name))
+	for _, marker := range []string{"dcgmi", "gpu-burn", "gpu-stress", "dcgmproftester"} {
+		if strings.Contains(jobName, marker) {
+			return true
+		}
+	}
+	return false
+}
+
+// checkNvidiaJobHealth looks for GPUs that nvidia-smi reports as requiring a reset.
+// When selected is non-empty only those GPU indices are considered; otherwise all
+// reported GPUs are. It returns a multi-line message and a non-nil error if at
+// least one such GPU is found, and ("", nil) otherwise, including when the
+// nvidia-smi query itself fails, which is deliberately not treated as a failure.
+func checkNvidiaJobHealth(selected []int) (string, error) {
+	health, err := readNvidiaGPUHealth()
+	if err != nil {
+		return "", nil
+	}
+	wanted := make(map[int]struct{}, len(selected))
+	for _, idx := range selected {
+		wanted[idx] = struct{}{}
+	}
+
+	var report strings.Builder
+	for _, gpu := range health {
+		if len(wanted) > 0 {
+			if _, ok := wanted[gpu.Index]; !ok {
+				continue
+			}
+		}
+		if !gpu.NeedsReset {
+			continue
+		}
+		// The header is written once, ahead of the first offending GPU.
+		if report.Len() == 0 {
+			report.WriteString("NVIDIA GPU health check failed:")
+		}
+		fmt.Fprintf(&report, "\ngpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine)
+	}
+	if report.Len() == 0 {
+		return "", nil
+	}
+	return report.String(), errors.New("nvidia gpu requires reset")
+}
+
+// readNvidiaGPUHealth runs the nvidia-smi CSV health query and returns its output
+// parsed by parseNvidiaGPUHealth. A failing command is reported as a wrapped
+// "nvidia-smi" error.
+func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
+	const (
+		query  = "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total"
+		format = "--format=csv,noheader,nounits"
+	)
+	raw, err := satExecCommand("nvidia-smi", query, format).Output()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia-smi: %w", err)
+	}
+	return parseNvidiaGPUHealth(string(raw)), nil
+}
+
+// parseNvidiaGPUHealth turns nvidia-smi CSV output into one entry per non-empty
+// line. A line with at least two columns and a numeric index yields its index,
+// name and whether it contains "GPU REQUIRES RESET" (case-insensitive); any other
+// line yields an entry holding only the raw line with ParseFailure set.
+func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
+	var parsed []nvidiaGPUHealth
+	for _, row := range strings.Split(strings.TrimSpace(raw), "\n") {
+		row = strings.TrimSpace(row)
+		if row == "" {
+			continue
+		}
+		// Start from the parse-failure shape and replace it once the row parses.
+		entry := nvidiaGPUHealth{RawLine: row, ParseFailure: true}
+		if cols := strings.Split(row, ","); len(cols) >= 2 {
+			if index, err := strconv.Atoi(strings.TrimSpace(cols[0])); err == nil {
+				entry = nvidiaGPUHealth{
+					Index:      index,
+					Name:       strings.TrimSpace(cols[1]),
+					NeedsReset: strings.Contains(strings.ToUpper(row), "GPU REQUIRES RESET"),
+					RawLine:    row,
+				}
+			}
+		}
+		parsed = append(parsed, entry)
+	}
+	return parsed
 }

 func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
@@ -726,17 +1096,25 @@ func listStorageDevices() ([]string, error) {
 	return parseStorageDevices(string(out)), nil
 }

-func storageSATCommands(devPath string) []satJob {
+// storageSATCommands returns the jobs to run against one storage device.
+//
+// A device whose base name contains "nvme" gets nvme id-ctrl, nvme smart-log
+// and a blocking (--wait) nvme device-self-test with -s 1, or -s 2 when
+// extended is true. Any other device gets a smartctl health/attribute query
+// and a smartctl self-test of type "short", or "long" when extended is true.
+//
+// NOTE(review): the smartctl job keeps the name "smartctl-self-test-short"
+// even when the long test is selected - confirm whether consumers of the job
+// name rely on it before renaming.
+func storageSATCommands(devPath string, extended bool) []satJob {
 	if strings.Contains(filepath.Base(devPath), "nvme") {
+		selfTestLevel := "1"
+		if extended {
+			selfTestLevel = "2"
+		}
 		return []satJob{
 			{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
 			{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
-			{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
+			{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
 		}
 	}
+	smartTestType := "short"
+	if extended {
+		smartTestType = "long"
+	}
 	return []satJob{
 		{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
-		{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
+		{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
 	}
 }

@@ -795,6 +1173,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
 		// nvidia-smi on a machine with no NVIDIA GPU
 		strings.Contains(text, "couldn't communicate with the nvidia driver") ||
 		strings.Contains(text, "no nvidia gpu") ||
+		// Some NVMe firmwares start self-test but never expose progress to nvme-cli
+		// while waiting, so the CLI stops polling without proving device failure.
+		(strings.Contains(name, "self-test") &&
+			strings.Contains(text, "no progress for") &&
+			strings.Contains(text, "stop waiting")) ||
 		(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
 		return "UNSUPPORTED", rc
 	}
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -20,7 +20,7 @@ type FanStressOptions struct {
 	Phase1DurSec int   // first load phase duration in seconds (default 300)
 	PauseSec     int   // pause between the two load phases (default 60)
 	Phase2DurSec int   // second load phase duration in seconds (default 300)
-	SizeMB       int   // GPU memory to allocate per GPU during stress (default 64)
+	SizeMB       int   // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
 	GPUIndices   []int // which GPU indices to stress (empty = all detected)
 }

@@ -223,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
 		return "", err
 	}

-	archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", err
-	}
-	return archive, nil
+	return runDir, nil
 }

 func applyFanStressDefaults(opts *FanStressOptions) {
@@ -243,9 +239,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
 	if opts.Phase2DurSec <= 0 {
 		opts.Phase2DurSec = 300
 	}
-	if opts.SizeMB <= 0 {
-		opts.SizeMB = 64
-	}
+	// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
+	// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
 }

 // sampleFanStressRow collects all metrics for one telemetry sample.
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -1,23 +1,25 @@
 package platform

 import (
+	"context"
 	"errors"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"testing"
+	"time"
 )

 func TestStorageSATCommands(t *testing.T) {
 	t.Parallel()

-	nvme := storageSATCommands("/dev/nvme0n1")
+	nvme := storageSATCommands("/dev/nvme0n1", false)
 	if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
 		t.Fatalf("unexpected nvme commands: %#v", nvme)
 	}

-	sata := storageSATCommands("/dev/sda")
+	sata := storageSATCommands("/dev/sda", false)
 	if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
 		t.Fatalf("unexpected sata commands: %#v", sata)
 	}
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {

 	jobs := nvidiaSATJobs()

-	if len(jobs) != 5 {
-		t.Fatalf("jobs=%d want 5", len(jobs))
+	if len(jobs) != 6 {
+		t.Fatalf("jobs=%d want 6", len(jobs))
 	}
-	if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
+	if got := jobs[0].cmd[0]; got != "nvidia-smi" {
+		t.Fatalf("preflight command=%q want nvidia-smi", got)
+	}
+	if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
+		t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
+	}
+	if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
 		t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
 	}
-	if got := jobs[3].cmd[1]; got != "--output-file" {
+	if got := jobs[4].cmd[1]; got != "--output-file" {
 		t.Fatalf("bug report flag=%q want --output-file", got)
 	}
 }
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {

 func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
 	jobs := nvidiaSATJobs()
-	got := jobs[4].cmd
+	got := jobs[5].cmd
 	want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
 	if len(got) != len(want) {
 		t.Fatalf("cmd len=%d want %d", len(got), len(want))
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
 	}
 }

+// TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag checks that
+// nvidiaDCGMJobs(3, []int{2, 0}) yields five jobs, that the first one is
+// "nvidia-smi -pm 1", and that the last one is the dcgmi diag command with the
+// GPU indices in the order they were passed.
+func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
+	jobs := nvidiaDCGMJobs(3, []int{2, 0})
+	if len(jobs) != 5 {
+		t.Fatalf("jobs=%d want 5", len(jobs))
+	}
+	if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
+		t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
+	}
+	if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
+		t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
+	}
+}
+
 func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
 	t.Parallel()

@@ -195,6 +216,74 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
 	}
 }

+// TestParseNvidiaGPUHealthDetectsResetRequired feeds parseNvidiaGPUHealth two
+// rows, one healthy and one carrying "[GPU requires reset]", and checks that
+// only the second record is flagged NeedsReset.
+func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
+	t.Parallel()
+
+	got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
+	if len(got) != 2 {
+		t.Fatalf("len=%d want 2", len(got))
+	}
+	if got[0].NeedsReset {
+		t.Fatalf("gpu0 unexpectedly marked reset-required")
+	}
+	if !got[1].NeedsReset {
+		t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
+	}
+}
+
+// TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU stubs
+// nvidia-smi so that GPU 1 reports "[GPU requires reset]" and checks that
+// checkNvidiaJobHealth([]int{1}) returns an error together with a message
+// naming "gpu 1" and the reset requirement.
+//
+// The test replaces the package-level satExecCommand and restores it in
+// t.Cleanup.
+func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	msg, err := checkNvidiaJobHealth([]int{1})
+	if err == nil {
+		t.Fatal("expected health check error")
+	}
+	if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
+		t.Fatalf("unexpected message: %q", msg)
+	}
+}
+
+// TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles calls
+// writeNvidiaGPUStatusFiles for two GPUs in a temporary directory and checks
+// that gpu-1-status.txt contains the expected run_status, health_status and
+// failing_job lines.
+//
+// nvidia-smi is stubbed through satExecCommand so GPU 1 reports
+// "[GPU requires reset]"; presumably that stubbed output is what produces
+// health_status=RESET_REQUIRED - the function under test is not visible here.
+func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
+	dir := t.TempDir()
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	perGPU := map[int]*nvidiaGPUStatusFile{
+		0: {Index: 0, RunStatus: "OK"},
+		1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
+	}
+	if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
+		t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
+	}
+	raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
+	if err != nil {
+		t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
+	}
+	text := string(raw)
+	if !strings.Contains(text, "run_status=FAILED") {
+		t.Fatalf("missing run status:\n%s", text)
+	}
+	if !strings.Contains(text, "health_status=RESET_REQUIRED") {
+		t.Fatalf("missing health status:\n%s", text)
+	}
+	if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
+		t.Fatalf("missing failing job:\n%s", text)
+	}
+}
+
 func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
 	oldLookPath := satLookPath
 	satLookPath = func(file string) (string, error) {
@@ -234,11 +323,14 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {

+// TestNvidiaVisibleDevicesEnvUsesSelectedGPUs checks that
+// nvidiaVisibleDevicesEnv returns exactly two entries for a GPU selection:
+// CUDA_DEVICE_ORDER=PCI_BUS_ID first, then CUDA_VISIBLE_DEVICES with the
+// selected indices joined by commas.
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
-	if len(env) != 1 {
-		t.Fatalf("env len=%d want 1 (%v)", len(env), env)
+	if len(env) != 2 {
+		t.Fatalf("env len=%d want 2 (%v)", len(env), env)
 	}
-	if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
-		t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
+	if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
+		t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
+	}
+	if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
+		t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
 	}
 }

@@ -317,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
 	}{
 		{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
 		{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
+		{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 		{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
 		{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 	}
@@ -331,6 +424,38 @@ func TestClassifySATResult(t *testing.T) {
 	}
 }

+// TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive cancels the
+// context about 100ms into a pack whose only job is "sleep 5" and checks that
+// runAcceptancePackCtx returns an error matching context.Canceled, an empty
+// archive path, and leaves no cancelled-pack-*.tar.gz file in the base dir.
+func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
+	dir := t.TempDir()
+	ctx, cancel := context.WithCancel(context.Background())
+	t.Cleanup(cancel)
+
+	// done lets the test wait for the cancelling goroutine before asserting,
+	// so the goroutine never outlives the test.
+	done := make(chan struct{})
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		cancel()
+		close(done)
+	}()
+
+	archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
+		{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
+	}, nil)
+	<-done
+
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("err=%v want context.Canceled", err)
+	}
+	if archive != "" {
+		t.Fatalf("archive=%q want empty", archive)
+	}
+	matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
+	if globErr != nil {
+		t.Fatalf("Glob error: %v", globErr)
+	}
+	if len(matches) != 0 {
+		t.Fatalf("archives=%v want none", matches)
+	}
+}
+
 func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -61,7 +61,9 @@ func (s *System) ServiceState(name string) string {
 }

+// ServiceDo runs "sudo systemctl <action> <name>" and returns the command's
+// combined stdout and stderr as a string together with the command error, if
+// any. The output is returned even when the command fails.
 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
-	raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
+	// bee-web runs as the bee user; sudo is required to control system services.
+	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
+	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
 	return string(raw), err
 }

--- a/audit/internal/platform/techdump.go
+++ b/audit/internal/platform/techdump.go
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
 	{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
 	{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
 	{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
+	{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
 	{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
 	{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
 	{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
 	Loader            string
 	GPUIndices        []int
 	ExcludeGPUIndices []int
+	StaggerSeconds    int
 }

 func New() *System {
--- a/audit/internal/schema/hardware.go
+++ b/audit/internal/schema/hardware.go
@@ -20,7 +20,12 @@ type RuntimeHealth struct {
 	ExportDir     string                 `json:"export_dir,omitempty"`
 	DriverReady   bool                   `json:"driver_ready,omitempty"`
 	CUDAReady     bool                   `json:"cuda_ready,omitempty"`
+	NvidiaGSPMode string                 `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
 	NetworkStatus string                 `json:"network_status,omitempty"`
+	// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
+	ToRAMStatus   string `json:"toram_status,omitempty"`
+	// USBExportPath: mount point of the first writable USB drive found, empty if none.
+	USBExportPath string `json:"usb_export_path,omitempty"`
 	Issues        []RuntimeIssue         `json:"issues,omitempty"`
 	Tools         []RuntimeToolStatus    `json:"tools,omitempty"`
 	Services      []RuntimeServiceStatus `json:"services,omitempty"`
@@ -182,6 +187,13 @@ type HardwarePCIeDevice struct {
 	BatteryTemperatureC    *float64       `json:"battery_temperature_c,omitempty"`
 	BatteryVoltageV        *float64       `json:"battery_voltage_v,omitempty"`
 	BatteryReplaceRequired *bool          `json:"battery_replace_required,omitempty"`
+	SFPPresent             *bool          `json:"sfp_present,omitempty"`
+	SFPIdentifier          *string        `json:"sfp_identifier,omitempty"`
+	SFPConnector           *string        `json:"sfp_connector,omitempty"`
+	SFPVendor              *string        `json:"sfp_vendor,omitempty"`
+	SFPPartNumber          *string        `json:"sfp_part_number,omitempty"`
+	SFPSerialNumber        *string        `json:"sfp_serial_number,omitempty"`
+	SFPWavelengthNM        *float64       `json:"sfp_wavelength_nm,omitempty"`
 	SFPTemperatureC        *float64       `json:"sfp_temperature_c,omitempty"`
 	SFPTXPowerDBM          *float64       `json:"sfp_tx_power_dbm,omitempty"`
 	SFPRXPowerDBM          *float64       `json:"sfp_rx_power_dbm,omitempty"`
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -11,6 +11,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"regexp"
+	"sort"
 	"strings"
 	"sync/atomic"
 	"syscall"
@@ -21,13 +22,305 @@ import (
 )

 var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
+// apiListNvidiaGPUs lists NVIDIA GPUs through the given App, or returns an
+// "app not configured" error when the App is nil. It is a package-level
+// variable so it can be reassigned (the tests in this package swap it out).
+var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
+	if a == nil {
+		return nil, fmt.Errorf("app not configured")
+	}
+	return a.ListNvidiaGPUs()
+}
+// apiListNvidiaGPUStatuses returns per-GPU status records through the given
+// App, or an "app not configured" error when the App is nil. Like
+// apiListNvidiaGPUs it is a reassignable package-level function variable.
+var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, error) {
+	if a == nil {
+		return nil, fmt.Errorf("app not configured")
+	}
+	return a.ListNvidiaGPUStatuses()
+}

 // ── Job ID counter ────────────────────────────────────────────────────────────

 var jobCounter atomic.Uint64

-func newJobID(prefix string) string {
-	return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
+// newJobID returns a task ID of the form "TASK-NNN" with NNN in 000-999.
+// The argument is ignored; it is kept so existing call sites still compile.
+//
+// The search starts at the previous jobCounter value modulo 1000 and, while
+// holding globalQueue.mu, takes the first number whose ID is not used by a
+// task currently in globalQueue.tasks.
+//
+// NOTE(review): the ID is not reserved. The lock is released on return, before
+// the caller enqueues the task, and when all 1000 IDs are in use the starting
+// ID is returned even though it duplicates an existing one.
+func newJobID(_ string) string {
+	start := int((jobCounter.Add(1) - 1) % 1000)
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	for offset := 0; offset < 1000; offset++ {
+		n := (start + offset) % 1000
+		id := fmt.Sprintf("TASK-%03d", n)
+		if !taskIDInUseLocked(id) {
+			return id
+		}
+	}
+	return fmt.Sprintf("TASK-%03d", start)
+}
+
+// taskIDInUseLocked reports whether any non-nil task in globalQueue.tasks has
+// the given ID. It reads globalQueue.tasks without taking a lock itself, so
+// the caller must already hold globalQueue.mu.
+func taskIDInUseLocked(id string) bool {
+	for _, t := range globalQueue.tasks {
+		if t != nil && t.ID == id {
+			return true
+		}
+	}
+	return false
+}
+
+// taskRunResponse is the JSON body written by writeTaskRunResponse after
+// tasks are queued. TaskID/JobID carry the first queued ID, TaskIDs/JobIDs
+// carry all of them, and TaskCount is the number of IDs. Every field is
+// omitted from the JSON when empty or zero.
+type taskRunResponse struct {
+	TaskID    string   `json:"task_id,omitempty"`
+	JobID     string   `json:"job_id,omitempty"`
+	TaskIDs   []string `json:"task_ids,omitempty"`
+	JobIDs    []string `json:"job_ids,omitempty"`
+	TaskCount int      `json:"task_count,omitempty"`
+}
+
+// nvidiaTaskSelection is one group of GPU indices that will run as a single
+// task, together with the human-readable label appended to the task name.
+type nvidiaTaskSelection struct {
+	GPUIndices []int
+	Label      string
+}
+
+// writeTaskRunResponse writes a taskRunResponse describing the given tasks.
+//
+// Nil tasks and tasks with a blank ID are left out. With no usable IDs the
+// response is an empty taskRunResponse. Otherwise task_id and job_id hold the
+// first ID, task_ids and job_ids hold every ID in input order, and task_count
+// is the number of IDs.
+func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
+	if len(tasks) == 0 {
+		writeJSON(w, taskRunResponse{})
+		return
+	}
+	ids := make([]string, 0, len(tasks))
+	for _, t := range tasks {
+		if t == nil || strings.TrimSpace(t.ID) == "" {
+			continue
+		}
+		ids = append(ids, t.ID)
+	}
+	resp := taskRunResponse{TaskCount: len(ids)}
+	if len(ids) > 0 {
+		resp.TaskID = ids[0]
+		resp.JobID = ids[0]
+		resp.TaskIDs = ids
+		resp.JobIDs = ids
+	}
+	writeJSON(w, resp)
+}
+
+// shouldSplitHomogeneousNvidiaTarget reports whether the (whitespace-trimmed)
+// target is one of the NVIDIA targets listed below. buildNvidiaTaskSet uses
+// it to decide whether a request may be split into one task per GPU group.
+func shouldSplitHomogeneousNvidiaTarget(target string) bool {
+	switch strings.TrimSpace(target) {
+	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
+		"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
+		"nvidia-bandwidth", "nvidia-stress":
+		return true
+	default:
+		return false
+	}
+}
+
+// expandHomogeneousNvidiaSelections resolves the include/exclude filters
+// against the detected GPUs and groups the result by GPU model name.
+//
+// Selection: with a non-empty include list only those indices are used
+// (unknown indices and duplicates are dropped); otherwise every detected GPU
+// is used. Indices in exclude are then removed. The selection is sorted by
+// index.
+//
+// Grouping: GPUs sharing the same trimmed Name form one selection; a GPU with
+// a blank name is grouped under "GPU <index>". Models are ordered by their
+// lowest selected index. The returned slice lists all multi-GPU groups first,
+// followed by all single-GPU selections.
+//
+// An error is returned when gpus is empty or when the filters leave nothing
+// selected.
+func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
+	if len(gpus) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs detected")
+	}
+	indexed := make(map[int]platform.NvidiaGPU, len(gpus))
+	allIndices := make([]int, 0, len(gpus))
+	for _, gpu := range gpus {
+		indexed[gpu.Index] = gpu
+		allIndices = append(allIndices, gpu.Index)
+	}
+	sort.Ints(allIndices)
+
+	selected := allIndices
+	if len(include) > 0 {
+		selected = make([]int, 0, len(include))
+		seen := make(map[int]struct{}, len(include))
+		for _, idx := range include {
+			if _, ok := indexed[idx]; !ok {
+				continue
+			}
+			if _, dup := seen[idx]; dup {
+				continue
+			}
+			seen[idx] = struct{}{}
+			selected = append(selected, idx)
+		}
+		sort.Ints(selected)
+	}
+	if len(exclude) > 0 {
+		skip := make(map[int]struct{}, len(exclude))
+		for _, idx := range exclude {
+			skip[idx] = struct{}{}
+		}
+		// Filter in place: filtered shares selected's backing array, which is
+		// local to this function.
+		filtered := selected[:0]
+		for _, idx := range selected {
+			if _, ok := skip[idx]; ok {
+				continue
+			}
+			filtered = append(filtered, idx)
+		}
+		selected = filtered
+	}
+	if len(selected) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs selected")
+	}
+
+	modelGroups := make(map[string][]platform.NvidiaGPU)
+	modelOrder := make([]string, 0)
+	for _, idx := range selected {
+		gpu := indexed[idx]
+		model := strings.TrimSpace(gpu.Name)
+		if model == "" {
+			model = fmt.Sprintf("GPU %d", gpu.Index)
+		}
+		if _, ok := modelGroups[model]; !ok {
+			modelOrder = append(modelOrder, model)
+		}
+		modelGroups[model] = append(modelGroups[model], gpu)
+	}
+	// Order models by the lowest GPU index in each group; the name comparison
+	// is only a fallback for an empty group.
+	sort.Slice(modelOrder, func(i, j int) bool {
+		left := modelGroups[modelOrder[i]]
+		right := modelGroups[modelOrder[j]]
+		if len(left) == 0 || len(right) == 0 {
+			return modelOrder[i] < modelOrder[j]
+		}
+		return left[0].Index < right[0].Index
+	})
+
+	var groups []nvidiaTaskSelection
+	var singles []nvidiaTaskSelection
+	for _, model := range modelOrder {
+		group := modelGroups[model]
+		sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index })
+		indices := make([]int, 0, len(group))
+		for _, gpu := range group {
+			indices = append(indices, gpu.Index)
+		}
+		if len(indices) >= 2 {
+			groups = append(groups, nvidiaTaskSelection{
+				GPUIndices: indices,
+				Label:      fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
+			})
+			continue
+		}
+		gpu := group[0]
+		singles = append(singles, nvidiaTaskSelection{
+			GPUIndices: []int{gpu.Index},
+			Label:      fmt.Sprintf("GPU %d — %s", gpu.Index, model),
+		})
+	}
+	return append(groups, singles...), nil
+}
+
+// joinTaskIndices formats the indices as decimal numbers joined by commas,
+// without spaces (for example "0,2,4"). An empty input yields "".
+func joinTaskIndices(indices []int) string {
+	parts := make([]string, 0, len(indices))
+	for _, idx := range indices {
+		parts = append(parts, fmt.Sprintf("%d", idx))
+	}
+	return strings.Join(parts, ",")
+}
+
+// formatSplitTaskName combines a task's base name with a selection label as
+// "<baseName> (<selectionLabel>)". Both inputs are trimmed first; if either
+// is empty after trimming, the other is returned on its own.
+func formatSplitTaskName(baseName, selectionLabel string) string {
+	baseName = strings.TrimSpace(baseName)
+	selectionLabel = strings.TrimSpace(selectionLabel)
+	if baseName == "" {
+		return selectionLabel
+	}
+	if selectionLabel == "" {
+		return baseName
+	}
+	return baseName + " (" + selectionLabel + ")"
+}
+
+// buildNvidiaTaskSet creates the pending task(s) for one run request. The
+// tasks are returned, not enqueued.
+//
+// A single task named baseName is built when the target is not one accepted
+// by shouldSplitHomogeneousNvidiaTarget, or when params.ParallelGPUs is set.
+// In the parallel case for a splittable target the include/exclude filters are
+// resolved up front with expandSelectedGPUIndices, so the task carries an
+// explicit GPUIndices list and no ExcludeGPUIndices. For a non-splittable
+// target params are passed through unchanged.
+//
+// Otherwise one task is built per selection from
+// expandHomogeneousNvidiaSelections. Each gets its own copy of the GPU
+// indices, no ExcludeGPUIndices, and a name and DisplayName produced by
+// formatSplitTaskName.
+//
+// All tasks share target, priority and createdAt. Errors from listing the
+// GPUs or resolving the selection are returned unchanged. idPrefix is handed
+// to newJobID.
+func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
+	if !shouldSplitHomogeneousNvidiaTarget(target) || params.ParallelGPUs {
+		// Parallel mode (or non-splittable target): one task for all selected GPUs.
+		if params.ParallelGPUs && shouldSplitHomogeneousNvidiaTarget(target) {
+			// Resolve the selected GPU indices so ExcludeGPUIndices is applied.
+			gpus, err := apiListNvidiaGPUs(appRef)
+			if err != nil {
+				return nil, err
+			}
+			resolved, err := expandSelectedGPUIndices(gpus, params.GPUIndices, params.ExcludeGPUIndices)
+			if err != nil {
+				return nil, err
+			}
+			params.GPUIndices = resolved
+			params.ExcludeGPUIndices = nil
+		}
+		t := &Task{
+			ID:        newJobID(idPrefix),
+			Name:      baseName,
+			Target:    target,
+			Priority:  priority,
+			Status:    TaskPending,
+			CreatedAt: createdAt,
+			params:    params,
+		}
+		return []*Task{t}, nil
+	}
+	gpus, err := apiListNvidiaGPUs(appRef)
+	if err != nil {
+		return nil, err
+	}
+	selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
+	if err != nil {
+		return nil, err
+	}
+	tasks := make([]*Task, 0, len(selections))
+	for _, selection := range selections {
+		taskParamsCopy := params
+		// Copy the indices so each task owns its slice.
+		taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...)
+		taskParamsCopy.ExcludeGPUIndices = nil
+		displayName := formatSplitTaskName(baseName, selection.Label)
+		taskParamsCopy.DisplayName = displayName
+		tasks = append(tasks, &Task{
+			ID:        newJobID(idPrefix),
+			Name:      displayName,
+			Target:    target,
+			Priority:  priority,
+			Status:    TaskPending,
+			CreatedAt: createdAt,
+			params:    taskParamsCopy,
+		})
+	}
+	return tasks, nil
+}
+
+// expandSelectedGPUIndices returns the sorted list of selected GPU indices after
+// applying include/exclude filters, without splitting by model.
+//
+// With a non-empty include list only those indices are considered; indices
+// that match no detected GPU and repeated indices are dropped. With an empty
+// include list every detected GPU is considered. Indices in exclude are then
+// removed. An error is returned when nothing remains selected, which is also
+// the case when gpus is empty.
+func expandSelectedGPUIndices(gpus []platform.NvidiaGPU, include, exclude []int) ([]int, error) {
+	indexed := make(map[int]struct{}, len(gpus))
+	allIndices := make([]int, 0, len(gpus))
+	for _, gpu := range gpus {
+		indexed[gpu.Index] = struct{}{}
+		allIndices = append(allIndices, gpu.Index)
+	}
+	sort.Ints(allIndices)
+
+	selected := allIndices
+	if len(include) > 0 {
+		selected = make([]int, 0, len(include))
+		seen := make(map[int]struct{}, len(include))
+		for _, idx := range include {
+			if _, ok := indexed[idx]; !ok {
+				continue
+			}
+			if _, dup := seen[idx]; dup {
+				continue
+			}
+			seen[idx] = struct{}{}
+			selected = append(selected, idx)
+		}
+		sort.Ints(selected)
+	}
+	if len(exclude) > 0 {
+		skip := make(map[int]struct{}, len(exclude))
+		for _, idx := range exclude {
+			skip[idx] = struct{}{}
+		}
+		// Filter in place: filtered shares selected's backing array, which is
+		// local to this function.
+		filtered := selected[:0]
+		for _, idx := range selected {
+			if _, ok := skip[idx]; ok {
+				continue
+			}
+			filtered = append(filtered, idx)
+		}
+		selected = filtered
+	}
+	if len(selected) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs selected")
+	}
+	return selected, nil
 }

 // ── SSE helpers ───────────────────────────────────────────────────────────────
@@ -189,12 +482,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 			return
 		}

-		var body struct {
-			Duration           int      `json:"duration"`
-			DiagLevel          int      `json:"diag_level"`
-			GPUIndices         []int    `json:"gpu_indices"`
-			ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
-			Loader             string   `json:"loader"`
+			var body struct {
+				Duration           int      `json:"duration"`
+				StressMode         bool     `json:"stress_mode"`
+				GPUIndices         []int    `json:"gpu_indices"`
+				ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
+				StaggerGPUStart    bool     `json:"stagger_gpu_start"`
+				Loader             string   `json:"loader"`
 			Profile            string   `json:"profile"`
 			DisplayName        string   `json:"display_name"`
 			PlatformComponents []string `json:"platform_components"`
@@ -207,28 +501,29 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 		}

 		name := taskDisplayName(target, body.Profile, body.Loader)
-		t := &Task{
-			ID:        newJobID("sat-" + target),
-			Name:      name,
-			Target:    target,
-			Status:    TaskPending,
-			CreatedAt: time.Now(),
-			params: taskParams{
+		if strings.TrimSpace(body.DisplayName) != "" {
+			name = body.DisplayName
+		}
+			params := taskParams{
 				Duration:           body.Duration,
-				DiagLevel:          body.DiagLevel,
+				StressMode:         body.StressMode,
 				GPUIndices:         body.GPUIndices,
 				ExcludeGPUIndices:  body.ExcludeGPUIndices,
+				StaggerGPUStart:    body.StaggerGPUStart,
 				Loader:             body.Loader,
-				BurnProfile:        body.Profile,
-				DisplayName:        body.DisplayName,
-				PlatformComponents: body.PlatformComponents,
-			},
+			BurnProfile:        body.Profile,
+			DisplayName:        body.DisplayName,
+			PlatformComponents: body.PlatformComponents,
 		}
-		if strings.TrimSpace(body.DisplayName) != "" {
-			t.Name = body.DisplayName
+		tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
+		if err != nil {
+			writeError(w, http.StatusBadRequest, err.Error())
+			return
 		}
-		globalQueue.enqueue(t)
-		writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
+		for _, t := range tasks {
+			globalQueue.enqueue(t)
+		}
+		writeTaskRunResponse(w, tasks)
 	}
 }

@@ -244,6 +539,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 		GPUIndices        []int  `json:"gpu_indices"`
 		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
 		RunNCCL           *bool  `json:"run_nccl"`
+		ParallelGPUs      *bool  `json:"parallel_gpus"`
 		DisplayName       string `json:"display_name"`
 	}
 	if r.Body != nil {
@@ -257,27 +553,31 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 	if body.RunNCCL != nil {
 		runNCCL = *body.RunNCCL
 	}
-	t := &Task{
-		ID:        newJobID("benchmark-nvidia"),
-		Name:      taskDisplayName("nvidia-benchmark", "", ""),
-		Target:    "nvidia-benchmark",
-		Priority:  15,
-		Status:    TaskPending,
-		CreatedAt: time.Now(),
-		params: taskParams{
-			GPUIndices:        body.GPUIndices,
-			ExcludeGPUIndices: body.ExcludeGPUIndices,
-			SizeMB:            body.SizeMB,
-			BenchmarkProfile:  body.Profile,
-			RunNCCL:           runNCCL,
-			DisplayName:       body.DisplayName,
-		},
+	parallelGPUs := false
+	if body.ParallelGPUs != nil {
+		parallelGPUs = *body.ParallelGPUs
 	}
+	name := taskDisplayName("nvidia-benchmark", "", "")
 	if strings.TrimSpace(body.DisplayName) != "" {
-		t.Name = body.DisplayName
+		name = body.DisplayName
 	}
-	globalQueue.enqueue(t)
-	writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
+	tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
+		GPUIndices:        body.GPUIndices,
+		ExcludeGPUIndices: body.ExcludeGPUIndices,
+		SizeMB:            body.SizeMB,
+		BenchmarkProfile:  body.Profile,
+		RunNCCL:           runNCCL,
+		ParallelGPUs:      parallelGPUs,
+		DisplayName:       body.DisplayName,
+	}, name, h.opts.App, "benchmark-nvidia")
+	if err != nil {
+		writeError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+	for _, t := range tasks {
+		globalQueue.enqueue(t)
+	}
+	writeTaskRunResponse(w, tasks)
 }

 func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
@@ -383,11 +683,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
 		return
 	}
 	result, err := h.opts.App.ServiceActionResult(req.Name, action)
+	status := "ok"
 	if err != nil {
-		writeError(w, http.StatusInternalServerError, err.Error())
-		return
+		status = "error"
 	}
-	writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
+	// Always return 200 with output so the frontend can display the actual
+	// systemctl error message instead of a generic "exit status 1".
+	writeJSON(w, map[string]string{"status": status, "output": result.Body})
 }

 // ── Network ───────────────────────────────────────────────────────────────────
@@ -555,6 +857,42 @@ func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
 	writeJSON(w, gpus)
 }

+// handleAPIGNVIDIAGPUStatuses writes the list returned by
+// apiListNvidiaGPUStatuses as JSON. It answers 503 when no App is configured
+// and 500 when the listing fails.
+func (h *handler) handleAPIGNVIDIAGPUStatuses(w http.ResponseWriter, _ *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	gpus, err := apiListNvidiaGPUStatuses(h.opts.App)
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	// Replace a nil slice with an empty one; presumably so the response body
+	// is a JSON array rather than null - confirm against writeJSON.
+	if gpus == nil {
+		gpus = []platform.NvidiaGPUStatus{}
+	}
+	writeJSON(w, gpus)
+}
+
+// handleAPIGNVIDIAReset resets the NVIDIA GPU named by the JSON request body
+// ({"index": N}) through App.ResetNvidiaGPU.
+//
+// It answers 503 when no App is configured and 400 when the body is not valid
+// JSON or does not carry a non-negative "index". Once the reset has been
+// attempted the response is always a JSON object with "status" ("ok" or
+// "error") and "output" (the result body), so the caller can show the
+// command output even when the reset failed.
+func (h *handler) handleAPIGNVIDIAReset(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	// Index is a pointer so that a body without "index" can be told apart
+	// from an explicit 0; otherwise a malformed request would reset GPU 0.
+	var req struct {
+		Index *int `json:"index"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		writeError(w, http.StatusBadRequest, "invalid request body")
+		return
+	}
+	if req.Index == nil || *req.Index < 0 {
+		writeError(w, http.StatusBadRequest, "index is required and must be non-negative")
+		return
+	}
+	result, err := h.opts.App.ResetNvidiaGPU(*req.Index)
+	status := "ok"
+	if err != nil {
+		status = "error"
+	}
+	writeJSON(w, map[string]string{"status": status, "output": result.Body})
+}
+
 func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
@@ -1040,107 +1378,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
 	return nil
 }

-// ── Display / Screen Resolution ───────────────────────────────────────────────
-
-type displayMode struct {
-	Output  string `json:"output"`
-	Mode    string `json:"mode"`
-	Current bool   `json:"current"`
-}
-
-type displayInfo struct {
-	Output  string        `json:"output"`
-	Modes   []displayMode `json:"modes"`
-	Current string        `json:"current"`
-}
-
-var xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
-var xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
-var xrandrCurrentRE = regexp.MustCompile(`\*`)
-
-func parseXrandrOutput(out string) []displayInfo {
-	var infos []displayInfo
-	var cur *displayInfo
-	for _, line := range strings.Split(out, "\n") {
-		if m := xrandrOutputRE.FindStringSubmatch(line); m != nil {
-			if cur != nil {
-				infos = append(infos, *cur)
-			}
-			cur = &displayInfo{Output: m[1]}
-			continue
-		}
-		if cur == nil {
-			continue
-		}
-		if m := xrandrModeRE.FindStringSubmatch(line); m != nil {
-			isCurrent := xrandrCurrentRE.MatchString(line)
-			mode := displayMode{Output: cur.Output, Mode: m[1], Current: isCurrent}
-			cur.Modes = append(cur.Modes, mode)
-			if isCurrent {
-				cur.Current = m[1]
-			}
-		}
-	}
-	if cur != nil {
-		infos = append(infos, *cur)
-	}
-	return infos
-}
-
-func xrandrCommand(args ...string) *exec.Cmd {
-	cmd := exec.Command("xrandr", args...)
-	env := append([]string{}, os.Environ()...)
-	hasDisplay := false
-	hasXAuthority := false
-	for _, kv := range env {
-		if strings.HasPrefix(kv, "DISPLAY=") && strings.TrimPrefix(kv, "DISPLAY=") != "" {
-			hasDisplay = true
-		}
-		if strings.HasPrefix(kv, "XAUTHORITY=") && strings.TrimPrefix(kv, "XAUTHORITY=") != "" {
-			hasXAuthority = true
-		}
-	}
-	if !hasDisplay {
-		env = append(env, "DISPLAY=:0")
-	}
-	if !hasXAuthority {
-		env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
-	}
-	cmd.Env = env
-	return cmd
-}
-
-func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
-	out, err := xrandrCommand().Output()
-	if err != nil {
-		writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
-		return
-	}
-	writeJSON(w, parseXrandrOutput(string(out)))
-}
-
-func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
-	var req struct {
-		Output string `json:"output"`
-		Mode   string `json:"mode"`
-	}
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
-		writeError(w, http.StatusBadRequest, "output and mode are required")
-		return
-	}
-	// Validate mode looks like WxH to prevent injection
-	if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
-		writeError(w, http.StatusBadRequest, "invalid mode format")
-		return
-	}
-	// Validate output name (no special chars)
-	if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
-		writeError(w, http.StatusBadRequest, "invalid output name")
-		return
-	}
-	if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
-		writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
-		return
-	}
-	writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
-}
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -1,6 +1,7 @@
 package webui

 import (
+	"encoding/json"
 	"net/http/httptest"
 	"strings"
 	"testing"
@@ -9,30 +10,6 @@ import (
 	"bee/audit/internal/platform"
 )

-func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
-	t.Setenv("DISPLAY", "")
-	t.Setenv("XAUTHORITY", "")
-
-	cmd := xrandrCommand("--query")
-
-	var hasDisplay bool
-	var hasXAuthority bool
-	for _, kv := range cmd.Env {
-		if kv == "DISPLAY=:0" {
-			hasDisplay = true
-		}
-		if kv == "XAUTHORITY=/home/bee/.Xauthority" {
-			hasXAuthority = true
-		}
-	}
-	if !hasDisplay {
-		t.Fatalf("DISPLAY not injected: %v", cmd.Env)
-	}
-	if !hasXAuthority {
-		t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
-	}
-}
-
 func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
@@ -74,6 +51,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 		globalQueue.tasks = originalTasks
 		globalQueue.mu.Unlock()
 	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 3, Name: "NVIDIA H100 PCIe"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })

 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
 	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
@@ -101,6 +86,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 	}
 }

+func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H200 NVL"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBenchmarkNvidiaRun(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	var resp taskRunResponse
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode response: %v", err)
+	}
+	if len(resp.TaskIDs) != 2 {
+		t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
+		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
+	}
+	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
+		t.Fatalf("task[1] gpu indices=%v want [2]", got)
+	}
+}
+
+func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H200 NVL"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
+		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
+	}
+	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
+		t.Fatalf("task[1] gpu indices=%v want [2]", got)
+	}
+}
+
 func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
 	h := &handler{}
 	h.pushFanRings([]platform.FanReading{
--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -83,6 +83,10 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
 		}
 	}

+	// Downsample to at most ~1400 points (one per pixel) before building SVG.
+	times, datasets = downsampleTimeSeries(times, datasets, 1400)
+	pointCount = len(times)
+
 	statsLabel := chartStatsLabel(datasets)

 	legendItems := []metricChartSeries{}
@@ -196,6 +200,19 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
 		}
 	}

+	// Downsample to at most ~1400 points before building SVG.
+	{
+		datasets := make([][]float64, len(series))
+		for i := range series {
+			datasets[i] = series[i].Values
+		}
+		times, datasets = downsampleTimeSeries(times, datasets, 1400)
+		pointCount = len(times)
+		for i := range series {
+			series[i].Values = datasets[i]
+		}
+	}
+
 	scales := make([]chartScale, len(series))
 	for i := range series {
 		min, max := chartSeriesBounds(series[i].Values)
@@ -626,6 +643,87 @@ func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end
 	b.WriteString(`</g>` + "\n")
 }

+// downsampleTimeSeries reduces the time series to at most maxPts points using
+// min-max bucketing. Each bucket contributes the indices of its min and max
+// value, in index order, read from the reference series (the first dataset
+// whose length equals len(times)); with no such dataset the bucket's first and
+// last indices are kept. Every full-length dataset is sampled at the same
+// indices so series stay aligned; other datasets are passed through untouched.
+// If len(times) <= maxPts or maxPts <= 0 the inputs are returned unchanged.
+func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
+	n := len(times)
+	if n <= maxPts || maxPts <= 0 {
+		return times, datasets
+	}
+	// Two points per bucket, except that a budget of a single point can only
+	// afford one point from one bucket (otherwise the result would exceed maxPts).
+	buckets, perBucket := maxPts/2, 2
+	if buckets < 1 {
+		buckets, perBucket = 1, 1
+	}
+	// Use the first dataset that has the same length as times as the reference
+	// for deciding which two indices to keep per bucket.
+	var ref []float64
+	for _, ds := range datasets {
+		if len(ds) == n {
+			ref = ds
+			break
+		}
+	}
+	selected := make([]int, 0, maxPts)
+	bucketSize := float64(n) / float64(buckets)
+	for b := 0; b < buckets; b++ {
+		lo := int(math.Round(float64(b) * bucketSize))
+		hi := int(math.Round(float64(b+1) * bucketSize))
+		if hi > n {
+			hi = n
+		}
+		if lo >= hi {
+			continue
+		}
+		// Without a reference series, keep the bucket's first and last sample.
+		minIdx, maxIdx := lo, hi-1
+		if ref != nil {
+			minIdx, maxIdx = lo, lo
+			for i := lo + 1; i < hi; i++ {
+				if ref[i] < ref[minIdx] {
+					minIdx = i
+				}
+				if ref[i] > ref[maxIdx] {
+					maxIdx = i
+				}
+			}
+		}
+		// Emit the pair in index order so the output keeps the input ordering.
+		first, second := minIdx, maxIdx
+		if first > second {
+			first, second = second, first
+		}
+		selected = append(selected, first)
+		if perBucket > 1 && second != first {
+			selected = append(selected, second)
+		}
+	}
+	outTimes := make([]time.Time, len(selected))
+	for i, idx := range selected {
+		outTimes[i] = times[idx]
+	}
+	outDatasets := make([][]float64, len(datasets))
+	for d, ds := range datasets {
+		if len(ds) != n {
+			// Not parallel to times; leave it as-is rather than mis-index it.
+			outDatasets[d] = ds
+			continue
+		}
+		out := make([]float64, len(selected))
+		for i, idx := range selected {
+			out[i] = ds[idx]
+		}
+		outDatasets[d] = out
+	}
+	return outTimes, outDatasets
+}
+
 func chartXForTime(ts, start, end time.Time, left, right int) float64 {
 	if !end.After(start) {
 		return float64(left+right) / 2
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -270,6 +270,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
 	mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
 	mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
+	mux.HandleFunc("GET /api/tasks/{id}/charts", h.handleAPITaskChartsIndex)
+	mux.HandleFunc("GET /api/tasks/{id}/chart/", h.handleAPITaskChartSVG)
 	mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)

 	// Services
@@ -293,13 +295,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// Tools
 	mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)

-	// Display
-	mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
-	mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
-
 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
 	mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
+	mux.HandleFunc("GET /api/gpu/nvidia-status", h.handleAPIGNVIDIAGPUStatuses)
+	mux.HandleFunc("POST /api/gpu/nvidia-reset", h.handleAPIGNVIDIAReset)
 	mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)

 	// System
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -1,6 +1,7 @@
 package webui

 import (
+	"encoding/json"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -590,7 +591,7 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
 	}
 }

-func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
+func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
@@ -598,11 +599,20 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
+	if !strings.Contains(body, `NVIDIA Self Heal`) {
+		t.Fatalf("tools page missing nvidia self heal section: %s", body)
+	}
 	if !strings.Contains(body, `Restart GPU Drivers`) {
 		t.Fatalf("tools page missing restart gpu drivers button: %s", body)
 	}
-	if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
-		t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
+	if !strings.Contains(body, `nvidiaRestartDrivers()`) {
+		t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
+	}
+	if !strings.Contains(body, `/api/gpu/nvidia-status`) {
+		t.Fatalf("tools page missing nvidia status api usage: %s", body)
+	}
+	if !strings.Contains(body, `nvidiaResetGPU(`) {
+		t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
 	}
 	if !strings.Contains(body, `id="boot-source-text"`) {
 		t.Fatalf("tools page missing boot source field: %s", body)
@@ -636,6 +646,66 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 	}
 }

+func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
+	dir := t.TempDir()
+	exportDir := filepath.Join(dir, "export")
+	runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	result := platform.NvidiaBenchmarkResult{
+		GeneratedAt:      time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
+		BenchmarkProfile: "standard",
+		OverallStatus:    "OK",
+		GPUs: []platform.BenchmarkGPUResult{
+			{
+				Index: 0,
+				Name:  "NVIDIA H100 PCIe",
+				Scores: platform.BenchmarkScorecard{
+					CompositeScore: 1176.25,
+				},
+			},
+			{
+				Index: 1,
+				Name:  "NVIDIA H100 PCIe",
+				Scores: platform.BenchmarkScorecard{
+					CompositeScore: 1168.50,
+				},
+			},
+		},
+	}
+	raw, err := json.Marshal(result)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	handler := NewHandler(HandlerOptions{ExportDir: exportDir})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
+	for _, needle := range []string{
+		`Benchmark Results`,
+		`Composite score by saved benchmark run and GPU.`,
+		`GPU #0 — NVIDIA H100 PCIe`,
+		`GPU #1 — NVIDIA H100 PCIe`,
+		`#1`,
+		wantTime,
+		`1176.25`,
+		`1168.50`,
+	} {
+		if !strings.Contains(body, needle) {
+			t.Fatalf("benchmark page missing %q: %s", needle, body)
+		}
+	}
+}
+
 func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
@@ -649,6 +719,10 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 		`nvidia-targeted-stress`,
 		`controlled NVIDIA DCGM load`,
 		`<code>dcgmi diag targeted_stress</code>`,
+		`NVIDIA GPU Selection`,
+		`All NVIDIA validate tasks use only the GPUs selected here.`,
+		`Select All`,
+		`id="sat-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("validate page missing %q: %s", needle, body)
@@ -667,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	for _, needle := range []string{
 		`NVIDIA Max Compute Load`,
 		`dcgmproftester`,
-		`targeted_stress remain in <a href="/validate">Validate</a>`,
-		`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
+		`NCCL`,
+		`Validate → Stress mode`,
 		`id="burn-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
@@ -723,6 +797,111 @@ func TestTaskDetailPageRendersSavedReport(t *testing.T) {
 	}
 }

+func TestTaskDetailPageRendersCancelForRunningTask(t *testing.T) {
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "task-live-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+	})
+
+	handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-live-1", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `Cancel</button>`) {
+		t.Fatalf("task detail page missing cancel button: %s", body)
+	}
+	if !strings.Contains(body, `function cancelTaskDetail(id)`) {
+		t.Fatalf("task detail page missing cancel handler: %s", body)
+	}
+	if !strings.Contains(body, `/api/tasks/' + id + '/cancel`) {
+		t.Fatalf("task detail page missing cancel endpoint: %s", body)
+	}
+	if !strings.Contains(body, `id="task-live-charts"`) {
+		t.Fatalf("task detail page missing live charts container: %s", body)
+	}
+	if !strings.Contains(body, `/api/tasks/' + taskId + '/charts`) {
+		t.Fatalf("task detail page missing live charts index endpoint: %s", body)
+	}
+}
+
+func TestTaskChartSVGUsesTaskTimeWindow(t *testing.T) {
+	dir := t.TempDir()
+	metricsPath := filepath.Join(dir, "metrics.db")
+	prevMetricsPath := taskReportMetricsDBPath
+	taskReportMetricsDBPath = metricsPath
+	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
+
+	db, err := openMetricsDB(metricsPath)
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	base := time.Now().UTC()
+	samples := []platform.LiveMetricSample{
+		{Timestamp: base.Add(-3 * time.Minute), PowerW: 100},
+		{Timestamp: base.Add(-2 * time.Minute), PowerW: 200},
+		{Timestamp: base.Add(-1 * time.Minute), PowerW: 300},
+	}
+	for _, sample := range samples {
+		if err := db.Write(sample); err != nil {
+			t.Fatalf("Write: %v", err)
+		}
+	}
+	_ = db.Close()
+
+	started := base.Add(-2*time.Minute - 5*time.Second)
+	done := base.Add(-1*time.Minute + 5*time.Second)
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "task-chart-1",
+		Name:      "Power Window",
+		Target:    "cpu",
+		Status:    TaskDone,
+		CreatedAt: started.Add(-10 * time.Second),
+		StartedAt: &started,
+		DoneAt:    &done,
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+	})
+
+	handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
+	req := httptest.NewRequest(http.MethodGet, "/api/tasks/task-chart-1/chart/server-power.svg", nil)
+	req.SetPathValue("id", "task-chart-1")
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, "System Power") {
+		t.Fatalf("task chart missing expected title: %s", body)
+	}
+	if !strings.Contains(body, "min 200") {
+		t.Fatalf("task chart stats should start from in-window sample: %s", body)
+	}
+	if strings.Contains(body, "min 100") {
+		t.Fatalf("task chart should not include pre-task sample in stats: %s", body)
+	}
+}
+
 func TestViewerRendersLatestSnapshot(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
@@ -915,6 +1094,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
 	}
 	body := rec.Body.String()
 	for _, needle := range []string{
+		// Runtime Health card — LiveCD checks only
 		`Runtime Health`,
 		`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
 		`Export Directory`,
@@ -923,16 +1103,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
 		`CUDA / ROCm`,
 		`Required Utilities`,
 		`Bee Services`,
-		`<td>CPU</td>`,
-		`<td>Memory</td>`,
-		`<td>Storage</td>`,
-		`<td>GPU</td>`,
 		`CUDA runtime is not ready for GPU SAT.`,
 		`Missing: nvidia-smi`,
 		`bee-nvidia=inactive`,
-		`cpu SAT: FAILED`,
-		`storage SAT: FAILED`,
-		`sat:nvidia`,
+		// Hardware Summary card — component health badges
+		`Hardware Summary`,
+		`>CPU<`,
+		`>Memory<`,
+		`>Storage<`,
+		`>GPU<`,
+		`>PSU<`,
+		`badge-warn`,   // cpu Warning badge
+		`badge-err`,    // storage Critical badge
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("dashboard missing %q: %s", needle, body)
--- a/audit/internal/webui/task_page.go
+++ b/audit/internal/webui/task_page.go
@@ -1,11 +1,15 @@
 package webui

 import (
+	"encoding/json"
 	"fmt"
 	"html"
 	"net/http"
 	"os"
 	"strings"
+	"time"
+
+	"bee/audit/internal/platform"
 )

 func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
@@ -22,6 +26,51 @@ func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
 	_, _ = w.Write([]byte(body))
 }

+func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
+	task, samples, _, _, ok := h.taskSamplesForRequest(r)
+	if !ok {
+		http.NotFound(w, r)
+		return
+	}
+	type taskChartIndexEntry struct {
+		Title string `json:"title"`
+		File  string `json:"file"`
+	}
+	entries := make([]taskChartIndexEntry, 0)
+	for _, spec := range taskChartSpecsForSamples(samples) {
+		title, _, ok := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task))
+		if !ok {
+			continue
+		}
+		entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
+	}
+	w.Header().Set("Cache-Control", "no-store")
+	w.Header().Set("Content-Type", "application/json; charset=utf-8")
+	_ = json.NewEncoder(w).Encode(entries)
+}
+
+func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
+	task, samples, _, _, ok := h.taskSamplesForRequest(r)
+	if !ok {
+		http.NotFound(w, r)
+		return
+	}
+	file := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
+	path, ok := taskChartPathFromFile(file)
+	if !ok {
+		http.NotFound(w, r)
+		return
+	}
+	title, buf, hasData := renderTaskChartSVG(path, samples, taskTimelineForTask(task))
+	if !hasData || len(buf) == 0 || strings.TrimSpace(title) == "" {
+		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
+		return
+	}
+	w.Header().Set("Content-Type", "image/svg+xml")
+	w.Header().Set("Cache-Control", "no-store")
+	_, _ = w.Write(buf)
+}
+
 func renderTaskDetailPage(opts HandlerOptions, task Task) string {
 	title := task.Name
 	if strings.TrimSpace(title) == "" {
@@ -30,6 +79,9 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
 	var body strings.Builder
 	body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
 	body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
+	if task.Status == TaskRunning || task.Status == TaskPending {
+		body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
+	}
 	body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
 	body.WriteString(`</div>`)

@@ -45,17 +97,113 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
 		body.WriteString(`</div></div>`)
 	}

+	if task.Status == TaskRunning {
+		body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
+		body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
+		body.WriteString(`</div></div>`)
+	}
+
 	if task.Status == TaskRunning || task.Status == TaskPending {
 		body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
 		body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
 		body.WriteString(`</div></div>`)
 		body.WriteString(`<script>
+function cancelTaskDetail(id) {
+  fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
+    var term = document.getElementById('task-live-log');
+    if (term) {
+      term.textContent += '\nCancel requested.\n';
+      term.scrollTop = term.scrollHeight;
+    }
+  });
+}
+function renderTaskLiveCharts(taskId, charts) {
+  const host = document.getElementById('task-live-charts');
+  if (!host) return;
+  if (!Array.isArray(charts) || charts.length === 0) {
+    host.innerHTML = 'Waiting for metric samples...';
+    return;
+  }
+  // No chart rendered yet: drop the placeholder text so it does not linger above the cards.
+  if (!host.querySelector('img[data-task-chart="1"]')) host.textContent = '';
+  const seen = {};
+  charts.forEach(function(chart) {
+    seen[chart.file] = true;
+    let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
+    if (img) {
+      const card = img.closest('.card');
+      if (card) {
+        const title = card.querySelector('.card-head');
+        if (title) title.textContent = chart.title;
+      }
+      return;
+    }
+    const card = document.createElement('div');
+    card.className = 'card';
+    card.style.margin = '0';
+    card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
+    card.querySelector('.card-head').textContent = chart.title;
+    const body = card.querySelector('.card-body');
+    img = document.createElement('img');
+    img.setAttribute('data-task-chart', '1');
+    img.setAttribute('data-chart-file', chart.file);
+    img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
+    img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
+    img.style.cssText = 'width:100%;display:block;border-radius:6px';
+    img.alt = chart.title;
+    body.appendChild(img);
+    host.appendChild(card);
+  });
+  Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
+    const file = img.getAttribute('data-chart-file') || '';
+    if (seen[file]) return;
+    const card = img.closest('.card');
+    if (card) card.remove();
+  });
+}
+function loadTaskLiveCharts(taskId) {
+  fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
+    renderTaskLiveCharts(taskId, charts);
+  }).catch(function(){
+    const host = document.getElementById('task-live-charts');
+    if (host) host.innerHTML = 'Task charts are unavailable.';
+  });
+}
+function refreshTaskLiveCharts() {
+  document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
+    const base = img.dataset.baseSrc;
+    if (!base) return;
+    img.src = base + '?t=' + Date.now();
+  });
+}
 var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
 var _taskDetailTerm = document.getElementById('task-live-log');
+var _taskChartTimer = null;
+var _taskChartsFrozen = false;
 _taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
 _taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
-_taskDetailES.addEventListener('done', function(){ _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); });
-_taskDetailES.onerror = function(){ _taskDetailES.close(); };
+_taskDetailES.addEventListener('done', function(e){
+  if (_taskChartTimer) clearInterval(_taskChartTimer);
+  _taskDetailES.close();
+  _taskDetailES = null;
+  _taskChartsFrozen = true;
+  _taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
+  _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
+  refreshTaskLiveCharts();
+});
+_taskDetailES.onerror = function(){
+  if (_taskChartTimer) clearInterval(_taskChartTimer);
+  if (_taskDetailES) {
+    _taskDetailES.close();
+    _taskDetailES = null;
+  }
+};
+loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
+_taskChartTimer = setInterval(function(){
+  if (_taskChartsFrozen) return;
+  loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
+  refreshTaskLiveCharts();
+}, 2000);
 </script>`)
 	}

@@ -83,3 +231,37 @@ func taskArtifactDownloadLink(task Task, absPath string) string {
 	}
 	return fmt.Sprintf(`/export/file?path=%s`, absPath)
 }
+
+func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
+	id := r.PathValue("id")
+	taskPtr, ok := globalQueue.findByID(id)
+	if !ok {
+		return Task{}, nil, time.Time{}, time.Time{}, false
+	}
+	task := *taskPtr
+	start, end := taskTimeWindow(&task)
+	samples, err := loadTaskMetricSamples(start, end)
+	if err != nil {
+		return task, nil, start, end, true
+	}
+	return task, samples, start, end, true
+}
+
+func taskTimelineForTask(task Task) []chartTimelineSegment {
+	start, end := taskTimeWindow(&task)
+	return []chartTimelineSegment{{Start: start, End: end, Active: true}}
+}
+
+// taskChartPathFromFile resolves a chart file name to its chart path and reports false for unknown names.
+func taskChartPathFromFile(file string) (string, bool) {
+	name := strings.TrimSpace(file)
+	for i := range taskDashboardChartSpecs {
+		if taskDashboardChartSpecs[i].File == name {
+			return taskDashboardChartSpecs[i].Path, true
+		}
+	}
+	if rest, ok := strings.CutPrefix(name, "gpu-"); ok && strings.HasSuffix(name, "-overview.svg") {
+		return "gpu/" + strings.TrimSuffix(rest, "-overview.svg") + "-overview", true
+	}
+	return "", false
+}
--- a/audit/internal/webui/task_report.go
+++ b/audit/internal/webui/task_report.go
@@ -53,6 +53,18 @@ var taskDashboardChartSpecs = []taskChartSpec{
 	{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
 }

+// taskChartSpecsForSamples lists the dashboard chart specs followed by one
+// overview chart spec per GPU index that taskGPUIndices finds in samples.
+func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
+	gpuIndices := taskGPUIndices(samples) // computed once: sizes the slice and drives the loop
+	specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(gpuIndices))
+	specs = append(specs, taskDashboardChartSpecs...)
+	for _, idx := range gpuIndices {
+		specs = append(specs, taskChartSpec{Path: fmt.Sprintf("gpu/%d-overview", idx), File: fmt.Sprintf("gpu-%d-overview.svg", idx)})
+	}
+	return specs
+}
+
 func writeTaskReportArtifacts(t *Task) error {
 	if t == nil {
 		return nil
@@ -136,7 +148,7 @@ func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMe
 	timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
 	var charts []taskReportChart
 	inline := make(map[string]string)
-	for _, spec := range taskDashboardChartSpecs {
+	for _, spec := range taskChartSpecsForSamples(samples) {
 		title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
 		if !ok || len(svg) == 0 {
 			continue
@@ -148,24 +160,17 @@ func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMe
 		charts = append(charts, taskReportChart{Title: title, File: spec.File})
 		inline[spec.File] = string(svg)
 	}
-
-	for _, idx := range taskGPUIndices(samples) {
-		file := fmt.Sprintf("gpu-%d-overview.svg", idx)
-		svg, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
-		if err != nil || !ok || len(svg) == 0 {
-			continue
-		}
-		path := filepath.Join(dir, file)
-		if err := os.WriteFile(path, svg, 0644); err != nil {
-			continue
-		}
-		charts = append(charts, taskReportChart{Title: gpuDisplayLabel(idx) + " Overview", File: file})
-		inline[file] = string(svg)
-	}
 	return charts, inline
 }

 func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
+	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
+		buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
+		if err != nil || !hasData {
+			return "", nil, false
+		}
+		return gpuDisplayLabel(idx) + " Overview", buf, true
+	}
 	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		return "", nil, false
@@ -225,15 +230,16 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
 	b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
 	b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
 	b.WriteString(`</div></div></div>`)
+	if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
+		b.WriteString(benchmarkCard)
+	}

 	if len(report.Charts) > 0 {
-		b.WriteString(`<div class="grid2">`)
 		for _, chart := range report.Charts {
 			b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
 			b.WriteString(charts[chart.File])
 			b.WriteString(`</div></div>`)
 		}
-		b.WriteString(`</div>`)
 	} else {
 		b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
 	}
@@ -244,6 +250,57 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
 	return b.String()
 }

+func renderTaskBenchmarkResultsCard(target, logText string) string {
+	if strings.TrimSpace(target) != "nvidia-benchmark" {
+		return ""
+	}
+	resultPath := taskBenchmarkResultPath(logText)
+	if strings.TrimSpace(resultPath) == "" {
+		return ""
+	}
+	columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
+	if len(runs) == 0 {
+		return ""
+	}
+	return renderBenchmarkResultsCardFromRuns(
+		"Benchmark Results",
+		"Composite score for this benchmark task.",
+		"No benchmark results were saved for this task.",
+		columns,
+		runs,
+	)
+}
+
+func taskBenchmarkResultPath(logText string) string {
+	archivePath := taskArchivePathFromLog(logText)
+	if archivePath == "" {
+		return ""
+	}
+	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
+	if runDir == archivePath {
+		return ""
+	}
+	return filepath.Join(runDir, "result.json")
+}
+
+// taskArchivePathFromLog scans logText from the bottom up and returns the path
+// of the last "Archive:" line that ends in ".tar.gz", or "" when none does.
+func taskArchivePathFromLog(logText string) string {
+	entries := strings.Split(logText, "\n")
+	for i := len(entries) - 1; i >= 0; i-- {
+		rest, found := strings.CutPrefix(strings.TrimSpace(entries[i]), "Archive:")
+		if !found {
+			continue
+		}
+		// Tolerate the "Archive: Archive written to <path>" form as well.
+		archive := strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(rest), "Archive written to "))
+		if strings.HasSuffix(archive, ".tar.gz") {
+			return archive
+		}
+	}
+	return ""
+}
+
 func renderTaskStatusBadge(status string) string {
 	className := map[string]string{
 		TaskRunning:   "badge-ok",
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -115,14 +115,17 @@ type Task struct {
 // taskParams holds optional parameters parsed from the run request.
 type taskParams struct {
 	Duration           int      `json:"duration,omitempty"`
-	DiagLevel          int      `json:"diag_level,omitempty"`
+	StressMode         bool     `json:"stress_mode,omitempty"`
 	GPUIndices         []int    `json:"gpu_indices,omitempty"`
 	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
+	StaggerGPUStart    bool     `json:"stagger_gpu_start,omitempty"`
 	SizeMB             int      `json:"size_mb,omitempty"`
+	Passes             int      `json:"passes,omitempty"`
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
+	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
 	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
@@ -160,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
 	}
 }

+func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
+	if enabled && len(selected) > 1 {
+		return 180
+	}
+	return 0
+}
+
 func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
 	acceptanceCycles := []platform.PlatformStressCycle{
 		{LoadSec: 85, IdleSec: 5},
@@ -214,11 +224,11 @@ var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
 const maxTaskHistory = 50

 var (
-	runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-		return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
+	runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
+		return a.RunMemoryAcceptancePackCtx(ctx, baseDir, sizeMB, passes, logFunc)
 	}
-	runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-		return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
+	runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
+		return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
 	}
 	runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
 		return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
@@ -423,13 +433,14 @@ func (q *taskQueue) worker() {
 			setCPUGovernor("performance")
 			defer setCPUGovernor("powersave")

-			// Drain all pending tasks and start them in parallel.
-			q.mu.Lock()
-			var batch []*Task
 			for {
+				q.mu.Lock()
 				t := q.nextPending()
 				if t == nil {
-					break
+					q.prune()
+					q.persistLocked()
+					q.mu.Unlock()
+					return
 				}
 				now := time.Now()
 				t.Status = TaskRunning
@@ -438,29 +449,14 @@ func (q *taskQueue) worker() {
 				t.ErrMsg = ""
 				j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
 				t.job = j
-				batch = append(batch, t)
-			}
-			if len(batch) > 0 {
 				q.persistLocked()
-			}
-			q.mu.Unlock()
+				q.mu.Unlock()

-			var wg sync.WaitGroup
-			for _, t := range batch {
-				t := t
-				j := t.job
 				taskCtx, taskCancel := context.WithCancel(context.Background())
 				j.cancel = taskCancel
-				wg.Add(1)
-				goRecoverOnce("task "+t.Target, func() {
-					defer wg.Done()
-					defer taskCancel()
-					q.executeTask(t, j, taskCtx)
-				})
-			}
-			wg.Wait()
+				q.executeTask(t, j, taskCtx)
+				taskCancel()

-			if len(batch) > 0 {
 				q.mu.Lock()
 				q.prune()
 				q.persistLocked()
@@ -565,7 +561,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		diagLevel := t.params.DiagLevel
+		diagLevel := 2
+		if t.params.StressMode {
+			diagLevel = 3
+		}
 		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
 			result, e := a.RunNvidiaAcceptancePackWithOptions(
 				ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -599,8 +598,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			GPUIndices:        t.params.GPUIndices,
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
 			RunNCCL:           t.params.RunNCCL,
+			ParallelGPUs:      t.params.ParallelGPUs,
 		}, j.append)
-	case "nvidia-compute":
+		case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
@@ -609,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
-		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
+			staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
+			if staggerSec > 0 {
+				j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
+			}
+			archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
 	case "nvidia-targeted-power":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -659,24 +663,29 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
-		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
-			DurationSec:       dur,
-			Loader:            t.params.Loader,
-			GPUIndices:        t.params.GPUIndices,
-			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
-		}, j.append)
+			archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
+				DurationSec:       dur,
+				Loader:            t.params.Loader,
+				GPUIndices:        t.params.GPUIndices,
+				ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+				StaggerSeconds:    boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
+			}, j.append)
 	case "memory":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
+		sizeMB, passes := 256, 1
+		if t.params.StressMode {
+			sizeMB, passes = 1024, 3
+		}
+		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
 	case "storage":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
+		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
 	case "cpu":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -687,7 +696,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		if dur <= 0 {
-			dur = 60
+			if t.params.StressMode {
+				dur = 1800
+			} else {
+				dur = 60
+			}
 		}
 		j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
 		archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
@@ -1163,7 +1176,32 @@ func taskArtifactsDir(root string, t *Task, status string) string {
 	if strings.TrimSpace(root) == "" || t == nil {
 		return ""
 	}
-	return filepath.Join(root, fmt.Sprintf("%s_%s_%s", t.ID, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
+	prefix := taskFolderNumberPrefix(t.ID)
+	return filepath.Join(root, fmt.Sprintf("%s_%s_%s", prefix, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
+}
+
+func taskFolderNumberPrefix(taskID string) string {
+	taskID = strings.TrimSpace(taskID)
+	if strings.HasPrefix(taskID, "TASK-") && len(taskID) >= len("TASK-000") {
+		num := strings.TrimSpace(strings.TrimPrefix(taskID, "TASK-"))
+		if len(num) == 3 {
+			allDigits := true
+			for _, r := range num {
+				if r < '0' || r > '9' {
+					allDigits = false
+					break
+				}
+			}
+			if allDigits {
+				return num
+			}
+		}
+	}
+	fallback := sanitizeTaskFolderPart(taskID)
+	if fallback == "" {
+		return "000"
+	}
+	return fallback
 }

 func ensureTaskReportPaths(t *Task) {
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -163,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
 	}
 }

+func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	origCounter := jobCounter.Load()
+	jobCounter.Store(0)
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+		jobCounter.Store(origCounter)
+	})
+
+	if got := newJobID("ignored"); got != "TASK-000" {
+		t.Fatalf("id=%q want TASK-000", got)
+	}
+	if got := newJobID("ignored"); got != "TASK-001" {
+		t.Fatalf("id=%q want TASK-001", got)
+	}
+}
+
+func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
+	root := t.TempDir()
+	task := &Task{
+		ID:   "TASK-007",
+		Name: "NVIDIA Benchmark",
+	}
+	got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
+	if !strings.HasPrefix(got, "007_") {
+		t.Fatalf("artifacts dir=%q want prefix 007_", got)
+	}
+}
+
 func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
 	dir := t.TempDir()
 	logPath := filepath.Join(dir, "task.log")
@@ -325,6 +359,78 @@ func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
 	}
 }

+func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
+	dir := t.TempDir()
+	metricsPath := filepath.Join(dir, "metrics.db")
+	prevMetricsPath := taskReportMetricsDBPath
+	taskReportMetricsDBPath = metricsPath
+	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
+
+	benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	result := platform.NvidiaBenchmarkResult{
+		GeneratedAt:      time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
+		BenchmarkProfile: "standard",
+		OverallStatus:    "OK",
+		GPUs: []platform.BenchmarkGPUResult{
+			{
+				Index: 0,
+				Name:  "NVIDIA H100 PCIe",
+				Scores: platform.BenchmarkScorecard{
+					CompositeScore: 1176.25,
+				},
+			},
+		},
+	}
+	raw, err := json.Marshal(result)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
+	if err := os.MkdirAll(artifactsDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	task := &Task{
+		ID:           "task-bench",
+		Name:         "NVIDIA Benchmark",
+		Target:       "nvidia-benchmark",
+		Status:       TaskDone,
+		CreatedAt:    time.Now().UTC().Add(-time.Minute),
+		ArtifactsDir: artifactsDir,
+	}
+	ensureTaskReportPaths(task)
+	logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
+	if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := writeTaskReportArtifacts(task); err != nil {
+		t.Fatalf("writeTaskReportArtifacts: %v", err)
+	}
+
+	body, err := os.ReadFile(task.ReportHTMLPath)
+	if err != nil {
+		t.Fatalf("ReadFile(report.html): %v", err)
+	}
+	html := string(body)
+	for _, needle := range []string{
+		`Benchmark Results`,
+		`Composite score for this benchmark task.`,
+		`GPU #0 — NVIDIA H100 PCIe`,
+		`1176.25`,
+	} {
+		if !strings.Contains(html, needle) {
+			t.Fatalf("report missing %q: %s", needle, html)
+		}
+	}
+}
+
 func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
 	var lines []string
 	prev := taskSerialWriteLine
--- a/bible-local/docs/benchmark-clock-calibration.md
+++ b/bible-local/docs/benchmark-clock-calibration.md
@@ -0,0 +1,248 @@
+# Benchmark clock calibration research
+
+## Status
+In progress. Baseline data from production servers pending.
+
+## Background
+
+The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
+before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
+`avg_steady_clock < locked_target * 0.90`.
+
+Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
+even a healthy GPU in a non-ideal server will sustain clocks well below boost.
+The 90% threshold has no empirical basis.
+
+## Key observations (2026-04-06)
+
+### H100 PCIe — new card, server not designed for it
+- avg clock 1384 MHz, P95 1560 MHz (unstable; boost clock is 1755 MHz)
+- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
+- Stability: 70.0 — clocks erratic, no equilibrium found
+- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
+
+### H200 NVL — new card, server not designed for it
+- avg clock = P95 = 1635 MHz (perfectly stable)
+- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
+- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
+- Degradation: power_capped, thermal_limited
+- Compute: 989 TOPS — card is computing correctly for its frequency
+
+### Key insight
+The meaningful distinction is not *whether* the card throttles but *how stably*
+it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
+H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
+instability may reflect a more severe thermal mismatch or a card issue.
+
+`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
+`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
+
+## Hypothesis for baseline
+
+After testing on servers designed for their GPUs (proper cooling):
+- Healthy GPU under sustained load will run at a stable fraction of boost
+- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
+- Base clock (`clocks.base.gr`) may be a better reference than boost:
+  a healthy card under real workload should comfortably exceed base clock
+
+## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
+
+Source: external stress test tool, ~90s runs, designed server, adequate power.
+
+### Healthy fingerprint
+
+- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
+- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
+  - Avg steady (visual): **~1580–1620 MHz**
+  - vs boost 1755 MHz: **~91–92%**
+  - Oscillation is NORMAL — this is the boost algorithm balancing under power cap
+  - Stable power + oscillating clocks = healthy power-cap behavior
+- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
+- **Consistency**: all 10 samples within ±20 MHz — very repeatable
+
+### Characteristic pattern
+Flat power line + oscillating/declining clock line = GPU correctly managed by
+power cap algorithm. Do NOT flag this as instability.
+
+### Clock CV implication
+The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
+The current `variance_too_high` threshold (StabilityScore < 85) may fire on
+healthy HBM2e PCIe cards. Needs recalibration.
+
+---
+
+## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
+
+Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
+Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
+
+### GPU clock reference (from nvidia-smi, idle):
+- base_clock_mhz: **1095**
+- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
+- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
+- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
+
+### Observed under 700W sustained load (both samples nearly identical):
+- Power: ~700W flat — SXM slot, adequate power confirmed
+- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
+- vs 1980 MHz (lock target): **72–74%** — severely below
+- vs 1755 MHz (nvidia-smi boost): **81–83%**
+- vs 1095 MHz (base): 130% — above base but far below expected for SXM
+- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
+- Temperature: 38°C → 79–80°C (same rate as HBM2e)
+- Oscillation: present, similar character to HBM2e but at much lower frequency
+
+### Diagnosis
+These restored cards are degraded. A healthy H100 SXM in a designed server
+(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
+The 72–74% result is a clear signal of silicon or VRM degradation from the
+refurbishment process.
+
+### Clock pattern note
+Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
+to images 19/20. Both sample sets show same degraded pattern — same batch.
+
+---
+
+## Baseline matrix (filled where data available)
+
+| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
+|---|---|---|---|---|---|
+| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
+| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
+| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
+| H200 NVL | designed | TBD | TBD | TBD | need baseline |
+
+---
+
+## H100 official spec (from NVIDIA datasheet)
+
+Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
+All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
+
+| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
+|---|---|---|---|---|---|
+| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
+| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
+| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
+| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
+
+Notes:
+- SXM boards do NOT list FP8 peak in this table (field empty)
+- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
+- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
+
+## Observed efficiency (H100 80GB PCIe, throttled server)
+
+From the report in this session (power+thermal throttle throughout steady):
+
+| Precision | Measured | Spec (dense) | % of spec |
+|---|---|---|---|
+| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
+| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
+| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
+
+33–44% of spec is expected given sustained power+thermal throttle (avg clock
+1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
+actual frequency — the low TOPS comes from throttle, not silicon defect.
+
+## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
+
+Format: without sparsity / with sparsity.
+
+| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
+|---|---|---|---|---|---|
+| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
+| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
+
+## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
+
+Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
+
+| Precision | Measured | Spec (dense) | % of spec |
+|---|---|---|---|
+| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
+| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
+| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
+
+Comparable to H100 PCIe efficiency (33–44%) despite different architecture —
+both are throttle-limited. Confirms that % of spec is not a quality signal,
+it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
+
+## Real-world GEMM efficiency reference (2026-04-06, web research)
+
+Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
+worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
+
+### What healthy systems actually achieve:
+- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
+- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
+- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
+
+### Our results vs expectation:
+| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
+|---|---|---|---|---|
+| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
+| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
+
+Our results are roughly **half** of what a healthy system achieves even under throttle.
+This is NOT normal — 30-44% is not the industry baseline.
+
+### Likely causes of the gap (in order of probability):
+1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
+2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
+   Previous user may have set a lower limit via nvidia-smi -pl and it was not
+   reset. Our normalization sets clock locks but does NOT reset power limit.
+   Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
+3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
+   8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
+
+### Power limit gap analysis (H100 PCIe):
+- Avg clock 1384 MHz = 79% of boost 1755 MHz
+- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
+- Actually measured: 329 TOPS = 55% of that estimate
+- Remaining gap after accounting for clock throttle: ~45%
+- Most likely explanation: enforced power limit < 350W TDP, further reducing
+  sustainable clock beyond what sw_thermal alone would cause.
+
+### Action item:
+Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
+so result.json shows if the card was pre-configured with a non-default limit.
+If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
+
+### CPU/RAM impact on GPU FLOPS:
+None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
+CPU core count and host RAM are irrelevant.
+
+## Compute efficiency metric (proposed, no hardcode)
+
+Instead of comparing TOPS to a hardcoded spec, compute:
+  tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
+
+This is model-agnostic. A GPU computing correctly at its actual frequency
+will show a consistent tops_per_sm_per_ghz regardless of throttle level.
+A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
+normal clocks.
+
+SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
+(needs to be added to queryBenchmarkGPUInfo).
+
+Reference values to establish after baseline runs:
+- H100 PCIe fp16_tensor: TBD tops/SM/GHz
+- H100 SXM fp16_tensor: TBD tops/SM/GHz
+
+## Proposed threshold changes (pending more data)
+
+1. **`low_sm_clock_vs_target`**: lower threshold from 90% to 85% based on observed
+   91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
+   capture the root cause.
+
+2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
+   under power cap. Consider suppressing this flag when power is flat and usage
+   is 100% (oscillation is expected). Or lower threshold to 70.
+
+3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
+   ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
+   would have been caught by this).
+
+Decision deferred until baseline data on SXM designed servers is collected.
--- a/bible-local/docs/gpu-model-propagation.md
+++ b/bible-local/docs/gpu-model-propagation.md
@@ -0,0 +1,117 @@
+# GPU Model Name Propagation
+
+How GPU model names are detected, stored, and displayed throughout the project.
+
+---
+
+## Detection Sources
+
+There are **three separate pipelines** for GPU model names — they use different structs and don't share state.
+
+### Pipeline A — Live / SAT (nvidia-smi query at runtime)
+
+**File:** `audit/internal/platform/sat.go`
+
+- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
+- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
+- Used by: GPU selection UI, live metrics labels, burn/stress test logic
+
+### Pipeline B — Benchmark results
+
+**File:** `audit/internal/platform/benchmark.go`, line 124
+
+- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
+- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
+- Used by: benchmark history table, benchmark report
+
+### Pipeline C — Hardware audit JSON (PCIe schema)
+
+**File:** `audit/internal/schema/hardware.go`
+
+- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
+- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
+- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
+- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
+
+---
+
+## Key Inconsistency: NVIDIA PCIe Model is Never Set
+
+`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
+
+This means:
+- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
+- AMD GPUs do have their model populated
+
+The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
+
+---
+
+## Benchmark History "Unknown GPU" Issue
+
+**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
+
+**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
+
+This happens for:
+- Older result files saved before the `Name` field was added
+- Runs where nvidia-smi query failed before the benchmark started
+
+---
+
+## Fallback Strings — Current State
+
+| Location | File | Fallback string |
+|---|---|---|
+| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
+| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
+| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
+| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
+| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
+| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
+| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
+| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
+
+**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
+
+---
+
+## GPU Selection UI
+
+**File:** `audit/internal/webui/pages.go`
+
+- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
+- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
+- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
+
+This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
+
+---
+
+## Data Flow Summary
+
+```
+nvidia-smi (live)
+  └─ ListNvidiaGPUs() → NvidiaGPU.Name
+       ├─ GPU selection UI (always correct)
+       ├─ Live metrics labels (charts_svg.go)
+       └─ SAT/burn status file (sat.go)
+
+nvidia-smi (at benchmark start)
+  └─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
+       └─ BenchmarkGPUResult.Name (json:"name,omitempty")
+            ├─ Benchmark report
+            └─ Benchmark history table columns
+
+nvidia-smi / lspci (audit collection)
+  └─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
+       └─ Hardware summary page hwDescribeGPU()
+```
+
+---
+
+## What Needs Fixing
+
+1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
+2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
+3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -32,7 +32,7 @@ lb config noauto \
    --memtest memtest86+ \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
-    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
+    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -36,7 +36,6 @@ typedef void *CUstream;
 #define MAX_CUBLAS_PROFILES 5
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
-#define STRESS_LAUNCH_DEPTH 8

 static const char *ptx_source =
    ".version 6.0\n"
@@ -344,7 +343,6 @@ static int run_ptx_fallback(struct cuda_api *api,
    unsigned long iterations = 0;
    int mp_count = 0;
    int stream_count = 1;
-    int launches_per_wave = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "driver-ptx");
@@ -419,44 +417,42 @@ static int run_ptx_fallback(struct cuda_api *api,

    unsigned int threads = 256;

-    double start = now_seconds();
-    double deadline = start + (double)seconds;
+    double deadline = now_seconds() + (double)seconds;
+    double next_sync = now_seconds() + 1.0;
    while (now_seconds() < deadline) {
-        launches_per_wave = 0;
-        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
-            int launched_this_batch = 0;
-            for (int lane = 0; lane < stream_count; lane++) {
-                unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
-                if (!check_rc(api,
-                              "cuLaunchKernel",
-                              api->cuLaunchKernel(kernel,
-                                                  blocks,
-                                                  1,
-                                                  1,
-                                                  threads,
-                                                  1,
-                                                  1,
-                                                  0,
-                                                  streams[lane],
-                                                  params[lane],
-                                                  NULL))) {
-                    goto fail;
-                }
-                launches_per_wave++;
-                launched_this_batch++;
-            }
-            if (launched_this_batch <= 0) {
-                break;
+        int launched = 0;
+        for (int lane = 0; lane < stream_count; lane++) {
+            unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
+            if (!check_rc(api,
+                          "cuLaunchKernel",
+                          api->cuLaunchKernel(kernel,
+                                              blocks,
+                                              1,
+                                              1,
+                                              threads,
+                                              1,
+                                              1,
+                                              0,
+                                              streams[lane],
+                                              params[lane],
+                                              NULL))) {
+                goto fail;
            }
+            launched++;
+            iterations++;
        }
-        if (launches_per_wave <= 0) {
+        if (launched <= 0) {
            goto fail;
        }
-        if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
-            goto fail;
+        double now = now_seconds();
+        if (now >= next_sync || now >= deadline) {
+            if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
+                goto fail;
+            }
+            next_sync = now + 1.0;
        }
-        iterations += (unsigned long)launches_per_wave;
    }
+    api->cuCtxSynchronize();

    if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
        goto fail;
@@ -468,11 +464,10 @@ static int run_ptx_fallback(struct cuda_api *api,
    report->iterations = iterations;
    snprintf(report->details,
             sizeof(report->details),
-             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
+             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
             size_mb,
             report->buffer_mb,
             report->stream_count,
-             STRESS_LAUNCH_DEPTH,
             bytes_per_stream[0] / (1024u * 1024u),
             iterations);

@@ -606,6 +601,20 @@ struct prepared_profile {
 };

 static const struct profile_desc k_profiles[] = {
+    {
+        "fp64",
+        "fp64",
+        80,
+        1,
+        0,
+        0,
+        8,
+        CUDA_R_64F,
+        CUDA_R_64F,
+        CUDA_R_64F,
+        CUDA_R_64F,
+        CUBLAS_COMPUTE_64F,
+    },
    {
        "fp32_tf32",
        "fp32",
@@ -1126,7 +1135,6 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    int stream_count = 1;
    int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
    int prepared_count = 0;
-    int wave_launches = 0;
    size_t requested_budget = 0;
    size_t total_budget = 0;
    size_t per_profile_budget = 0;
@@ -1193,11 +1201,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details,
                  sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
                  size_mb,
                  report->buffer_mb,
                  report->stream_count,
-                  STRESS_LAUNCH_DEPTH,
                  mp_count,
                  per_profile_budget / (1024u * 1024u));

@@ -1246,50 +1253,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        return 0;
    }

+    /* Keep the GPU queue continuously full by submitting kernels without
+     * synchronizing after every wave.  A sync barrier after each small batch
+     * creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
+     * especially when individual kernels are short.  Instead we sync at most
+     * once per second (for error detection) and once at the very end. */
    double deadline = now_seconds() + (double)seconds;
+    double next_sync = now_seconds() + 1.0;
    while (now_seconds() < deadline) {
-        wave_launches = 0;
-        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
-            int launched_this_batch = 0;
-            for (int i = 0; i < prepared_count; i++) {
-                if (!prepared[i].ready) {
-                    continue;
-                }
-                if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
-                    append_detail(report->details,
-                                  sizeof(report->details),
-                                  "%s=FAILED runtime\n",
-                                  prepared[i].desc.name);
-                    for (int j = 0; j < prepared_count; j++) {
-                        destroy_profile(&cublas, cuda, &prepared[j]);
-                    }
-                    cublas.cublasLtDestroy(handle);
-                    destroy_streams(cuda, streams, stream_count);
-                    cuda->cuCtxDestroy(ctx);
-                    return 0;
-                }
-                prepared[i].iterations++;
-                report->iterations++;
-                wave_launches++;
-                launched_this_batch++;
+        int launched = 0;
+        for (int i = 0; i < prepared_count; i++) {
+            if (!prepared[i].ready) {
+                continue;
            }
-            if (launched_this_batch <= 0) {
-                break;
+            if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
+                append_detail(report->details,
+                              sizeof(report->details),
+                              "%s=FAILED runtime\n",
+                              prepared[i].desc.name);
+                for (int j = 0; j < prepared_count; j++) {
+                    destroy_profile(&cublas, cuda, &prepared[j]);
+                }
+                cublas.cublasLtDestroy(handle);
+                destroy_streams(cuda, streams, stream_count);
+                cuda->cuCtxDestroy(ctx);
+                return 0;
            }
+            prepared[i].iterations++;
+            report->iterations++;
+            launched++;
        }
-        if (wave_launches <= 0) {
+        if (launched <= 0) {
            break;
        }
-        if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
-            for (int i = 0; i < prepared_count; i++) {
-                destroy_profile(&cublas, cuda, &prepared[i]);
+        double now = now_seconds();
+        if (now >= next_sync || now >= deadline) {
+            if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
+                for (int i = 0; i < prepared_count; i++) {
+                    destroy_profile(&cublas, cuda, &prepared[i]);
+                }
+                cublas.cublasLtDestroy(handle);
+                destroy_streams(cuda, streams, stream_count);
+                cuda->cuCtxDestroy(ctx);
+                return 0;
            }
-            cublas.cublasLtDestroy(handle);
-            destroy_streams(cuda, streams, stream_count);
-            cuda->cuCtxDestroy(ctx);
-            return 0;
+            next_sync = now + 1.0;
        }
    }
+    /* Final drain — ensure all queued work finishes before we read results. */
+    cuda->cuCtxSynchronize();

    for (int i = 0; i < prepared_count; i++) {
        if (!prepared[i].ready) {
--- a/iso/builder/build-in-container.sh
+++ b/iso/builder/build-in-container.sh
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
            ;;
        *)
            echo "unknown arg: $1" >&2
-            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
+            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
            exit 1
            ;;
    esac
 done

 case "$VARIANT" in
-    nvidia|amd|nogpu|all) ;;
-    *) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
+    nvidia|nvidia-legacy|amd|nogpu|all) ;;
+    *) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
 esac

 if [ "$CLEAN_CACHE" = "1" ]; then
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
           "${CACHE_DIR:?}/lb-packages"
    echo "=== cleaning live-build work dirs ==="
    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
+    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
    rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
    rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
+    rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
+    rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
+    rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
+    rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
    echo "=== caches cleared, proceeding with build ==="
 fi

@@ -180,6 +185,9 @@ case "$VARIANT" in
    nvidia)
        run_variant nvidia
        ;;
+    nvidia-legacy)
+        run_variant nvidia-legacy
+        ;;
    amd)
        run_variant amd
        ;;
@@ -188,6 +196,7 @@ case "$VARIANT" in
        ;;
    all)
        run_variant nvidia
+        run_variant nvidia-legacy
        run_variant amd
        run_variant nogpu
        ;;
--- a/iso/builder/build-nvidia-module.sh
+++ b/iso/builder/build-nvidia-module.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
-# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
+# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
 #
 # Downloads the official NVIDIA .run installer, extracts kernel modules and
-# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
+# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
+#   - open         -> kernel-open/ sources from the .run installer
+#   - proprietary  -> traditional proprietary kernel sources from the .run installer
 #
-# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
+# Output is cached in DIST_DIR/nvidia-<flavor>-<version>-<kver>/ so subsequent builds
 # are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
@@ -17,10 +19,19 @@ set -e
 NVIDIA_VERSION="$1"
 DIST_DIR="$2"
 DEBIAN_KERNEL_ABI="$3"
+NVIDIA_FLAVOR="${4:-open}"

-[ -n "$NVIDIA_VERSION" ]    || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
-[ -n "$DIST_DIR" ]          || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
-[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
+[ -n "$NVIDIA_VERSION" ]    || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
+[ -n "$DIST_DIR" ]          || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
+[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
+
+case "$NVIDIA_FLAVOR" in
+    open|proprietary) ;;
+    *)
+        echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
+        exit 1
+        ;;
+esac

 KVER="${DEBIAN_KERNEL_ABI}-amd64"
 # On Debian, kernel headers are split into two packages:
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
 KDIR_ARCH="/usr/src/linux-headers-${KVER}"
 KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"

-echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
+echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="

-if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
-    echo "=== installing linux-headers-${KVER} ==="
-    DEBIAN_FRONTEND=noninteractive apt-get install -y \
-        "linux-headers-${KVER}" \
-        gcc make perl
-fi
-echo "kernel headers (arch):   $KDIR_ARCH"
-echo "kernel headers (common): $KDIR_COMMON"
-
-CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
+CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
 EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
-CACHE_LAYOUT_VERSION="2"
+CACHE_LAYOUT_VERSION="3"
 CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
 if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
        && [ -f "$CACHE_LAYOUT_MARKER" ] \
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
    exit 0
 fi

+if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
+    echo "=== installing linux-headers-${KVER} ==="
+    DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        "linux-headers-${KVER}" \
+        gcc make perl
+fi
+echo "kernel headers (arch):   $KDIR_ARCH"
+echo "kernel headers (common): $KDIR_COMMON"
+
 # Download official NVIDIA .run installer with sha256 verification
 BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
 mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
 rm -rf "$EXTRACT_DIR"
 "$RUN_FILE" --extract-only --target "$EXTRACT_DIR"

-# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
+# Find kernel source directory for the selected flavor.
 KERNEL_SRC=""
-for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
-    [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
-done
-[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
+if [ "$NVIDIA_FLAVOR" = "open" ]; then
+    for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
+        [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
+    done
+else
+    for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
+        [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
+    done
+fi
+[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
 echo "kernel source: $KERNEL_SRC"

 # Build kernel modules
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -15,26 +15,46 @@ DIST_DIR="${REPO_ROOT}/dist"
 VENDOR_DIR="${REPO_ROOT}/iso/vendor"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 AUTH_KEYS=""
+BUILD_VARIANT="nvidia"
 BEE_GPU_VENDOR="nvidia"
+BEE_NVIDIA_MODULE_FLAVOR="open"

 # parse args
 while [ $# -gt 0 ]; do
    case "$1" in
        --authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
-        --variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
+        --variant) BUILD_VARIANT="$2"; shift 2 ;;
        *) echo "unknown arg: $1"; exit 1 ;;
    esac
 done

-case "$BEE_GPU_VENDOR" in
-    nvidia|amd|nogpu) ;;
-    *) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
+case "$BUILD_VARIANT" in
+    nvidia)
+        BEE_GPU_VENDOR="nvidia"
+        BEE_NVIDIA_MODULE_FLAVOR="open"
+        ;;
+    nvidia-legacy)
+        BEE_GPU_VENDOR="nvidia"
+        BEE_NVIDIA_MODULE_FLAVOR="proprietary"
+        ;;
+    amd)
+        BEE_GPU_VENDOR="amd"
+        BEE_NVIDIA_MODULE_FLAVOR=""
+        ;;
+    nogpu)
+        BEE_GPU_VENDOR="nogpu"
+        BEE_NVIDIA_MODULE_FLAVOR=""
+        ;;
+    *)
+        echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
+        exit 1
+        ;;
 esac

-BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
-OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
+BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
+OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"

-export BEE_GPU_VENDOR
+export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT

 . "${BUILDER_DIR}/VERSIONS"
 export PATH="$PATH:/usr/local/go/bin"
@@ -627,7 +647,7 @@ recover_iso_memtest() {

 AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
 ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
-ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
+ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
 # Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
 OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
 mkdir -p "${OUT_DIR}"
@@ -801,7 +821,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
    apt-get install -y "linux-headers-${KVER}"
 fi

-echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
+echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
 echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
 echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
 echo ""
@@ -871,7 +891,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    fi
 fi

-echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
+echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
 mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"

 # Sync builder config into variant work dir, preserving lb cache.
@@ -897,6 +917,86 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
    rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
 fi

+if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
+    cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
+source /boot/grub/config.cfg
+
+echo ""
+echo "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗"
+echo "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝"
+echo "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗"
+echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
+echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
+echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
+echo "  Hardware Audit LiveCD"
+echo ""
+
+menuentry "EASY-BEE" {
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+    initrd  @INITRD_LIVE@
+}
+
+submenu "EASY-BEE (advanced options) -->" {
+    menuentry "EASY-BEE — KMS (no nomodeset)" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        initrd  @INITRD_LIVE@
+    }
+
+    menuentry "EASY-BEE — fail-safe" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
+        initrd  @INITRD_LIVE@
+    }
+}
+
+if [ "${grub_platform}" = "efi" ]; then
+    menuentry "Memory Test (memtest86+)" {
+        chainloader /boot/memtest86+x64.efi
+    }
+else
+    menuentry "Memory Test (memtest86+)" {
+        linux16 /boot/memtest86+x64.bin
+    }
+fi
+
+if [ "${grub_platform}" = "efi" ]; then
+    menuentry "UEFI Firmware Settings" {
+        fwsetup
+    }
+fi
+EOF
+
+    cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
+label live-@FLAVOUR@-normal
+    menu label ^EASY-BEE
+    menu default
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@
+
+label live-@FLAVOUR@-kms
+    menu label EASY-BEE (^graphics/KMS)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ bee.display=kms
+
+label live-@FLAVOUR@-toram
+    menu label EASY-BEE (^load to RAM)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ toram
+
+label live-@FLAVOUR@-failsafe
+    menu label EASY-BEE (^fail-safe)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
+
+label memtest
+    menu label ^Memory Test (memtest86+)
+    linux /boot/memtest86+x64.bin
+EOF
+fi
+
 rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
 rm -f \
    "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
@@ -981,10 +1081,10 @@ done
 # --- NVIDIA kernel modules and userspace libs ---
 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
-        sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
+        sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"

    KVER="${DEBIAN_KERNEL_ABI}-amd64"
-    NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
+    NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"

    # Inject .ko files into overlay at /usr/local/lib/nvidia/
    OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
@@ -1055,13 +1155,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u

 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
+NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
 NCCL_VERSION=${NCCL_VERSION}
 NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
 CUBLAS_VERSION=${CUBLAS_VERSION}
 CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
 NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
 JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
-    GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
+    GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
 elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
    GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
    GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
@@ -1073,6 +1174,7 @@ fi
 cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
 BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
 BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
+BEE_BUILD_VARIANT=${BUILD_VARIANT}
 BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
 BUILD_DATE=${BUILD_DATE}
 GIT_COMMIT=${GIT_COMMIT}
@@ -1083,6 +1185,11 @@ EOF

 # Write GPU vendor marker for hooks
 echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
+if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
+    echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
+else
+    rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
+fi

 # Patch motd with build info
 BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
@@ -1153,10 +1260,10 @@ fi

 # --- build ISO using live-build ---
 echo ""
-echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
+echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="

 # Export for auto/config
-BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
+BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
 export BEE_GPU_VENDOR_UPPER

 cd "${LB_DIR}"
@@ -1191,7 +1298,7 @@ if [ -f "$ISO_RAW" ]; then
    validate_iso_nvidia_runtime "$ISO_RAW"
    cp "$ISO_RAW" "$ISO_OUT"
    echo ""
-    echo "=== done (${BEE_GPU_VENDOR}) ==="
+    echo "=== done (${BUILD_VARIANT}) ==="
    echo "ISO: $ISO_OUT"
    if command -v stat >/dev/null 2>&1; then
        ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
--- a/iso/builder/config/bootloaders/grub-pc/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg
@@ -15,29 +15,21 @@ menuentry "EASY-BEE" {
    initrd  @INITRD_LIVE@
 }

-menuentry "EASY-BEE (graphics/KMS)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
+submenu "EASY-BEE (advanced options) -->" {
+    menuentry "EASY-BEE — GSP=off" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        initrd  @INITRD_LIVE@
+    }

-menuentry "EASY-BEE (load to RAM)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
+    menuentry "EASY-BEE — KMS (no nomodeset)" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        initrd  @INITRD_LIVE@
+    }

-menuentry "EASY-BEE (NVIDIA GSP=off)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
-
-menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
-
-menuentry "EASY-BEE (fail-safe)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
-    initrd  @INITRD_LIVE@
+    menuentry "EASY-BEE — fail-safe" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
+        initrd  @INITRD_LIVE@
+    }
 }

 if [ "${grub_platform}" = "efi" ]; then
--- a/iso/builder/config/bootloaders/grub-pc/theme.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/theme.cfg
@@ -1,9 +1,9 @@
 set color_normal=light-gray/black
-set color_highlight=white/dark-gray
+set color_highlight=yellow/black

 if [ -e /boot/grub/splash.png ]; then
    set theme=/boot/grub/live-theme/theme.txt
 else
-    set menu_color_normal=cyan/black
-    set menu_color_highlight=white/dark-gray
+    set menu_color_normal=yellow/black
+    set menu_color_highlight=white/brown
 fi
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -0,0 +1,117 @@
+#!/bin/sh
+# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
+set -e
+echo "=== generating bee wallpaper ==="
+mkdir -p /usr/share/bee
+
+python3 - <<'PYEOF'
+from PIL import Image, ImageDraw, ImageFont, ImageFilter
+import os
+
+W, H = 1920, 1080
+
+ASCII_ART = [
+    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
+    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
+    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
+    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
+    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
+    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
+]
+SUBTITLE = "  Hardware Audit LiveCD"
+
+FG = (0xF6, 0xD0, 0x47)
+FG_DIM = (0xD4, 0xA9, 0x1C)
+SHADOW = (0x5E, 0x47, 0x05)
+SUB = (0x96, 0x7A, 0x17)
+BG = (0x05, 0x05, 0x05)
+
+MONO_FONT_CANDIDATES = [
+    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
+    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
+]
+SUB_FONT_CANDIDATES = [
+    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
+    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
+]
+
+
+def load_font(candidates, size):
+    for path in candidates:
+        if os.path.exists(path):
+            return ImageFont.truetype(path, size)
+    return ImageFont.load_default()
+
+
+def mono_metrics(font):
+    probe = Image.new('L', (W, H), 0)
+    draw = ImageDraw.Draw(probe)
+    char_w = int(round(draw.textlength("M", font=font)))
+    bb = draw.textbbox((0, 0), "Mg", font=font)
+    char_h = bb[3] - bb[1]
+    return char_w, char_h
+
+
+def render_ascii_mask(font, lines, char_w, char_h, line_gap):
+    width = max(len(line) for line in lines) * char_w
+    height = len(lines) * char_h + line_gap * (len(lines) - 1)
+    mask = Image.new('L', (width, height), 0)
+    draw = ImageDraw.Draw(mask)
+    for row, line in enumerate(lines):
+        y = row * (char_h + line_gap)
+        for col, ch in enumerate(line):
+            if ch == ' ':
+                continue
+            x = col * char_w
+            draw.text((x, y), ch, font=font, fill=255)
+    return mask
+
+
+img = Image.new('RGB', (W, H), BG)
+draw = ImageDraw.Draw(img)
+
+# Soft amber glow under the logo without depending on font rendering.
+glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
+glow_draw = ImageDraw.Draw(glow)
+glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
+glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
+glow = glow.filter(ImageFilter.GaussianBlur(60))
+img = Image.alpha_composite(img.convert('RGBA'), glow)
+
+TARGET_LOGO_W = 400
+max_chars = max(len(line) for line in ASCII_ART)
+_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
+_probe_cw, _ = mono_metrics(_probe_font)
+font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
+font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
+char_w, char_h = mono_metrics(font_logo)
+logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
+logo_w, logo_h = logo_mask.size
+logo_x = (W - logo_w) // 2
+logo_y = 380
+
+sh_off = max(1, font_size_logo // 6)
+shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
+img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
+img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
+img.paste(FG, (logo_x, logo_y), logo_mask)
+
+font_sub = load_font(SUB_FONT_CANDIDATES, 30)
+sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
+sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
+sub_y = logo_y + logo_h + 48
+draw = ImageDraw.Draw(img)
+draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
+draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
+
+img = img.convert('RGB')
+
+img.save('/usr/share/bee/wallpaper.png', optimize=True)
+print('wallpaper written: /usr/share/bee/wallpaper.png')
+PYEOF
+
+echo "=== wallpaper done ==="
--- a/iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
+++ b/iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
@@ -0,0 +1,41 @@
+#!/bin/sh
+# 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
+#
+# live-boot tries "losetup --replace --direct-io=on" when re-associating the
+# loop device to the RAM copy in /dev/shm.  tmpfs does not support O_DIRECT,
+# so the ioctl returns EINVAL and the verification step fails.
+#
+# The patch strips the --direct-io option from the "losetup --replace" call so
+# the loop device is re-associated without O_DIRECT, and rewords the
+# verification error message as a warning (only the message text is changed).
+set -e
+
+TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"
+
+if [ ! -f "${TORAM_SCRIPT}" ]; then
+    echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
+    exit 0
+fi
+
+echo "9010-fix-toram: patching ${TORAM_SCRIPT}"
+
+# Drop the direct-io flag from every "losetup --replace --direct-io[=on]"
+# call; there is no retry or fallback, the flag is simply removed.
+#
+# The sed expressions turn:
+#   losetup --replace --direct-io=on LOOP FILE
+# into:
+#   losetup --replace LOOP FILE
+# We also reword the "Task finished with error." message as a warning; only
+# the echoed text changes, the surrounding control flow is left untouched.
+
+# 1. Strip --direct-io=on from the losetup --replace call so it works on tmpfs.
+sed -i 's/losetup --replace --direct-io=on/losetup --replace/g' "${TORAM_SCRIPT}"
+sed -i 's/losetup --replace --direct-io/losetup --replace/g' "${TORAM_SCRIPT}"
+
+# 2. Reword the hard-error message as a warning (message text only).
+#    live-boot prints this exact string when verification fails.
+sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"
+
+echo "9010-fix-toram: patch applied"
+grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -60,9 +60,15 @@ qrencode
 # Local desktop (openbox + chromium kiosk)
 openbox
 tint2
+feh
+python3-pil
 xorg
 xterm
 chromium
+mousepad
+pcmanfm
+ristretto
+mupdf
 xserver-xorg-video-fbdev
 xserver-xorg-video-vesa
 lightdm
--- a/iso/builder/smoketest.sh
+++ b/iso/builder/smoketest.sh
@@ -27,6 +27,7 @@ echo ""
 KVER=$(uname -r)
 info "kernel: $KVER"
 NVIDIA_BOOT_MODE="normal"
+NVIDIA_MODULES_FLAVOR="proprietary"
 for arg in $(cat /proc/cmdline 2>/dev/null); do
    case "$arg" in
        bee.nvidia.mode=*)
@@ -34,7 +35,11 @@ for arg in $(cat /proc/cmdline 2>/dev/null); do
            ;;
    esac
 done
+if [ -f /etc/bee-nvidia-modules-flavor ]; then
+    NVIDIA_MODULES_FLAVOR="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null || echo proprietary)"
+fi
 info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
+info "nvidia modules flavor: ${NVIDIA_MODULES_FLAVOR}"

 # --- PATH & binaries ---
 echo "-- PATH & binaries --"
@@ -110,10 +115,12 @@ fi
 for mod in nvidia_modeset nvidia_uvm; do
    if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
        ok "module loaded: $mod"
-    elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
+    elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
        fail "module NOT loaded in normal mode: $mod"
-    else
+    elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
        warn "module not loaded in GSP-off mode: $mod"
+    else
+        fail "module NOT loaded: $mod"
    fi
 done

@@ -129,10 +136,12 @@ done

 if [ -e /dev/nvidia-uvm ]; then
    ok "/dev/nvidia-uvm exists"
-elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
+elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
    fail "/dev/nvidia-uvm missing in normal mode"
-else
+elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
    warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
+else
+    fail "/dev/nvidia-uvm missing"
 fi

 echo ""
--- a/iso/overlay/etc/systemd/system/bee-boot-status.service
+++ b/iso/overlay/etc/systemd/system/bee-boot-status.service
@@ -1,6 +1,5 @@
 [Unit]
 Description=Bee: boot status display
-DefaultDependencies=no
 After=systemd-user-sessions.service
 Before=getty@tty1.service

@@ -12,6 +11,8 @@ TTYPath=/dev/tty1
 StandardInput=tty
 StandardOutput=tty
 StandardError=tty
+TTYReset=yes
+TTYVHangup=yes

 [Install]
 WantedBy=multi-user.target
--- a/iso/overlay/usr/local/bin/bee-boot-status
+++ b/iso/overlay/usr/local/bin/bee-boot-status
@@ -2,25 +2,43 @@
 # bee-boot-status — boot progress display on tty1.
 # Shows live service status until all bee services are done or failed,
 # then exits so getty can show the login prompt.
-# GUI (lightdm) starts independently without waiting for this.

-# Services to wait for before handing off to login prompt.
 CRITICAL="bee-preflight bee-nvidia bee-audit"
-# Additional services shown for information only.
-ALL="bee-preflight bee-network bee-nvidia bee-audit bee-web"
+ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
+
+# svc_state prints exactly one state word for unit "$1.service".
+# `systemctl is-active` prints the state itself and exits non-zero for every
+# state other than "active", so a `|| echo inactive` fallback would append a
+# second line and no caller pattern (failed, inactive, activating, ...) would
+# ever match. Fall back to "inactive" only when nothing was printed at all.
+svc_state() {
+    local state
+    state="$(systemctl is-active "$1.service" 2>/dev/null)" || true
+    printf '%s\n' "${state:-inactive}"
+}

 svc_icon() {
-    case "$(systemctl is-active "$1.service" 2>/dev/null)" in
-        active)     printf '\033[32m[  OK  ]\033[0m' ;;
-        failed)     printf '\033[31m[ FAIL ]\033[0m' ;;
-        activating) printf '\033[33m[  ..  ]\033[0m' ;;
-        *)          printf        '[      ]'          ;;
+    case "$(svc_state "$1")" in
+        active)       printf '\033[32m[  OK  ]\033[0m' ;;
+        failed)       printf '\033[31m[ FAIL ]\033[0m' ;;
+        activating)   printf '\033[33m[  ..  ]\033[0m' ;;
+        deactivating) printf '\033[33m[ stop ]\033[0m' ;;
+        inactive)     printf '\033[90m[      ]\033[0m' ;;
+        *)            printf '\033[90m[  ?   ]\033[0m' ;;
+    esac
+}
+
+# svc_detail prints an optional coloured suffix for the status line of
+# service "$1" (unit name without the ".service" suffix). It prints nothing
+# for states other than "failed" and "activating".
+svc_detail() {
+    local svc="$1" state
+    state="$(svc_state "$svc")"
+    case "$state" in
+        failed)
+            # Show systemd's Result property in red, unless it is empty or "success".
+            local res
+            res="$(systemctl show -p Result "$svc.service" 2>/dev/null | cut -d= -f2)"
+            [ -n "$res" ] && [ "$res" != "success" ] && printf '  \033[31m(%s)\033[0m' "$res"
+            ;;
+        activating)
+            # Show the newest journal line of the unit, cut to 55 characters, in grey.
+            local line
+            line="$(journalctl -u "$svc.service" -n 1 --no-pager --output=cat 2>/dev/null | cut -c1-55)"
+            [ -n "$line" ] && printf '  \033[90m%s\033[0m' "$line"
+            ;;
     esac
 }

 all_critical_done() {
    for svc in $CRITICAL; do
-        case "$(systemctl is-active "$svc.service" 2>/dev/null)" in
+        case "$(svc_state "$svc")" in
            active|failed|inactive) ;;
            *) return 1 ;;
        esac
@@ -29,7 +47,9 @@ all_critical_done() {
 }

 while true; do
+    # move to top-left and clear screen
    printf '\033[H\033[2J'
+
    printf '\n'
    printf '  \033[33m███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗\033[0m\n'
    printf '  \033[33m██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝\033[0m\n'
@@ -39,18 +59,31 @@ while true; do
    printf '  \033[33m╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝\033[0m\n'
    printf '  Hardware Audit LiveCD\n'
    printf '\n'
+
    for svc in $ALL; do
-        printf '  %s  %s\n' "$(svc_icon $svc)" "$svc"
+        printf '  %s  %-20s%s\n' "$(svc_icon "$svc")" "$svc" "$(svc_detail "$svc")"
    done
    printf '\n'

-    if all_critical_done; then
-        printf '  \033[1mSystem ready.\033[0m  Audit is running in the background.\n'
-        printf '  Web UI will be available at \033[1mhttp://<ip>/\033[0m when done.\n'
+    # Network
+    ips="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{printf "  %-16s %s\n", $NF, $2}')"
+    if [ -n "$ips" ]; then
+        printf '  \033[1mNetwork:\033[0m\n'
+        printf '%s\n' "$ips"
        printf '\n'
-        sleep 2
+    fi
+
+    if all_critical_done; then
+        printf '  \033[1;32mSystem ready.\033[0m  Audit is running in the background.\n'
+        first_ip="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{print $2}' | cut -d/ -f1 | head -1)"
+        if [ -n "$first_ip" ]; then
+            printf '  Web UI: \033[1mhttp://%s/\033[0m\n' "$first_ip"
+        fi
+        printf '\n'
+        sleep 3
        break
    fi

-    sleep 1
+    printf '  \033[90mStarting up...\033[0m\n'
+    sleep 3
 done
--- a/iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
+++ b/iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
@@ -0,0 +1,110 @@
+#!/bin/sh
+set -eu
+
+# Defaults; each one can be overridden by the matching command-line option.
+# SECONDS is the base run time per GPU, STAGGER_SECONDS the delay between
+# consecutive GPU starts, DEVICES/EXCLUDE are comma-separated GPU index lists.
+# NOTE(review): SECONDS is a special auto-incrementing variable in bash; this
+# script presumably relies on /bin/sh not being bash — confirm.
+SECONDS=300
+STAGGER_SECONDS=180
+DEVICES=""
+EXCLUDE=""
+
+# usage writes the command-line synopsis to stderr and exits with status 2.
+usage() {
+    {
+        echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]"
+    } >&2
+    exit 2
+}
+
+# normalize_list canonicalises the comma-separated list in "$1": whitespace is
+# stripped from every item, empty items are dropped, items are sorted
+# numerically, adjacent duplicates are collapsed and the result is joined
+# with commas again.
+normalize_list() {
+    echo "${1:-}" |
+        tr ',' '\n' |
+        sed 's/[[:space:]]//g' |
+        awk 'NF' |
+        sort -n |
+        uniq |
+        paste -sd, -
+}
+
+# contains_csv succeeds when item "$1" is an element of the comma-separated
+# list "$2" (which may be empty or omitted).
+# The comparison is literal: the quoted part of the case pattern is not
+# subject to glob or regex interpretation, so an item such as "." cannot
+# match an arbitrary list element.
+contains_csv() {
+    case ",${2:-}," in
+        *",$1,"*) return 0 ;;
+    esac
+    return 1
+}
+
+# resolve_dcgmproftester prints what `command -v` reports for the first
+# dcgmproftester variant found (unversioned name first, then 13, 12, 11) and
+# returns 0; it returns 1 when none of them can be resolved.
+resolve_dcgmproftester() {
+    for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
+        command -v "${tool}" >/dev/null 2>&1 || continue
+        command -v "${tool}"
+        return 0
+    done
+    return 1
+}
+
+# Parse command-line options. Every option takes exactly one value; a missing
+# value or an unknown option prints the synopsis and exits via usage().
+while [ "$#" -gt 0 ]; do
+    case "$1" in
+        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
+        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
+        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
+        *) usage ;;
+    esac
+done
+
+# Locate the load generator and enumerate the GPU indices nvidia-smi reports;
+# either one missing is a hard error.
+PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
+ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
+[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
+
+# Start from the explicit --devices list if given, otherwise from every GPU.
+DEVICES=$(normalize_list "${DEVICES}")
+EXCLUDE=$(normalize_list "${EXCLUDE}")
+SELECTED="${DEVICES}"
+if [ -z "${SELECTED}" ]; then
+    SELECTED="${ALL_DEVICES}"
+fi
+
+# FINAL = SELECTED minus every id listed in --exclude, order preserved.
+FINAL=""
+for id in $(echo "${SELECTED}" | tr ',' ' '); do
+    [ -n "${id}" ] || continue
+    if contains_csv "${id}" "${EXCLUDE}"; then
+        continue
+    fi
+    if [ -z "${FINAL}" ]; then
+        FINAL="${id}"
+    else
+        FINAL="${FINAL},${id}"
+    fi
+done
+
+[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
+
+echo "loader=dcgmproftester-staggered"
+echo "selected_gpus=${FINAL}"
+echo "stagger_seconds=${STAGGER_SECONDS}"
+
+# The GPU ids above come from nvidia-smi, which enumerates in PCI bus order,
+# whereas CUDA defaults to its own "fastest first" ordering. Pin CUDA to PCI
+# bus order so that CUDA_VISIBLE_DEVICES=<id> below selects the same physical
+# GPU, as the other bee GPU loaders do.
+export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+
+# Per-GPU logs are collected in a scratch directory that is removed on exit.
+TMP_DIR=$(mktemp -d)
+trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
+
+# Start one background worker per GPU, STAGGER_SECONDS apart. Each worker's
+# run time is extended by STAGGER_SECONDS for every GPU started after it, so
+# earlier workers keep running until the last one has had its full SECONDS.
+# WORKERS accumulates "pid:gpu:logfile" records for the collection loop.
+GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
+gpu_pos=0
+WORKERS=""
+for id in $(echo "${FINAL}" | tr ',' ' '); do
+    gpu_pos=$((gpu_pos + 1))
+    log="${TMP_DIR}/gpu-${id}.log"
+    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
+    gpu_seconds=$(( SECONDS + extra_sec ))
+    echo "starting gpu ${id} seconds=${gpu_seconds}"
+    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
+    pid=$!
+    WORKERS="${WORKERS} ${pid}:${id}:${log}"
+    # No delay is needed after the last GPU has been started.
+    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
+        sleep "${STAGGER_SECONDS}"
+    fi
+done
+
+# Wait for every worker in start order, report its result and replay its log
+# with a "[gpu N] " prefix. The script exits 1 if any worker failed.
+status=0
+for spec in ${WORKERS}; do
+    # Split the "pid:gpu:logfile" record.
+    pid=${spec%%:*}
+    rest=${spec#*:}
+    id=${rest%%:*}
+    log=${rest#*:}
+    if wait "${pid}"; then
+        echo "gpu ${id} finished: OK"
+    else
+        rc=$?
+        echo "gpu ${id} finished: FAILED rc=${rc}"
+        status=1
+    fi
+    # Best effort: a missing log must not abort the loop under `set -e`.
+    sed "s/^/[gpu ${id}] /" "${log}" || true
+done
+
+exit "${status}"
--- a/iso/overlay/usr/local/bin/bee-gpu-burn
+++ b/iso/overlay/usr/local/bin/bee-gpu-burn
@@ -2,13 +2,14 @@
 set -eu

 SECONDS=5
+STAGGER_SECONDS=0
 SIZE_MB=0
 DEVICES=""
 EXCLUDE=""
 WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"

 usage() {
-    echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
 }

@@ -25,6 +26,7 @@ contains_csv() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
@@ -61,12 +63,18 @@ done

 echo "loader=bee-gpu-burn"
 echo "selected_gpus=${FINAL}"
+echo "stagger_seconds=${STAGGER_SECONDS}"
+
+export CUDA_DEVICE_ORDER="PCI_BUS_ID"

 TMP_DIR=$(mktemp -d)
 trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM

+GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
+gpu_pos=0
 WORKERS=""
 for id in $(echo "${FINAL}" | tr ',' ' '); do
+    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    gpu_size_mb="${SIZE_MB}"
    if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
@@ -77,10 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
            gpu_size_mb=512
        fi
    fi
-    echo "starting gpu ${id} size=${gpu_size_mb}MB"
-    "${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
+    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
+    gpu_seconds=$(( SECONDS + extra_sec ))
+    echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
+    CUDA_VISIBLE_DEVICES="${id}" \
+        "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
+    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
+        sleep "${STAGGER_SECONDS}"
+    fi
 done

 status=0
--- a/iso/overlay/usr/local/bin/bee-john-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-john-gpu-stress
@@ -2,6 +2,7 @@
 set -eu

 DURATION_SEC=300
+STAGGER_SECONDS=0
 DEVICES=""
 EXCLUDE=""
 FORMAT=""
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
 export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

 usage() {
-    echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
    exit 2
 }

@@ -118,6 +119,7 @@ ensure_opencl_ready() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
+        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -152,19 +154,25 @@ done

 [ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

+export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+export CUDA_VISIBLE_DEVICES="${FINAL}"
+
 JOHN_DEVICES=""
+local_id=1
 for id in $(echo "${FINAL}" | tr ',' ' '); do
-    opencl_id=$((id + 1))
+    opencl_id="${local_id}"
    if [ -z "${JOHN_DEVICES}" ]; then
        JOHN_DEVICES="${opencl_id}"
    else
        JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
    fi
+    local_id=$((local_id + 1))
 done

 echo "loader=john"
 echo "selected_gpus=${FINAL}"
 echo "john_devices=${JOHN_DEVICES}"
+echo "stagger_seconds=${STAGGER_SECONDS}"

 cd "${JOHN_DIR}"

@@ -227,14 +235,21 @@ trap cleanup EXIT INT TERM
 echo "format=${CHOSEN_FORMAT}"
 echo "target_seconds=${DURATION_SEC}"
 echo "slice_seconds=${TEST_SLICE_SECONDS}"
-DEADLINE=$(( $(date +%s) + DURATION_SEC ))
+TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
 _first=1
+pos=0
 for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
+    pos=$((pos + 1))
    [ "${_first}" = "1" ] || sleep 3
    _first=0
-    run_john_loop "${opencl_id}" "${DEADLINE}" &
+    extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
+    deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
+    run_john_loop "${opencl_id}" "${deadline}" &
    pid=$!
    PIDS="${PIDS} ${pid}"
+    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
+        sleep "${STAGGER_SECONDS}"
+    fi
 done
 FAIL=0
 for pid in ${PIDS}; do
--- a/iso/overlay/usr/local/bin/bee-nccl-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-nccl-gpu-stress
@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
 echo "range=${MIN_BYTES}..${MAX_BYTES}"
 echo "iters=${ITERS}"

+export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+
 deadline=$(( $(date +%s) + SECONDS ))
 round=0

--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -6,10 +6,28 @@ NVIDIA_KO_DIR="/usr/local/lib/nvidia"

 log() { echo "[bee-nvidia] $*"; }

+# read_nvidia_modules_flavor prints the NVIDIA kernel-module flavor to use:
+# the whitespace-stripped content of /etc/bee-nvidia-modules-flavor when it is
+# exactly "open" or "proprietary", and "proprietary" in every other case
+# (file missing, empty or holding an unknown value).
+read_nvidia_modules_flavor() {
+    if [ ! -f /etc/bee-nvidia-modules-flavor ]; then
+        echo "proprietary"
+        return 0
+    fi
+    flavor="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null)"
+    if [ "$flavor" = "open" ] || [ "$flavor" = "proprietary" ]; then
+        echo "$flavor"
+        return 0
+    fi
+    echo "proprietary"
+}
+
 log "kernel: $(uname -r)"

-# Skip if no NVIDIA GPU present (PCI vendor 10de)
-if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
+# Skip if no NVIDIA display/compute GPU is present.
+# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
+have_nvidia_gpu() {
+    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
+}
+
+if ! have_nvidia_gpu; then
    log "no NVIDIA GPU detected — skipping module load"
    exit 0
 fi
@@ -40,6 +58,8 @@ if [ -z "$nvidia_mode" ]; then
    nvidia_mode="normal"
 fi
 log "boot mode: $nvidia_mode"
+nvidia_modules_flavor="$(read_nvidia_modules_flavor)"
+log "modules flavor: $nvidia_modules_flavor"

 load_module() {
     mod="$1"
@@ -50,11 +70,93 @@ load_module() {
         log "WARN: not found: $ko"
         return 1
     fi
-    if insmod "$ko" "$@"; then
+    # Capture insmod's status explicitly: after `if cmd; then ...; fi` with no
+    # branch taken, $? is 0, so reading it afterwards would always log "exit 0".
+    # timeout(1) keeps a hung insmod from blocking for more than 90 seconds.
+    _rc=0
+    timeout 90 insmod "$ko" "$@" || _rc=$?
+    if [ "$_rc" -eq 0 ]; then
         log "loaded: $mod $*"
         return 0
     fi
-    log "WARN: failed to load: $mod"
+    log "WARN: failed to load: $mod (exit $_rc)"
+    # Show the most recent kernel messages to help diagnose the failure.
+    dmesg | tail -n 10 | sed 's/^/  dmesg: /' || true
+    return 1
+}
+
+# nvidia_is_functional succeeds when /proc/devices lists a device named
+# "nvidiactl"; the callers use this as the sign that the nvidia module has
+# finished initialising.
+nvidia_is_functional() {
+    grep -q ' nvidiactl$' /proc/devices 2>/dev/null
+}
+
+# load_module_with_gsp_fallback loads nvidia.ko with GSP firmware enabled and,
+# if initialisation does not complete within 90 seconds, retries once with
+# NVreg_EnableGpuFirmware=0.
+# Returns 0 when the module is loaded and nvidiactl is registered, 1 otherwise.
+# The outcome is recorded in /run/bee-nvidia-mode as "gsp-on", "gsp-off" or
+# "gsp-stuck" (timed-out module could not be unloaded); other failures leave
+# the file untouched.
+load_module_with_gsp_fallback() {
+    ko="$NVIDIA_KO_DIR/nvidia.ko"
+    if [ ! -f "$ko" ]; then
+        log "ERROR: not found: $ko"
+        return 1
+    fi
+
+    # Run insmod in background — on some converted SXM→PCIe cards GSP enters an
+    # infinite crash/reload loop and insmod never returns. We check for successful
+    # initialization by polling /proc/devices for nvidiactl instead of waiting for
+    # insmod to exit.
+    log "loading nvidia (GSP enabled, timeout 90s)"
+    insmod "$ko" &
+    _insmod_pid=$!
+
+    _waited=0
+    while [ $_waited -lt 90 ]; do
+        if nvidia_is_functional; then
+            log "loaded: nvidia (GSP enabled, ${_waited}s)"
+            echo "gsp-on" > /run/bee-nvidia-mode
+            return 0
+        fi
+        # Check if insmod exited with an error before timeout
+        if ! kill -0 "$_insmod_pid" 2>/dev/null; then
+            # Collect the status via `||` so a non-zero exit can never abort the
+            # script if errexit is active.
+            _rc=0
+            wait "$_insmod_pid" || _rc=$?
+            if [ $_rc -ne 0 ]; then
+                log "nvidia load failed (exit $_rc)"
+                dmesg | tail -n 10 | sed 's/^/  dmesg: /' || true
+                return 1
+            fi
+            # insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
+            sleep 2
+            if nvidia_is_functional; then
+                log "loaded: nvidia (GSP enabled, ${_waited}s)"
+                # Record the mode here too, exactly like the polling success path.
+                echo "gsp-on" > /run/bee-nvidia-mode
+                return 0
+            fi
+            log "insmod exited 0 but nvidiactl missing — treating as failure"
+            return 1
+        fi
+        sleep 1
+        _waited=$((_waited + 1))
+    done
+
+    # GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
+    log "nvidia GSP init timed out after 90s"
+    kill "$_insmod_pid" 2>/dev/null || true
+    wait "$_insmod_pid" 2>/dev/null || true
+
+    # Attempt to unload the partially-initialized module
+    if ! rmmod nvidia 2>/dev/null; then
+        # Module is stuck in the kernel — cannot reload with different params.
+        # User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
+        log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
+        log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
+        echo "gsp-stuck" > /run/bee-nvidia-mode
+        return 1
+    fi
+
+    sleep 2
+    log "retrying with NVreg_EnableGpuFirmware=0"
+    log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"
+
+    if insmod "$ko" NVreg_EnableGpuFirmware=0; then
+        if nvidia_is_functional; then
+            log "loaded: nvidia (GSP disabled)"
+            echo "gsp-off" > /run/bee-nvidia-mode
+            return 0
+        fi
+        log "insmod gsp-off exited 0 but nvidiactl missing"
+        return 1
+    fi
+
+    log "nvidia load failed (GSP=off)"
     dmesg | tail -n 10 | sed 's/^/  dmesg: /' || true
     return 1
 }
@@ -68,37 +170,54 @@ load_host_module() {
    return 1
 }

-case "$nvidia_mode" in
-    normal|full)
-        if ! load_module nvidia; then
-            exit 1
-        fi
-        # nvidia-modeset on some server kernels needs ACPI video helper symbols
-        # exported by the generic "video" module. Best-effort only; compute paths
-        # remain functional even if display-related modules stay absent.
-        load_host_module video || true
-        load_module nvidia-modeset || true
-        load_module nvidia-uvm || true
-        ;;
-    gsp-off|safe)
-        # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
-        # be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
-        # conservative path for platforms where full boot-time GSP init is unstable.
-        if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
-            exit 1
-        fi
-        log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
-        ;;
-    nomsi|*)
-        # nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
-        # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
-        # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
-        if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
-            exit 1
-        fi
-        log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
-        ;;
-esac
+if [ "$nvidia_modules_flavor" = "open" ]; then
+    case "$nvidia_mode" in
+        gsp-off|safe|nomsi)
+            log "ignoring boot mode ${nvidia_mode} for open NVIDIA modules"
+            ;;
+    esac
+    if ! load_module nvidia; then
+        exit 1
+    fi
+    # nvidia-modeset on some server kernels needs ACPI video helper symbols
+    # exported by the generic "video" module. Best-effort only; compute paths
+    # remain functional even if display-related modules stay absent.
+    load_host_module video || true
+    load_module nvidia-modeset || true
+    load_module nvidia-uvm || true
+else
+    case "$nvidia_mode" in
+        normal|full)
+            if ! load_module_with_gsp_fallback; then
+                exit 1
+            fi
+            # nvidia-modeset on some server kernels needs ACPI video helper symbols
+            # exported by the generic "video" module. Best-effort only; compute paths
+            # remain functional even if display-related modules stay absent.
+            load_host_module video || true
+            load_module nvidia-modeset || true
+            load_module nvidia-uvm || true
+            ;;
+        gsp-off|safe)
+            # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
+            # be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
+            # conservative path for platforms where full boot-time GSP init is unstable.
+            if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
+                exit 1
+            fi
+            log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
+            ;;
+        nomsi|*)
+            # nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
+            # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
+            # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
+            if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
+                exit 1
+            fi
+            log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
+            ;;
+    esac
+fi

 # Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
 nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
@@ -127,6 +246,18 @@ fi
 ldconfig 2>/dev/null || true
 log "ldconfig refreshed"

+# Keep persistence mode enabled across the session so dcgmi / stress tools do
+# not fail with deployment warnings on otherwise healthy GPUs.
+if command -v nvidia-smi >/dev/null 2>&1; then
+    if nvidia-smi -pm 1 >/dev/null 2>&1; then
+        log "enabled NVIDIA persistence mode"
+    else
+        log "WARN: failed to enable NVIDIA persistence mode"
+    fi
+else
+    log "WARN: nvidia-smi not found — cannot enable persistence mode"
+fi
+
 # Start DCGM host engine so dcgmi can discover GPUs.
 # nv-hostengine must run after the NVIDIA modules and device nodes are ready.
 # If it started too early (for example via systemd before bee-nvidia-load), it can
--- a/iso/overlay/usr/local/bin/bee-openbox-session
+++ b/iso/overlay/usr/local/bin/bee-openbox-session
@@ -7,8 +7,25 @@ xset s off
 xset -dpms
 xset s noblank

+# Set desktop background.
+if [ -f /usr/share/bee/wallpaper.png ]; then
+    feh --bg-fill /usr/share/bee/wallpaper.png
+else
+    xsetroot -solid '#f6c90e'
+fi
+
 tint2 &

+# Wait up to 60s for bee-web before opening Chromium.
+# Without this Chromium gets connection-refused and shows a blank page.
+_i=0
+while [ $_i -lt 60 ]; do
+    curl -sf http://localhost/healthz >/dev/null 2>&1 && break
+    sleep 1
+    _i=$((_i+1))
+done
+unset _i
+
 chromium \
    --disable-infobars \
    --disable-translate \
--- a/iso/overlay/usr/local/bin/bee-selfheal
+++ b/iso/overlay/usr/local/bin/bee-selfheal
@@ -14,7 +14,7 @@ log() {
 }

 have_nvidia_gpu() {
-    lspci -nn 2>/dev/null | grep -qi '10de:'
+    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
 }

 service_active() {