Add resilient HPL source fallbacks

Restore MOTD-style ASCII wallpaper
Update bible submodule
2026-04-08 09:25:31 +03:00 · 2026-04-08 09:14:27 +03:00 · 2026-04-08 07:14:31 +03:00 · 2026-04-08 07:08:18 +03:00 · 2026-04-08 00:42:12 +03:00 · 2026-04-08 00:25:12 +03:00
64 changed files with 7156 additions and 882 deletions
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 			archive, err = application.RunNvidiaAcceptancePack("", logLine)
 		}
 	case "memory":
-		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
+		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
 	case "storage":
-		archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
+		archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
 	case "cpu":
 		dur := *duration
 		if dur <= 0 {
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -122,8 +122,10 @@ type satRunner interface {
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
-	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
-	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
+	ResetNvidiaGPU(index int) (string, error)
+	RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
+	RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
 	RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
 	DetectGPUVendor() string
@@ -137,6 +139,7 @@ type satRunner interface {
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
 	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error)
 }

 type runtimeChecker interface {
@@ -521,6 +524,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return a.sat.ListNvidiaGPUs()
 }

+func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
+	return a.sat.ListNvidiaGPUStatuses()
+}
+
+func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
+	out, err := a.sat.ResetNvidiaGPU(index)
+	return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
+}
+
 func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -591,14 +603,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
 }

 func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
-	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
+	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
 }

-func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
+	return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
 }

 func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
@@ -623,14 +635,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
 }

 func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
-	return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
+	return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
 }

-func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
+	return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
 }

 func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
@@ -726,6 +738,13 @@ func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
 	return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
 }

+func (a *App) RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error) {
+	if a == nil {
+		return "", nil, fmt.Errorf("app not configured")
+	}
+	return a.sat.RunHPL(ctx, baseDir, opts, logFunc)
+}
+
 func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
 	path, err := a.RunFanStressTest(ctx, "", opts)
 	body := formatFanStressResult(path)
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -135,6 +135,8 @@ type fakeSAT struct {
 	listAMDGPUsFn             func() ([]platform.AMDGPUInfo, error)
 	runAMDPackFn              func(string) (string, error)
 	listNvidiaGPUsFn          func() ([]platform.NvidiaGPU, error)
+	listNvidiaGPUStatusesFn   func() ([]platform.NvidiaGPUStatus, error)
+	resetNvidiaGPUFn          func(int) (string, error)
 }

 func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -201,11 +203,25 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return nil, nil
 }

-func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
+func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
+	if f.listNvidiaGPUStatusesFn != nil {
+		return f.listNvidiaGPUStatusesFn()
+	}
+	return nil, nil
+}
+
+func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
+	if f.resetNvidiaGPUFn != nil {
+		return f.resetNvidiaGPUFn(index)
+	}
+	return "", nil
+}
+
+func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
 	return f.runMemoryFn(baseDir)
 }

-func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
+func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
 	return f.runStorageFn(baseDir)
 }

@@ -266,6 +282,9 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
 	return "", nil
 }
+func (f fakeSAT) RunHPL(_ context.Context, _ string, _ platform.HPLOptions, _ func(string)) (string, *platform.HPLResult, error) {
+	return "", nil, nil
+}

 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()
@@ -805,6 +824,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	for _, want := range []string{
 		"/system/ip-link.txt",
 		"/system/ip-link-stats.txt",
+		"/system/kernel-aer-nvidia.txt",
+		"/system/lspci-nvidia-bridges-vv.txt",
+		"/system/pcie-aer-sysfs.txt",
 		"/system/ethtool-info.txt",
 		"/system/ethtool-link.txt",
 		"/system/ethtool-module.txt",
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -3,6 +3,7 @@ package app
 import (
 	"os"
 	"path/filepath"
 	"sort"
+	"strconv"
 	"strings"

@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
 	}
 	if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
 		applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
+		applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
 	}
 	if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
 		applyMemorySAT(snap.Memory, summary)
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
 	applyComponentStatusDB(snap, db)
 }

+type nvidiaPerGPUStatus struct {
+	runStatus string
+	reason    string
+}
+
+// applyNvidiaPerGPUStatus overlays the per-GPU results of the latest NVIDIA
+// SAT run onto the devices whose telemetry carries a matching
+// "nvidia_gpu_index". Devices without a usable index, without a status entry,
+// or with a run status that satKeyStatus rejects are left untouched.
+func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
+	perGPU, changedAt, found := loadLatestNvidiaPerGPUStatus(baseDir)
+	if !found {
+		return
+	}
+	for i := range devs {
+		dev := &devs[i]
+		// A nil Telemetry map or a missing key yields a nil value here, which
+		// telemetryInt reports as not convertible.
+		gpuIndex, hasIndex := telemetryInt(dev.Telemetry["nvidia_gpu_index"])
+		if !hasIndex {
+			continue
+		}
+		result, known := perGPU[gpuIndex]
+		if !known {
+			continue
+		}
+		label := firstNonEmpty(strings.TrimSpace(result.reason), "nvidia GPU SAT")
+		status, description, usable := satKeyStatus(result.runStatus, label)
+		if usable {
+			mergeComponentStatusPreferDetail(&dev.HardwareComponentStatus, changedAt, status, description)
+		}
+	}
+}
+
+// loadLatestNvidiaPerGPUStatus parses the gpu-*-status.txt files of the
+// lexically last gpu-nvidia-* entry under baseDir, keyed by gpu_index, and
+// returns them with that run's run_at_utc. The bool is false if none parsed.
+func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
+	runDirs, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
+	if err != nil || len(runDirs) == 0 {
+		return nil, "", false
+	}
+	sort.Strings(runDirs)
+	latest := runDirs[len(runDirs)-1]
+	summaryRaw, err := os.ReadFile(filepath.Join(latest, "summary.txt"))
+	if err != nil {
+		return nil, "", false
+	}
+	runAtUTC := strings.TrimSpace(parseKeyValueSummary(string(summaryRaw))["run_at_utc"])
+	statusFiles, err := filepath.Glob(filepath.Join(latest, "gpu-*-status.txt"))
+	if err != nil || len(statusFiles) == 0 {
+		return nil, "", false
+	}
+	byIndex := make(map[int]nvidiaPerGPUStatus, len(statusFiles))
+	for _, statusFile := range statusFiles {
+		raw, readErr := os.ReadFile(statusFile)
+		if readErr != nil {
+			continue
+		}
+		fields := parseKeyValueSummary(string(raw))
+		if idx, convErr := strconv.Atoi(strings.TrimSpace(fields["gpu_index"])); convErr == nil {
+			byIndex[idx] = nvidiaPerGPUStatus{
+				runStatus: strings.ToUpper(strings.TrimSpace(fields["run_status"])),
+				reason:    strings.TrimSpace(fields["reason"]),
+			}
+		}
+	}
+	if len(byIndex) == 0 {
+		return nil, "", false
+	}
+	return byIndex, runAtUTC, true
+}
+
+// telemetryInt converts a loosely typed telemetry value to an int. A float64
+// counts only when it is integral, so 1.5 is not truncated onto index 1.
+func telemetryInt(v any) (int, bool) {
+	switch value := v.(type) {
+	case int:
+		return value, true
+	case int32:
+		return int(value), true
+	case int64:
+		return int(value), true
+	case float64:
+		n := int(value)
+		return n, float64(n) == value
+	case string:
+		n, err := strconv.Atoi(strings.TrimSpace(value))
+		return n, err == nil
+	default:
+		return 0, false
+	}
+}
+
 type satSummary struct {
 	runAtUTC string
 	overall  string
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
 	}
 }

+// mergeComponentStatusPreferDetail delegates to mergeComponentStatus when the
+// component has no status, is "Unknown", or satStatus is more severe. At equal
+// severity a non-blank description still replaces the status and error text,
+// and a history entry is appended when changedAt is non-blank.
+func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
+	if component == nil || satStatus == "" {
+		return
+	}
+	current := strings.TrimSpace(ptrString(component.Status))
+	incoming, existing := statusSeverity(satStatus), statusSeverity(current)
+	switch {
+	case current == "" || current == "Unknown" || incoming > existing:
+		mergeComponentStatus(component, changedAt, satStatus, description)
+	case incoming == existing && strings.TrimSpace(description) != "":
+		component.Status = appStringPtr(satStatus)
+		component.ErrorDescription = appStringPtr(description)
+		if strings.TrimSpace(changedAt) == "" {
+			return // without a timestamp there is nothing to record in the history
+		}
+		component.StatusChangedAt = appStringPtr(changedAt)
+		entry := schema.HardwareStatusHistory{Status: satStatus, ChangedAt: changedAt, Details: appStringPtr(description)}
+		component.StatusHistory = append(component.StatusHistory, entry)
+	}
+}
+
 func statusSeverity(status string) int {
 	switch strings.TrimSpace(status) {
 	case "Critical":
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
 		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
 	}
 }
+
+func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
+	baseDir := t.TempDir()
+	runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	class := "VideoController"
+	manufacturer := "NVIDIA Corporation"
+	bdf0 := "0000:4b:00.0"
+	bdf1 := "0000:4f:00.0"
+	snap := schema.HardwareSnapshot{
+		PCIeDevices: []schema.HardwarePCIeDevice{
+			{
+				DeviceClass:  &class,
+				Manufacturer: &manufacturer,
+				BDF:          &bdf0,
+				Telemetry:    map[string]any{"nvidia_gpu_index": 0},
+			},
+			{
+				DeviceClass:  &class,
+				Manufacturer: &manufacturer,
+				BDF:          &bdf1,
+				Telemetry:    map[string]any{"nvidia_gpu_index": 1},
+			},
+		},
+	}
+
+	applyLatestSATStatuses(&snap, baseDir, nil)
+
+	if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
+		t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
+	}
+	if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
+		got := "<nil>"
+		if snap.PCIeDevices[1].ErrorDescription != nil {
+			got = *snap.PCIeDevices[1].ErrorDescription
+		}
+		t.Fatalf("gpu1 error=%q want per-gpu reason", got)
+	}
+}
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -40,7 +40,36 @@ var supportBundleCommands = []struct {
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
 	{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
+	{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
+if command -v dmesg >/dev/null 2>&1; then
+  dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
+else
+  echo "dmesg not found"
+fi
+`}},
 	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
+	{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
+if ! command -v lspci >/dev/null 2>&1; then
+  echo "lspci not found"
+  exit 0
+fi
+found=0
+for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
+  found=1
+  echo "=== GPU $gpu ==="
+  lspci -s "$gpu" -vv 2>&1 || true
+  bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
+  if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
+    echo
+    echo "=== UPSTREAM $bridge for $gpu ==="
+    lspci -s "$bridge" -vv 2>&1 || true
+  fi
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no NVIDIA PCI devices found"
+fi
+`}},
 	{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
 for d in /sys/bus/pci/devices/*/; do
  vendor=$(cat "$d/vendor" 2>/dev/null)
@@ -51,6 +80,30 @@ for d in /sys/bus/pci/devices/*/; do
    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
  done
 done
+`}},
+	{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
+found=0
+for dev in /sys/bus/pci/devices/*; do
+  [ -e "$dev" ] || continue
+  bdf=$(basename "$dev")
+  block=""
+  for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
+    if [ -r "$dev/$f" ]; then
+      if [ -z "$block" ]; then
+        block=1
+        found=1
+        echo "=== $bdf ==="
+      fi
+      printf "  %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
+    fi
+  done
+  if [ -n "$block" ]; then
+    echo
+  fi
+done
+if [ "$found" -eq 0 ]; then
+  echo "no PCIe AER sysfs counters found"
+fi
 `}},
 	{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
--- a/audit/internal/collector/nvidia.go
+++ b/audit/internal/collector/nvidia.go
@@ -13,6 +13,7 @@ import (
 const nvidiaVendorID = 0x10de

 type nvidiaGPUInfo struct {
+	Index              int
 	BDF                string
 	Serial             string
 	VBIOS              string
@@ -132,6 +133,7 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		}

 		info := nvidiaGPUInfo{
+			Index:              parseRequiredInt(rec[0]),
 			BDF:                bdf,
 			Serial:             strings.TrimSpace(rec[2]),
 			VBIOS:              strings.TrimSpace(rec[3]),
@@ -187,6 +189,14 @@ func parseMaybeInt(v string) *int {
 	return &n
 }

+// parseRequiredInt parses v as a trimmed decimal int, yielding 0 on failure.
+func parseRequiredInt(v string) int {
+	if n, err := strconv.Atoi(strings.TrimSpace(v)); err == nil {
+		return n
+	}
+	return 0
+}
+
 func pcieLinkGenLabel(gen int) string {
 	return fmt.Sprintf("Gen%d", gen)
 }
@@ -240,6 +250,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
 }

 func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
+	if dev.Telemetry == nil {
+		dev.Telemetry = map[string]any{}
+	}
+	dev.Telemetry["nvidia_gpu_index"] = info.Index
 	if info.TemperatureC != nil {
 		dev.TemperatureC = info.TemperatureC
 	}
--- a/audit/internal/collector/nvidia_test.go
+++ b/audit/internal/collector/nvidia_test.go
@@ -86,6 +86,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
 	if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
 		t.Fatalf("firmware: got %v", out[0].Firmware)
 	}
+	if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
+		t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
+	}
 	if out[0].Status == nil || *out[0].Status != statusWarning {
 		t.Fatalf("status: got %v", out[0].Status)
 	}
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -27,14 +27,17 @@ type benchmarkProfileSpec struct {
 }

 type benchmarkGPUInfo struct {
-	Index               int
-	UUID                string
-	Name                string
-	BusID               string
-	VBIOS               string
-	PowerLimitW         float64
-	MaxGraphicsClockMHz float64
-	MaxMemoryClockMHz   float64
+	Index                int
+	UUID                 string
+	Name                 string
+	BusID                string
+	VBIOS                string
+	PowerLimitW          float64
+	DefaultPowerLimitW   float64
+	MaxGraphicsClockMHz  float64
+	MaxMemoryClockMHz    float64
+	BaseGraphicsClockMHz float64
+	MultiprocessorCount  int
 }

 type benchmarkBurnProfile struct {
@@ -102,7 +105,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		BenchmarkVersion:   benchmarkVersion,
 		GeneratedAt:        time.Now().UTC(),
 		Hostname:           hostname,
+		ServerModel:        readServerModel(),
 		BenchmarkProfile:   spec.Name,
+		ParallelGPUs:       opts.ParallelGPUs,
 		SelectedGPUIndices: append([]int(nil), selected...),
 		Normalization: BenchmarkNormalization{
 			Status: "full",
@@ -111,6 +116,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv

 	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))

+	// Server power characterization state — populated during per-GPU phases.
+	var serverIdleW, serverLoadedWSum float64
+	var serverIdleOK, serverLoadedOK bool
+	var serverLoadedSamples int
+
 	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
 	if infoErr != nil {
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
@@ -135,6 +145,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}()

+	if opts.ParallelGPUs {
+		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
+	} else {
+
 	for _, idx := range selected {
 		gpuResult := BenchmarkGPUResult{
 			Index:  idx,
@@ -146,7 +160,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			gpuResult.BusID = info.BusID
 			gpuResult.VBIOS = info.VBIOS
 			gpuResult.PowerLimitW = info.PowerLimitW
+			gpuResult.MultiprocessorCount = info.MultiprocessorCount
+			gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
 			gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
+			gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
@@ -161,6 +178,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
 		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)

+		// Sample server idle power once (first GPU only — server state is global).
+		if !serverIdleOK {
+			if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
+				serverIdleW = w
+				serverIdleOK = true
+				logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+			}
+		}
+
 		warmupCmd := []string{
 			"bee-gpu-burn",
 			"--seconds", strconv.Itoa(spec.WarmupSec),
@@ -184,7 +210,50 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			"--devices", strconv.Itoa(idx),
 		}
 		logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
+
+		// Sample server power via IPMI in parallel with the steady phase.
+		// We collect readings every 5s and average them.
+		ipmiStopCh := make(chan struct{})
+		ipmiResultCh := make(chan float64, 1)
+		go func() {
+			defer close(ipmiResultCh)
+			var samples []float64
+			ticker := time.NewTicker(5 * time.Second)
+			defer ticker.Stop()
+			// First sample after a short warmup delay.
+			select {
+			case <-ipmiStopCh:
+				return
+			case <-time.After(15 * time.Second):
+			}
+			for {
+				if w, err := queryIPMIServerPowerW(); err == nil {
+					samples = append(samples, w)
+				}
+				select {
+				case <-ipmiStopCh:
+					if len(samples) > 0 {
+						var sum float64
+						for _, w := range samples {
+							sum += w
+						}
+						ipmiResultCh <- sum / float64(len(samples))
+					}
+					return
+				case <-ticker.C:
+				}
+			}
+		}()
+
 		steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
+		close(ipmiStopCh)
+		if loadedW, ok := <-ipmiResultCh; ok {
+			serverLoadedWSum += loadedW
+			serverLoadedSamples++
+			serverLoadedOK = true
+			logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
+		}
+
 		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
 		afterThrottle, _ := queryThrottleCounters(idx)
 		if steadyErr != nil {
@@ -222,6 +291,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
 	}

+	} // end sequential path
+
 	if len(selected) > 1 && opts.RunNCCL {
 		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
 		if result.Interconnect != nil && result.Interconnect.Supported {
@@ -232,6 +303,17 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}

+	// Compute server power characterization from accumulated IPMI samples.
+	var gpuReportedSumW float64
+	for _, gpu := range result.GPUs {
+		gpuReportedSumW += gpu.Steady.AvgPowerW
+	}
+	var serverLoadedW float64
+	if serverLoadedSamples > 0 {
+		serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
+	}
+	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
+
 	result.Findings = buildBenchmarkFindings(result)
 	result.OverallStatus = benchmarkOverallStatus(result)

@@ -243,9 +325,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		return "", fmt.Errorf("write result.json: %w", err)
 	}

-	report := renderBenchmarkReport(result)
-	if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
-		return "", fmt.Errorf("write report.txt: %w", err)
+	report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
+	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
+		return "", fmt.Errorf("write report.md: %w", err)
 	}

 	summary := renderBenchmarkSummary(result)
@@ -288,50 +370,87 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 	}
 }

-func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
-	args := []string{
-		"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
-		"--format=csv,noheader,nounits",
-	}
-	if len(gpuIndices) > 0 {
-		args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
-	}
-	out, err := satExecCommand("nvidia-smi", args...).Output()
-	if err != nil {
-		return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
-	}
-
-	r := csv.NewReader(strings.NewReader(string(out)))
-	r.TrimLeadingSpace = true
-	r.FieldsPerRecord = -1
-	rows, err := r.ReadAll()
-	if err != nil {
-		return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
-	}
-
-	infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
-	for _, row := range rows {
-		if len(row) < 8 {
-			continue
-		}
-		idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
-		if err != nil {
-			continue
-		}
-		infoByIndex[idx] = benchmarkGPUInfo{
-			Index:               idx,
-			UUID:                strings.TrimSpace(row[1]),
-			Name:                strings.TrimSpace(row[2]),
-			BusID:               strings.TrimSpace(row[3]),
-			VBIOS:               strings.TrimSpace(row[4]),
-			PowerLimitW:         parseBenchmarkFloat(row[5]),
-			MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
-			MaxMemoryClockMHz:   parseBenchmarkFloat(row[7]),
-		}
-	}
-	return infoByIndex, nil
+// benchmarkGPUInfoQueries lists the nvidia-smi --query-gpu field sets to try.
+// Queries are tried in order; the first successful query wins. Extended fields
+// (attribute.multiprocessor_count, power.default_limit) are not supported on
+// all driver versions, so we fall back to the base set if the full query fails.
+var benchmarkGPUInfoQueries = []struct {
+	fields   string
+	extended bool // whether this query includes optional extended fields
+}{
+	{
+		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
+		extended: true,
+	},
+	{
+		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
+		extended: false,
+	},
 }

+// queryBenchmarkGPUInfo runs nvidia-smi with each field set in
+// benchmarkGPUInfoQueries until one succeeds and returns the parsed rows keyed
+// by GPU index. If every query fails, the error of the last attempt is returned.
+func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
+	var lastErr error
+	for _, q := range benchmarkGPUInfoQueries {
+		args := []string{
+			"--query-gpu=" + q.fields,
+			"--format=csv,noheader,nounits",
+		}
+		if len(gpuIndices) > 0 {
+			args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
+		}
+		out, err := satExecCommand("nvidia-smi", args...).Output()
+		if err != nil {
+			lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
+			continue
+		}
+
+		r := csv.NewReader(strings.NewReader(string(out)))
+		r.TrimLeadingSpace = true
+		r.FieldsPerRecord = -1
+		rows, err := r.ReadAll()
+		if err != nil {
+			lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
+			continue
+		}
+
+		infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
+		for _, row := range rows {
+			if len(row) < 9 { // every field set requests at least nine columns
+				continue
+			}
+			idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
+			if err != nil {
+				continue
+			}
+			info := benchmarkGPUInfo{
+				Index:                idx,
+				UUID:                 strings.TrimSpace(row[1]),
+				Name:                 strings.TrimSpace(row[2]),
+				BusID:                strings.TrimSpace(row[3]),
+				VBIOS:                strings.TrimSpace(row[4]),
+				PowerLimitW:          parseBenchmarkFloat(row[5]),
+				MaxGraphicsClockMHz:  parseBenchmarkFloat(row[6]),
+				MaxMemoryClockMHz:    parseBenchmarkFloat(row[7]),
+				BaseGraphicsClockMHz: parseBenchmarkFloat(row[8]),
+			}
+			if q.extended {
+				if len(row) >= 10 {
+					info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
+				}
+				if len(row) >= 11 {
+					info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
+				}
+			}
+			infoByIndex[idx] = info
+		}
+		return infoByIndex, nil
+	}
+	return nil, lastErr
+}
+
 func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
 	if os.Geteuid() != 0 {
 		result.Normalization.Status = "partial"
@@ -370,6 +489,10 @@ func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndi
 					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
 				}})
 			}
+		} else {
+			rec.GPUClockLockStatus = "skipped"
+			rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
+			result.Normalization.Status = "partial"
 		}

 		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
@@ -551,6 +674,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
 	}
 	category := "other"
 	switch {
+	case strings.HasPrefix(name, "fp64"):
+		category = "fp64"
 	case strings.HasPrefix(name, "fp32"):
 		category = "fp32_tf32"
 	case strings.HasPrefix(name, "fp16"):
@@ -619,14 +744,23 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 			score.ComputeScore += precision.TeraOpsPerSec
 		}
 	}
-	if gpu.PowerLimitW > 0 {
-		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100)
+	// Use default power limit for sustain score so a manually reduced limit
+	// does not inflate the score. Fall back to enforced limit if default unknown.
+	referencePowerW := gpu.DefaultPowerLimitW
+	if referencePowerW <= 0 {
+		referencePowerW = gpu.PowerLimitW
+	}
+	if referencePowerW > 0 {
+		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100)
 	}
 	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
 	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
 	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
 	score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
 	score.CompositeScore = compositeBenchmarkScore(score)
+	if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
+		score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
+	}
 	return score
 }

@@ -679,7 +813,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
 		"-g", strconv.Itoa(len(gpuIndices)),
 		"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
 	}
-	env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
+	env := []string{
+		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
+		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
+	}
 	logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
 	out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
 	_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
@@ -795,10 +932,30 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {

 func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 	var findings []string
+
+	passed := 0
+	for _, gpu := range result.GPUs {
+		if gpu.Status == "OK" {
+			passed++
+		}
+	}
+	total := len(result.GPUs)
+	if total > 0 {
+		if passed == total {
+			findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
+		} else {
+			findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
+		}
+	}
+
 	if result.Normalization.Status != "full" {
 		findings = append(findings, "Environment normalization was partial; compare results with caution.")
 	}
 	for _, gpu := range result.GPUs {
+		if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
+			findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
+			continue
+		}
 		if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
 			findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
 			continue
@@ -822,10 +979,24 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 		if gpu.Backend == "driver-ptx" {
 			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
 		}
+		if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
+			findings = append(findings, fmt.Sprintf(
+				"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
+				gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
+			))
+		}
 	}
 	if result.Interconnect != nil && result.Interconnect.Supported {
 		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
 	}
+	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
+		if sp.ReportingRatio < 0.75 {
+			findings = append(findings, fmt.Sprintf(
+				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
+				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
+			))
+		}
+	}
 	return dedupeStrings(findings)
 }

@@ -1004,3 +1175,309 @@ func maxInt(a, b int) int {
 	}
 	return b
 }
+
+// queryIPMIServerPowerW shells out to `ipmitool dcmi power reading` and returns the
+// parsed value; a command failure or a non-positive parse result gives (0, error).
+func queryIPMIServerPowerW() (float64, error) {
+	raw, runErr := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
+	if runErr != nil {
+		return 0, fmt.Errorf("ipmitool dcmi power reading: %w", runErr)
+	}
+	if watts := parseDCMIPowerReading(string(raw)); watts > 0 {
+		return watts, nil
+	}
+	return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
+}
+
+// sampleIPMIPowerSeries polls IPMI server power about every 2 seconds until
+// durationSec has elapsed or ctx is cancelled, and returns the mean of the
+// successful readings. Returns 0, false when no reading succeeded.
+func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
+	if durationSec <= 0 {
+		return 0, false
+	}
+	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
+	var sum float64
+	var count int
+sampling:
+	for {
+		if w, err := queryIPMIServerPowerW(); err == nil {
+			sum += w
+			count++
+		}
+		if time.Now().After(deadline) {
+			break
+		}
+		select {
+		case <-ctx.Done():
+			// A bare break only exits the select; the label stops the polling loop.
+			break sampling
+		case <-time.After(2 * time.Second):
+		}
+	}
+	if count == 0 {
+		return 0, false
+	}
+	return sum / float64(count), true
+}
+
+// characterizeServerPower builds a BenchmarkServerPower from the idle and loaded IPMI readings and the
+// GPU-reported power sum; ReportingRatio stays zero unless both the delta and that sum are positive.
+func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
+	if !ipmiAvailable {
+		unavailable := &BenchmarkServerPower{}
+		unavailable.Notes = append(unavailable.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
+		return unavailable
+	}
+	delta := loadedW - idleW
+	power := &BenchmarkServerPower{
+		Available: true, IdleW: idleW, LoadedW: loadedW, DeltaW: delta, GPUReportedSumW: gpuReportedSumW,
+	}
+	if delta > 0 && gpuReportedSumW > 0 {
+		power.ReportingRatio = delta / gpuReportedSumW
+	}
+	return power
+}
+
+// readServerModel reads /sys/class/dmi/id/product_name and returns its contents
+// with surrounding whitespace trimmed, or "" when the file cannot be read.
+func readServerModel() string {
+	const productNamePath = "/sys/class/dmi/id/product_name"
+	if raw, err := os.ReadFile(productNamePath); err == nil {
+		return strings.TrimSpace(string(raw))
+	}
+	return ""
+}
+
+// filterRowsByGPU keeps, in their original order, the rows whose GPUIndex equals gpuIndex.
+func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
+	var matched []GPUMetricRow
+	for i := range rows {
+		if rows[i].GPUIndex != gpuIndex {
+			continue
+		}
+		matched = append(matched, rows[i])
+	}
+	return matched
+}
+
+// parseBenchmarkBurnLogByGPU groups the "[gpu N] ..." lines of a multi-GPU bee-gpu-burn
+// log by N and parses each group separately. Lines without that prefix, without a
+// closing "] ", or with a non-numeric index are ignored.
+func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
+	const marker = "[gpu "
+	perGPU := make(map[int][]string)
+	normalized := strings.ReplaceAll(raw, "\r\n", "\n")
+	for _, entry := range strings.Split(normalized, "\n") {
+		entry = strings.TrimSpace(entry)
+		closeAt := strings.Index(entry, "] ")
+		if !strings.HasPrefix(entry, marker) || closeAt < 0 {
+			continue
+		}
+		idx, convErr := strconv.Atoi(strings.TrimSpace(entry[len(marker):closeAt]))
+		if convErr != nil {
+			continue
+		}
+		perGPU[idx] = append(perGPU[idx], entry[closeAt+2:])
+	}
+	parsed := make(map[int]benchmarkBurnParseResult, len(perGPU))
+	for idx, body := range perGPU {
+		// Every collected line already has its "[gpu N] " prefix removed, so the
+		// joined text is handed to parseBenchmarkBurnLog as one block.
+		parsed[idx] = parseBenchmarkBurnLog(strings.Join(body, "\n"))
+	}
+	return parsed
+}
+
+// runNvidiaBenchmarkParallel runs baseline, warmup, steady and cooldown phases for all selected GPUs at once
+// (one bee-gpu-burn invocation per compute phase) and appends one finalized result per GPU to result.GPUs.
+func runNvidiaBenchmarkParallel(
+	ctx context.Context,
+	verboseLog, runDir string,
+	selected []int,
+	infoByIndex map[int]benchmarkGPUInfo,
+	opts NvidiaBenchmarkOptions,
+	spec benchmarkProfileSpec,
+	logFunc func(string),
+	result *NvidiaBenchmarkResult,
+	serverIdleW *float64, serverLoadedWSum *float64,
+	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, // the server* pointers are written with the IPMI readings taken here
+) {
+	allDevices := joinIndexList(selected) // device list passed to --devices and used in log lines
+
+	// Build per-GPU result stubs; Status starts as "FAILED" and is overwritten in the final scoring loop.
+	gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
+	for _, idx := range selected {
+		r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
+		if info, ok := infoByIndex[idx]; ok {
+			r.UUID = info.UUID
+			r.Name = info.Name
+			r.BusID = info.BusID
+			r.VBIOS = info.VBIOS
+			r.PowerLimitW = info.PowerLimitW
+			r.MultiprocessorCount = info.MultiprocessorCount
+			r.DefaultPowerLimitW = info.DefaultPowerLimitW
+			r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
+			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
+			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
+		}
+		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
+			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
+			r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
+		}
+		gpuResults[idx] = r
+	}
+
+	// Baseline: sample all GPUs together, then split the rows per GPU for summaries and metric files.
+	baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
+	if err != nil && err != context.Canceled { // cancellation is not recorded as a sampling failure
+		for _, idx := range selected {
+			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
+		}
+	}
+	for _, idx := range selected {
+		perGPU := filterRowsByGPU(baselineRows, idx)
+		gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
+		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
+	}
+
+	// Sample server idle power via IPMI unless *serverIdleOK is already set.
+	if !*serverIdleOK {
+		if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
+			*serverIdleW = w
+			*serverIdleOK = true
+			logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+		}
+	}
+
+	// Warmup: all GPUs simultaneously; a warmup error is noted per GPU but the run continues.
+	warmupCmd := []string{
+		"bee-gpu-burn",
+		"--seconds", strconv.Itoa(spec.WarmupSec),
+		"--size-mb", strconv.Itoa(opts.SizeMB),
+		"--devices", allDevices,
+	}
+	logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
+	warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc)
+	_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644)
+	for _, idx := range selected {
+		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
+	}
+	if warmupErr != nil {
+		for _, idx := range selected {
+			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
+		}
+	}
+
+	// Snapshot throttle counters before steady; query errors are discarded.
+	beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
+	for _, idx := range selected {
+		beforeThrottle[idx], _ = queryThrottleCounters(idx)
+	}
+
+	// Steady: all GPUs simultaneously, with the same size and device list as the warmup command.
+	steadyCmd := []string{
+		"bee-gpu-burn",
+		"--seconds", strconv.Itoa(spec.SteadySec),
+		"--size-mb", strconv.Itoa(opts.SizeMB),
+		"--devices", allDevices,
+	}
+	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
+
+	// Sample server power via IPMI in a goroutine while the steady phase runs; closing ipmiStopCh ends it.
+	ipmiStopCh := make(chan struct{})
+	ipmiResultCh := make(chan float64, 1) // closed without a value when no sample was collected
+	go func() {
+		defer close(ipmiResultCh)
+		var samples []float64
+		ticker := time.NewTicker(5 * time.Second) // NOTE(review): started before the 15 s delay below, so a tick may already be pending when the loop begins — confirm
+		defer ticker.Stop()
+		select {
+		case <-ipmiStopCh:
+			return
+		case <-time.After(15 * time.Second): // skip the first 15 s of the steady phase before sampling
+		}
+		for {
+			if w, err := queryIPMIServerPowerW(); err == nil {
+				samples = append(samples, w)
+			}
+			select {
+			case <-ipmiStopCh:
+				if len(samples) > 0 {
+					var sum float64
+					for _, w := range samples {
+						sum += w
+					}
+					ipmiResultCh <- sum / float64(len(samples))
+				}
+				return
+			case <-ticker.C:
+			}
+		}
+	}()
+
+	steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc)
+	close(ipmiStopCh)
+	if loadedW, ok := <-ipmiResultCh; ok { // ok is false when the sampler closed the channel without a mean
+		*serverLoadedWSum += loadedW
+		(*serverLoadedSamples)++
+		*serverLoadedOK = true
+		logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
+	}
+	_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
+
+	afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
+	for _, idx := range selected {
+		afterThrottle[idx], _ = queryThrottleCounters(idx)
+	}
+
+	parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
+
+	for _, idx := range selected {
+		perGPU := filterRowsByGPU(steadyRows, idx)
+		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
+		gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
+		gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
+
+		if pr, ok := parseResults[idx]; ok {
+			gpuResults[idx].ComputeCapability = pr.ComputeCapability
+			gpuResults[idx].Backend = pr.Backend
+			gpuResults[idx].PrecisionResults = pr.Profiles
+			if pr.Fallback {
+				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
+			}
+		}
+		if steadyErr != nil {
+			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
+		}
+	}
+
+	// Cooldown: sample all GPUs together after the steady phase and split the rows per GPU.
+	cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
+	if err != nil && err != context.Canceled {
+		for _, idx := range selected {
+			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
+		}
+	}
+	for _, idx := range selected {
+		perGPU := filterRowsByGPU(cooldownRows, idx)
+		gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
+		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
+	}
+
+	// Score and finalize each GPU; status precedence is steady error, then PTX fallback, then OK.
+	for _, idx := range selected {
+		r := gpuResults[idx]
+		r.Scores = scoreBenchmarkGPUResult(*r)
+		r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
+		pr := parseResults[idx] // zero value when the steady log had no lines for this GPU
+		switch {
+		case steadyErr != nil:
+			r.Status = classifySATErrorStatus(steadyOut, steadyErr)
+		case pr.Fallback:
+			r.Status = "PARTIAL"
+		default:
+			r.Status = "OK"
+		}
+		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
+	}
+}
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -2,24 +2,73 @@ package platform

 import (
 	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
 	"strings"
 	"time"
 )

 func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
-	var b strings.Builder
-	fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
-	fmt.Fprintf(&b, "===========================\n\n")
-	fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
-	fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
-	fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
-	fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
-	fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
-	fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
+	return renderBenchmarkReportWithCharts(result, nil)
+}

+type benchmarkReportChart struct {
+	Title   string // heading for this chart's section in the Markdown report
+	Content string // chart text; ANSI escapes are stripped before it is embedded in a code fence
+}
+
+var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`) // ESC, "[", any run of digits and ";", then "m"
+
+func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
+	var b strings.Builder
+
+	// ── Header ────────────────────────────────────────────────────────────────
+	b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
+
+	// System identity block
+	if result.ServerModel != "" {
+		fmt.Fprintf(&b, "**Server:** %s  \n", result.ServerModel)
+	}
+	if result.Hostname != "" {
+		fmt.Fprintf(&b, "**Host:** %s  \n", result.Hostname)
+	}
+	// GPU models summary
+	if len(result.GPUs) > 0 {
+		modelCount := make(map[string]int)
+		var modelOrder []string
+		for _, g := range result.GPUs {
+			m := strings.TrimSpace(g.Name)
+			if m == "" {
+				m = "Unknown GPU"
+			}
+			if modelCount[m] == 0 {
+				modelOrder = append(modelOrder, m)
+			}
+			modelCount[m]++
+		}
+		var parts []string
+		for _, m := range modelOrder {
+			if modelCount[m] == 1 {
+				parts = append(parts, m)
+			} else {
+				parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
+			}
+		}
+		fmt.Fprintf(&b, "**GPU(s):** %s  \n", strings.Join(parts, ", "))
+	}
+	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "**App version:** %s  \n", result.BenchmarkVersion)
+	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
+	if result.ParallelGPUs {
+		fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously)  \n")
+	}
+	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
+	b.WriteString("\n")
+
+	// ── Executive Summary ─────────────────────────────────────────────────────
 	if len(result.Findings) > 0 {
-		fmt.Fprintf(&b, "Executive Summary\n")
-		fmt.Fprintf(&b, "-----------------\n")
+		b.WriteString("## Executive Summary\n\n")
 		for _, finding := range result.Findings {
 			fmt.Fprintf(&b, "- %s\n", finding)
 		}
@@ -27,96 +76,250 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
 	}

 	if len(result.Warnings) > 0 {
-		fmt.Fprintf(&b, "Warnings\n")
-		fmt.Fprintf(&b, "--------\n")
+		b.WriteString("## Warnings\n\n")
 		for _, warning := range result.Warnings {
 			fmt.Fprintf(&b, "- %s\n", warning)
 		}
 		b.WriteString("\n")
 	}

-	fmt.Fprintf(&b, "Per GPU Scorecard\n")
-	fmt.Fprintf(&b, "-----------------\n")
+	// ── Scorecard table ───────────────────────────────────────────────────────
+	b.WriteString("## Scorecard\n\n")
+	b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
+	b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
 	for _, gpu := range result.GPUs {
-		fmt.Fprintf(&b, "GPU %d  %s\n", gpu.Index, gpu.Name)
-		fmt.Fprintf(&b, "  Status: %s\n", gpu.Status)
-		fmt.Fprintf(&b, "  Composite score: %.2f\n", gpu.Scores.CompositeScore)
-		fmt.Fprintf(&b, "  Compute score: %.2f\n", gpu.Scores.ComputeScore)
-		fmt.Fprintf(&b, "  Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
-		fmt.Fprintf(&b, "  Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
-		fmt.Fprintf(&b, "  Stability: %.1f\n", gpu.Scores.StabilityScore)
+		name := strings.TrimSpace(gpu.Name)
+		if name == "" {
+			name = "Unknown"
+		}
+		interconnect := "-"
 		if gpu.Scores.InterconnectScore > 0 {
-			fmt.Fprintf(&b, "  Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
+			interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
 		}
-		if len(gpu.DegradationReasons) > 0 {
-			fmt.Fprintf(&b, "  Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
+		topsPerSM := "-"
+		if gpu.Scores.TOPSPerSMPerGHz > 0 {
+			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
 		}
-		fmt.Fprintf(&b, "  Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
-		fmt.Fprintf(&b, "  P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
-		if len(gpu.PrecisionResults) > 0 {
-			fmt.Fprintf(&b, "  Precision results:\n")
-			for _, precision := range gpu.PrecisionResults {
-				if precision.Supported {
-					fmt.Fprintf(&b, "    - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
-				} else {
-					fmt.Fprintf(&b, "    - %s: unsupported (%s)\n", precision.Name, precision.Notes)
-				}
-			}
-		}
-		fmt.Fprintf(&b, "  Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
-			gpu.Throttle.SWPowerCapUS,
-			gpu.Throttle.SWThermalSlowdownUS,
-			gpu.Throttle.SyncBoostUS,
-			gpu.Throttle.HWThermalSlowdownUS,
-			gpu.Throttle.HWPowerBrakeSlowdownUS,
+		fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
+			gpu.Index, name,
+			gpu.Status,
+			gpu.Scores.CompositeScore,
+			gpu.Scores.ComputeScore,
+			topsPerSM,
+			gpu.Scores.PowerSustainScore,
+			gpu.Scores.ThermalSustainScore,
+			gpu.Scores.StabilityScore,
+			interconnect,
 		)
-		if len(gpu.Notes) > 0 {
-			fmt.Fprintf(&b, "  Notes:\n")
-			for _, note := range gpu.Notes {
-				fmt.Fprintf(&b, "    - %s\n", note)
-			}
+	}
+	b.WriteString("\n")
+
+	// ── Per GPU detail ────────────────────────────────────────────────────────
+	b.WriteString("## Per-GPU Details\n\n")
+	for _, gpu := range result.GPUs {
+		name := strings.TrimSpace(gpu.Name)
+		if name == "" {
+			name = "Unknown GPU"
+		}
+		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
+
+		// Identity
+		if gpu.BusID != "" {
+			fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
+		}
+		if gpu.VBIOS != "" {
+			fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
+		}
+		if gpu.ComputeCapability != "" {
+			fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
+		}
+		if gpu.MultiprocessorCount > 0 {
+			fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
+		}
+		if gpu.PowerLimitW > 0 {
+			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
+		}
+		if gpu.LockedGraphicsClockMHz > 0 {
+			fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
 		}
 		b.WriteString("\n")
+
+		// Steady-state telemetry
+		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
+		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
+		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
+		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
+		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
+		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
+		b.WriteString("\n")
+
+		// Throttle
+		throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
+		if throttle != "none" {
+			fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
+		}
+
+		// Precision results
+		if len(gpu.PrecisionResults) > 0 {
+			b.WriteString("**Precision results:**\n\n")
+			b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
+			for _, p := range gpu.PrecisionResults {
+				if p.Supported {
+					fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
+				} else {
+					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
+				}
+			}
+			b.WriteString("\n")
+		}
+
+		// Degradation / Notes
+		if len(gpu.DegradationReasons) > 0 {
+			fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
+		}
+		if len(gpu.Notes) > 0 {
+			b.WriteString("**Notes:**\n\n")
+			for _, note := range gpu.Notes {
+				fmt.Fprintf(&b, "- %s\n", note)
+			}
+			b.WriteString("\n")
+		}
 	}

+	// ── Interconnect ──────────────────────────────────────────────────────────
 	if result.Interconnect != nil {
-		fmt.Fprintf(&b, "Interconnect\n")
-		fmt.Fprintf(&b, "------------\n")
-		fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
+		b.WriteString("## Interconnect (NCCL)\n\n")
+		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
 		if result.Interconnect.Supported {
-			fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
-			fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
+			fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
+			fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString("\n")
 		}
 		for _, note := range result.Interconnect.Notes {
 			fmt.Fprintf(&b, "- %s\n", note)
 		}
-		b.WriteString("\n")
+		if len(result.Interconnect.Notes) > 0 {
+			b.WriteString("\n")
+		}
 	}

-	fmt.Fprintf(&b, "Methodology\n")
-	fmt.Fprintf(&b, "-----------\n")
-	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
-	fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
-	fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
-	fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
+	// ── Server Power (IPMI) ───────────────────────────────────────────────────
+	if sp := result.ServerPower; sp != nil {
+		b.WriteString("## Server Power (IPMI)\n\n")
+		if !sp.Available {
+			b.WriteString("IPMI power measurement unavailable.\n\n")
+		} else {
+			b.WriteString("| | Value |\n|---|---|\n")
+			fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
+			fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
+			fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
+			fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
+			if sp.ReportingRatio > 0 {
+				fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
+			}
+			b.WriteString("\n")
+		}
+		for _, note := range sp.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		if len(sp.Notes) > 0 {
+			b.WriteString("\n")
+		}
+	}

-	fmt.Fprintf(&b, "Raw Files\n")
-	fmt.Fprintf(&b, "---------\n")
-	fmt.Fprintf(&b, "- result.json\n")
-	fmt.Fprintf(&b, "- report.txt\n")
-	fmt.Fprintf(&b, "- summary.txt\n")
-	fmt.Fprintf(&b, "- verbose.log\n")
-	fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
-	fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
-	fmt.Fprintf(&b, "- gpu-*-steady.log\n")
-	fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
-	fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
+	// ── Terminal charts (steady-state only) ───────────────────────────────────
+	if len(charts) > 0 {
+		b.WriteString("## Steady-State Charts\n\n")
+		for _, chart := range charts {
+			content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
+			if content == "" {
+				continue
+			}
+			fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
+		}
+	}
+
+	// ── Methodology ───────────────────────────────────────────────────────────
+	b.WriteString("## Methodology\n\n")
+	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
+	b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
+	b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
+	b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
+
+	// ── Raw files ─────────────────────────────────────────────────────────────
+	b.WriteString("## Raw Files\n\n")
+	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
+	b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
+	b.WriteString("- `gpu-*-warmup.log`\n")
+	b.WriteString("- `gpu-*-steady.log`\n")
+	b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
+	b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
 	if result.Interconnect != nil {
-		fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
+		b.WriteString("- `nccl-all-reduce.log`\n")
 	}
 	return b.String()
 }

+// loadBenchmarkReportCharts reads gpu-<idx>-steady-metrics-term.txt from runDir for each
+// requested GPU index and returns one chart per file that can be read and is non-empty.
+// Only steady-state charts are loaded; baseline and cooldown files are not consulted.
+func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
+	var loaded []benchmarkReportChart
+	for _, gpu := range gpuIndices {
+		fileName := fmt.Sprintf("gpu-%d-steady-metrics-term.txt", gpu)
+		data, readErr := os.ReadFile(filepath.Join(runDir, fileName))
+		if readErr == nil && len(data) > 0 {
+			var chart benchmarkReportChart
+			chart.Title = fmt.Sprintf("GPU %d — Steady State", gpu)
+			chart.Content = string(data)
+			loaded = append(loaded, chart)
+		}
+	}
+	return loaded
+}
+
+func stripANSIEscapeSequences(raw string) string { // drops every ansiEscapePattern match from raw
+	return ansiEscapePattern.ReplaceAllLiteralString(raw, "")
+}
+
+// formatThrottleLine renders the non-zero throttle counters as "label=duration",
+// joined by two spaces, or "none" when all are zero. Durations under one second are
+// printed in ms; with a known steady window each entry also leads with its percentage.
+func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
+	type counter struct {
+		label string
+		us    uint64
+	}
+	counters := []counter{
+		{"sw_power", t.SWPowerCapUS},
+		{"sw_thermal", t.SWThermalSlowdownUS},
+		{"sync_boost", t.SyncBoostUS},
+		{"hw_thermal", t.HWThermalSlowdownUS},
+		{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
+	}
+	var parts []string
+	for _, c := range counters {
+		if c.us == 0 {
+			continue
+		}
+		sec := float64(c.us) / 1e6
+		dur := fmt.Sprintf("%.1fs", sec)
+		if sec < 1 {
+			dur = fmt.Sprintf("%.0fms", sec*1000) // a whole-second format would show sub-second throttling as 0s
+		}
+		if steadyDurationSec > 0 {
+			dur = fmt.Sprintf("%.1f%% (%s)", sec/steadyDurationSec*100, dur)
+		}
+		parts = append(parts, c.label+"="+dur)
+	}
+	if len(parts) == 0 {
+		return "none"
+	}
+	return strings.Join(parts, "  ")
+}
+
 func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
 	var b strings.Builder
 	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -137,11 +137,44 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
 	for _, needle := range []string{
 		"Executive Summary",
 		"GPU 0 spent measurable time under SW power cap.",
-		"Composite score: 1176.00",
-		"fp16_tensor: 700.00 TOPS",
+		"1176.00",
+		"fp16_tensor",
+		"700.00",
 	} {
 		if !strings.Contains(report, needle) {
 			t.Fatalf("report missing %q\n%s", needle, report)
 		}
 	}
 }
+
+func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
+	t.Parallel()
+	// The chart content wraps text in a colour escape and a reset escape; neither may reach the report.
+	report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
+		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
+		OverallStatus:      "OK",
+		SelectedGPUIndices: []int{0},
+		Normalization: BenchmarkNormalization{
+			Status: "full",
+		},
+	}, []benchmarkReportChart{
+		{
+			Title:   "GPU 0 Steady State",
+			Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
+		},
+	})
+
+	for _, needle := range []string{
+		"Steady-State Charts",
+		"GPU 0 Steady State",
+		"GPU 0 chart",
+		"42┤───",
+	} {
+		if !strings.Contains(report, needle) {
+			t.Fatalf("report missing %q\n%s", needle, report)
+		}
+	}
+	if strings.Contains(report, "\x1b") { // bare ESC also catches a leftover reset sequence (\x1b[0m)
+		t.Fatalf("report should not contain ANSI escapes\n%s", report)
+	}
+}
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -14,13 +14,17 @@ type NvidiaBenchmarkOptions struct {
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	RunNCCL           bool
+	ParallelGPUs      bool // run all selected GPUs simultaneously instead of sequentially
 }

+
 type NvidiaBenchmarkResult struct {
 	BenchmarkVersion   string                       `json:"benchmark_version"`
 	GeneratedAt        time.Time                    `json:"generated_at"`
 	Hostname           string                       `json:"hostname,omitempty"`
+	ServerModel        string                       `json:"server_model,omitempty"`
 	BenchmarkProfile   string                       `json:"benchmark_profile"`
+	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
 	OverallStatus      string                       `json:"overall_status"`
 	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
 	Findings           []string                     `json:"findings,omitempty"`
@@ -28,6 +32,7 @@ type NvidiaBenchmarkResult struct {
 	Normalization      BenchmarkNormalization       `json:"normalization"`
 	GPUs               []BenchmarkGPUResult         `json:"gpus"`
 	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
 }

 type BenchmarkNormalization struct {
@@ -56,7 +61,10 @@ type BenchmarkGPUResult struct {
 	Backend                string                     `json:"backend,omitempty"`
 	Status                 string                     `json:"status"`
 	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
+	MultiprocessorCount    int                        `json:"multiprocessor_count,omitempty"`
+	DefaultPowerLimitW     float64                    `json:"default_power_limit_w,omitempty"`
 	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
+	BaseGraphicsClockMHz   float64                    `json:"base_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
 	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
 	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
@@ -117,6 +125,24 @@ type BenchmarkScorecard struct {
 	StabilityScore      float64 `json:"stability_score"`
 	InterconnectScore   float64 `json:"interconnect_score"`
 	CompositeScore      float64 `json:"composite_score"`
+	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
+	// Comparable across throttle levels and GPU generations. Low value at normal
+	// clocks indicates silicon degradation.
+	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
+}
+
+// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
+// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
+// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
+// over-reporting its power consumption.
+//
+// Available is serialized unconditionally (it has no omitempty tag); every
+// other field is omitted from JSON when zero or empty.
+//
+// NOTE(review): the code that fills this struct is not in this file. The W
+// suffix presumably means watts, DeltaW is presumably LoadedW - IdleW, and
+// GPUReportedSumW presumably sums the per-GPU reported draw — confirm against
+// the producer before relying on it.
+type BenchmarkServerPower struct {
+	Available       bool     `json:"available"`
+	IdleW           float64  `json:"idle_w,omitempty"`
+	LoadedW         float64  `json:"loaded_w,omitempty"`
+	DeltaW          float64  `json:"delta_w,omitempty"`
+	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
+	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
+	Notes           []string `json:"notes,omitempty"`
 }

 type BenchmarkInterconnectResult struct {
--- a/audit/internal/platform/hpl.go
+++ b/audit/internal/platform/hpl.go
@@ -0,0 +1,142 @@
+package platform
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// HPLOptions configures the HPL (LINPACK) benchmark run.
+//
+// Zero or out-of-range values are replaced with the defaults noted below by
+// applyHPLDefaults before the run starts.
+type HPLOptions struct {
+	MemFraction float64 // fraction of RAM to use, valid in (0, 1] (default 0.80)
+	NB          int     // block size, must be > 0 (default 256)
+}
+
+// HPLResult holds the parsed result of an HPL run.
+//
+// parseHPLOutput leaves numeric fields at zero when the corresponding value
+// is not present in the output and starts Status at "FAILED".
+type HPLResult struct {
+	N         int     // matrix dimension
+	NB        int     // block size
+	P         int     // process grid rows
+	Q         int     // process grid cols
+	TimeSec   float64 // wall time in seconds
+	GFlops    float64 // achieved performance
+	Residual  float64 // backward error residual (from HPL verification line)
+	Status    string  // "PASSED" or "FAILED"
+	RawOutput string  // full xhpl output
+}
+
+// applyHPLDefaults normalizes opts in place: a MemFraction outside (0, 1]
+// becomes 0.80 and a non-positive NB becomes 256.
+func applyHPLDefaults(opts *HPLOptions) {
+	const (
+		defaultMemFraction = 0.80
+		defaultNB          = 256
+	)
+	if frac := opts.MemFraction; frac <= 0 || frac > 1 {
+		opts.MemFraction = defaultMemFraction
+	}
+	if nb := opts.NB; nb <= 0 {
+		opts.NB = defaultNB
+	}
+}
+
+// RunHPL runs bee-hpl and returns parsed results plus a tar.gz artifact path.
+//
+// A run directory "hpl-<UTC timestamp>" is created under baseDir (default
+// /var/log/bee-sat) and receives hpl.log and summary.txt; the directory is
+// then packed into "hpl-<UTC timestamp>.tar.gz" under baseDir. If packing
+// fails, the failure is reported through logFunc and the run directory path
+// is returned in place of the archive path.
+//
+// A non-nil error with an empty path is returned when the command fails for
+// any reason other than context cancellation, or when it succeeds but no
+// positive Gflops figure could be parsed; the parsed result is still returned
+// in both cases. On cancellation whatever output exists is summarized and
+// archived, and the cancellation error is passed through to the caller.
+func (s *System) RunHPL(ctx context.Context, baseDir string, opts HPLOptions, logFunc func(string)) (string, *HPLResult, error) {
+	applyHPLDefaults(&opts)
+
+	if baseDir == "" {
+		baseDir = "/var/log/bee-sat"
+	}
+	ts := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "hpl-"+ts)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", nil, fmt.Errorf("mkdir %s: %w", runDir, err)
+	}
+
+	logPath := filepath.Join(runDir, "hpl.log")
+
+	cmd := []string{
+		"bee-hpl",
+		"--mem-fraction", strconv.FormatFloat(opts.MemFraction, 'f', 2, 64),
+		"--nb", strconv.Itoa(opts.NB),
+	}
+
+	if logFunc != nil {
+		logFunc(fmt.Sprintf("HPL: N will be auto-sized to %.0f%% of RAM, NB=%d", opts.MemFraction*100, opts.NB))
+	}
+
+	out, err := runSATCommandCtx(ctx, "", "hpl", cmd, nil, logFunc)
+	// Best effort: the raw output is also returned in result.RawOutput, so a
+	// failed log write must not abort the run.
+	_ = os.WriteFile(logPath, out, 0644)
+
+	result := parseHPLOutput(string(out))
+	result.RawOutput = string(out)
+
+	// errors.Is also recognizes a cancellation that was wrapped on the way up.
+	if err != nil && !errors.Is(err, context.Canceled) {
+		return "", result, fmt.Errorf("bee-hpl failed: %w", err)
+	}
+	if err == nil && result.GFlops <= 0 {
+		return "", result, fmt.Errorf("HPL completed but no Gflops result found in output")
+	}
+
+	// Write summary (best effort, same reasoning as for hpl.log).
+	summary := fmt.Sprintf("N=%d NB=%d time=%.2fs gflops=%.3f status=%s\n",
+		result.N, result.NB, result.TimeSec, result.GFlops, result.Status)
+	_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
+
+	if logFunc != nil {
+		logFunc(fmt.Sprintf("HPL result: N=%d NB=%d %.2fs %.3f Gflops %s",
+			result.N, result.NB, result.TimeSec, result.GFlops, result.Status))
+	}
+
+	ts2 := time.Now().UTC().Format("20060102-150405")
+	archive := filepath.Join(baseDir, "hpl-"+ts2+".tar.gz")
+	if archErr := createTarGz(archive, runDir); archErr != nil {
+		// Fall back to the unpacked run directory, but say why the archive
+		// is missing instead of dropping the error silently.
+		if logFunc != nil {
+			logFunc(fmt.Sprintf("HPL: creating archive %s failed: %v; results kept in %s", archive, archErr, runDir))
+		}
+		return runDir, result, err
+	}
+	return archive, result, err
+}
+
+// parseHPLOutput extracts N, NB, P, Q, time, Gflops, the verification status
+// and the residual from standard HPL output.
+//
+// HPL prints a result line of the form:
+//
+//	WR00L2L2       45312   256     1     1        1234.56             5.678e+01
+//	T/V               N    NB     P     Q           Time                 Gflops
+//
+// and a verification line of the form:
+//
+//	||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0035272 ...... PASSED
+//
+// The returned result is never nil. Status starts as "FAILED" and becomes
+// "PASSED" once a line containing PASSED is seen; fields whose text cannot be
+// parsed are left at zero.
+func parseHPLOutput(output string) *HPLResult {
+	result := &HPLResult{Status: "FAILED"}
+	for _, line := range strings.Split(output, "\n") {
+		line = strings.TrimSpace(line)
+		// Result line starts with WR
+		if strings.HasPrefix(line, "WR") {
+			fields := strings.Fields(line)
+			// WR00L2L2  N  NB  P  Q  Time  Gflops
+			if len(fields) >= 7 {
+				// Conversion errors are ignored on purpose: an unparsable
+				// column simply yields the zero value.
+				result.N, _ = strconv.Atoi(fields[1])
+				result.NB, _ = strconv.Atoi(fields[2])
+				result.P, _ = strconv.Atoi(fields[3])
+				result.Q, _ = strconv.Atoi(fields[4])
+				result.TimeSec, _ = strconv.ParseFloat(fields[5], 64)
+				result.GFlops, _ = strconv.ParseFloat(fields[6], 64)
+			}
+		}
+		// Verification line: "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ... PASSED"
+		if strings.Contains(line, "PASSED") {
+			result.Status = "PASSED"
+			fields := strings.Fields(line)
+			for i, f := range fields {
+				if f != "PASSED" {
+					continue
+				}
+				// HPL separates the residual from the verdict with a "......"
+				// filler, so the token right before PASSED is usually not the
+				// number. Walk backwards to the nearest token that parses as
+				// a float, also accepting a value glued to the trailing '='.
+				for j := i - 1; j >= 0; j-- {
+					tok := fields[j]
+					if eq := strings.LastIndexByte(tok, '='); eq >= 0 {
+						tok = tok[eq+1:]
+					}
+					if v, err := strconv.ParseFloat(tok, 64); err == nil {
+						result.Residual = v
+						break
+					}
+				}
+			}
+		}
+	}
+	return result
+}
+
+// hplAvailable reports whether bee-hpl can be resolved through PATH and a file
+// exists at /usr/local/lib/bee/xhpl. Only the existence of xhpl is checked,
+// not its mode.
+func hplAvailable() bool {
+	_, lookErr := exec.LookPath("bee-hpl")
+	if lookErr != nil {
+		return false
+	}
+	if _, statErr := os.Stat("/usr/local/lib/bee/xhpl"); statErr != nil {
+		return false
+	}
+	return true
+}
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -116,25 +116,47 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
 	if err := ctx.Err(); err != nil {
 		return err
 	}
-	if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
-		log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
+
+	mediumRebound := false
+	if err := bindMount(dstDir, "/run/live/medium"); err != nil {
+		log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
+	} else {
+		mediumRebound = true
 	}

 	log("Verifying live medium now served from RAM...")
 	status := s.LiveBootSource()
-	if err := verifyInstallToRAMStatus(status); err != nil {
+	if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
 		return err
 	}
-	log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
-	log("Done. Installation media can be safely disconnected.")
+	if status.InRAM {
+		log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
+	}
+	log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
 	return nil
 }

-func verifyInstallToRAMStatus(status LiveBootSource) error {
+// verifyInstallToRAMStatus decides whether an install-to-RAM run can be
+// treated as successful.
+//
+// It returns nil when status.InRAM is set, or when at least one *.squashfs
+// file exists in dstDir. In the second case, if mediumRebound is false, two
+// explanatory notes are sent to log (log may be nil, in which case the notes
+// are skipped). Otherwise an error naming the current live medium source and
+// dstDir is returned.
+func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
 	if status.InRAM {
 		return nil
 	}
-	return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
+
+	// The live medium mount was not redirected to RAM. This is expected when
+	// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
+	// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
+	// because the CD-ROM mount is in use. Check whether files were at least
+	// copied to the tmpfs directory — that is sufficient for safe disconnection
+	// once the kernel has paged in all actively-used data.
+	//
+	// A Glob error (malformed pattern) is treated the same as "no files".
+	files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
+	if len(files) > 0 {
+		// Callers such as the unit test pass a nil log; never call through it.
+		if !mediumRebound && log != nil {
+			log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
+			log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
+		}
+		return nil
+	}
+
+	return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
 }

 func describeLiveBootSource(status LiveBootSource) string {
@@ -247,7 +269,31 @@ func findLoopForFile(backingFile string) (string, error) {
 	return "", fmt.Errorf("no loop device found for %s", backingFile)
 }

+// loopDeviceOffset runs `losetup --json <loopDev>` and returns the offset of
+// the first listed loop device. It returns -1 when the command fails, when
+// its output cannot be decoded, or when no device is listed.
+func loopDeviceOffset(loopDev string) int64 {
+	const unknown = -1
+
+	type loopEntry struct {
+		Offset int64 `json:"offset"`
+	}
+	var report struct {
+		Loopdevices []loopEntry `json:"loopdevices"`
+	}
+
+	raw, err := exec.Command("losetup", "--json", loopDev).Output()
+	if err != nil {
+		return unknown
+	}
+	if json.Unmarshal(raw, &report) != nil {
+		return unknown
+	}
+	if len(report.Loopdevices) == 0 {
+		return unknown
+	}
+	return report.Loopdevices[0].Offset
+}
+
 func reassociateLoopDevice(loopDev, newFile string) error {
+	// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
+	// typically set up with a non-zero offset (squashfs lives inside the ISO),
+	// so the ioctl returns EINVAL. Detect this early for a clear error message.
+	if off := loopDeviceOffset(loopDev); off > 0 {
+		return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
+	}
 	if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
 		return nil
 	}
--- a/audit/internal/platform/install_to_ram_linux.go
+++ b/audit/internal/platform/install_to_ram_linux.go
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
 	}
 	return nil
 }
+
+// bindMount bind-mounts src onto dst by calling mount(2) directly, so no
+// external mount binary has to be resolved through PATH.
+func bindMount(src, dst string) error {
+	const (
+		fsType = "" // passed empty for the bind mount
+		data   = "" // no mount options
+	)
+	var flags uintptr = syscall.MS_BIND
+	return syscall.Mount(src, dst, fsType, flags, data)
+}
--- a/audit/internal/platform/install_to_ram_other.go
+++ b/audit/internal/platform/install_to_ram_other.go
@@ -7,3 +7,7 @@ import "errors"
 func loopChangeFD(loopDev, newFile string) error {
 	return errors.New("LOOP_CHANGE_FD not available on this platform")
 }
+
+// bindMount is the stub used where bind mounts are unsupported; it ignores
+// its arguments and always returns an error.
+func bindMount(src, dst string) error {
+	unsupported := errors.New("bind mount not available on this platform")
+	return unsupported
+}
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -33,14 +33,17 @@ func TestInferLiveBootKind(t *testing.T) {
 func TestVerifyInstallToRAMStatus(t *testing.T) {
 	t.Parallel()

-	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
+	dstDir := t.TempDir()
+
+	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
 		t.Fatalf("expected success for RAM-backed status, got %v", err)
 	}
-	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
+
+	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
 	if err == nil {
 		t.Fatal("expected verification failure when media is still on USB")
 	}
-	if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
+	if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
 		t.Fatalf("error=%q", got)
 	}
 }
--- a/audit/internal/platform/kill_workers.go
+++ b/audit/internal/platform/kill_workers.go
@@ -15,6 +15,10 @@ var workerPatterns = []string{
 	"stress-ng",
 	"stressapptest",
 	"memtester",
+	// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
+	// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
+	"nvvs",
+	"dcgmi",
 }

 // KilledProcess describes a process that was sent SIGKILL.
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
 		return "", err
 	}

-	return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
+	return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
 		job,
-		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func nvidiaStressArchivePrefix(loader string) string {
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
 			wg.Add(1)
 			go func() {
 				defer wg.Done()
-				gpuCmd := buildGPUStressCmd(loadCtx, vendor)
+				gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
 				if gpuCmd == nil {
 					return
 				}
@@ -392,6 +392,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
 		cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
 	}
 	cmd := exec.CommandContext(ctx, path, cmdArgs...)
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	if err := startLowPriorityCmd(cmd, 15); err != nil {
@@ -402,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {

 // buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
 // Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
-func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
+func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
 	switch strings.ToLower(vendor) {
 	case "amd":
-		return buildAMDGPUStressCmd(ctx)
+		return buildAMDGPUStressCmd(ctx, durSec)
 	case "nvidia":
-		return buildNvidiaGPUStressCmd(ctx)
+		return buildNvidiaGPUStressCmd(ctx, durSec)
 	}
 	return nil
 }

-func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	rvsArgs, err := resolveRVSCommand()
 	if err != nil {
 		return nil
 	}
 	rvsPath := rvsArgs[0]
-	cfg := `actions:
+	cfg := fmt.Sprintf(`actions:
 - name: gst_platform
  device: all
  module: gst
  parallel: true
-  duration: 86400000
+  duration: %d`, durSec*1000) + `
  copy_matrix: false
  target_stress: 90
  matrix_size_a: 8640
@@ -433,13 +440,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
 	cfgFile := "/tmp/bee-platform-gst.conf"
 	_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
 	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	_ = startLowPriorityCmd(cmd, 10)
 	return cmd
 }

-func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	path, err := satLookPath("bee-gpu-burn")
 	if err != nil {
 		path, err = satLookPath("bee-gpu-stress")
@@ -447,7 +461,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
 	if err != nil {
 		return nil
 	}
-	cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
+	// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
+	// Process group kill via Setpgid+Cancel is kept as a safety net for cases
+	// where the context is cancelled early (user stop, parent timeout).
+	cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		}
+		return nil
+	}
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	_ = startLowPriorityCmd(cmd, 10)
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -173,6 +173,22 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe

 	switch vendor {
 	case "nvidia":
+		if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
+			health.NvidiaGSPMode = strings.TrimSpace(string(raw))
+			if health.NvidiaGSPMode == "gsp-stuck" {
+				health.Issues = append(health.Issues, schema.RuntimeIssue{
+					Code:        "nvidia_gsp_stuck",
+					Severity:    "critical",
+					Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
+				})
+			} else if health.NvidiaGSPMode == "gsp-off" {
+				health.Issues = append(health.Issues, schema.RuntimeIssue{
+					Code:        "nvidia_gsp_disabled",
+					Severity:    "warning",
+					Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
+				})
+			}
+		}
 		health.DriverReady = strings.Contains(lsmodText, "nvidia ")
 		if !health.DriverReady {
 			health.Issues = append(health.Issues, schema.RuntimeIssue{
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -21,10 +21,11 @@ import (
 )

 var (
-	satExecCommand = exec.Command
-	satLookPath    = exec.LookPath
-	satGlob        = filepath.Glob
-	satStat        = os.Stat
+	satExecCommand  = exec.Command
+	satLookPath     = exec.LookPath
+	satGlob         = filepath.Glob
+	satStat         = os.Stat
+	satFreeMemBytes = freeMemBytes

 	rocmSMIExecutableGlobs = []string{
 		"/opt/rocm/bin/rocm-smi",
@@ -87,6 +88,37 @@ type NvidiaGPU struct {
 	MemoryMB int    `json:"memory_mb"`
 }

+// NvidiaGPUStatus is one row of the per-GPU status list built by
+// ListNvidiaGPUStatuses from nvidia-smi CSV output.
+//
+// Status is "OK", "RESET_REQUIRED" (NeedsReset is then true) or "UNKNOWN".
+// "UNKNOWN" rows have ParseFailure set and carry only RawLine, because the
+// line had fewer than four columns or a non-numeric index. BDF holds the PCI
+// bus ID after normalizeNvidiaBusID; RawLine is the trimmed CSV line the row
+// was parsed from.
+type NvidiaGPUStatus struct {
+	Index        int    `json:"index"`
+	Name         string `json:"name"`
+	BDF          string `json:"bdf,omitempty"`
+	Serial       string `json:"serial,omitempty"`
+	Status       string `json:"status"`
+	RawLine      string `json:"raw_line,omitempty"`
+	NeedsReset   bool   `json:"needs_reset"`
+	ParseFailure bool   `json:"parse_failure,omitempty"`
+}
+
+// nvidiaGPUHealth is an internal per-GPU health record.
+//
+// NOTE(review): the code that fills and reads this type is outside this view.
+// The fields presumably mirror the same-named NvidiaGPUStatus fields
+// (reset-required flag, raw nvidia-smi line, parse-failure marker) — confirm
+// against the users of this type.
+type nvidiaGPUHealth struct {
+	Index        int
+	Name         string
+	NeedsReset   bool
+	RawLine      string
+	ParseFailure bool
+}
+
+// nvidiaGPUStatusFile is an internal per-GPU record of a run's outcome.
+//
+// NOTE(review): the writer and reader of this type are outside this view.
+// Judging by the names it presumably pairs a run verdict (RunStatus, Reason,
+// FailingJob) with a health verdict (Health, HealthRaw) and flags for whether
+// the GPU was selected for and observed during the run — confirm against the
+// code that produces it.
+type nvidiaGPUStatusFile struct {
+	Index      int
+	Name       string
+	RunStatus  string
+	Reason     string
+	Health     string
+	HealthRaw  string
+	Observed   bool
+	Selected   bool
+	FailingJob string
+}
+
 // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
 type AMDGPUInfo struct {
 	Index int    `json:"index"`
@@ -262,9 +294,78 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
 			MemoryMB: memMB,
 		})
 	}
+	sort.Slice(gpus, func(i, j int) bool {
+		return gpus[i].Index < gpus[j].Index
+	})
 	return gpus, nil
 }

+// ListNvidiaGPUStatuses queries nvidia-smi in CSV mode and returns one status
+// entry per non-empty output line, sorted by GPU index.
+//
+// A line with fewer than four comma-separated columns, or whose first column
+// is not an integer, yields an entry with Status "UNKNOWN" and ParseFailure
+// set that carries only the raw line. A line containing "GPU REQUIRES RESET"
+// (case-insensitive) is reported as "RESET_REQUIRED" with NeedsReset set;
+// every other line is reported as "OK". An nvidia-smi failure is returned
+// wrapped with the prefix "nvidia-smi: ".
+func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
+	raw, err := satExecCommand(
+		"nvidia-smi",
+		"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
+		"--format=csv,noheader,nounits",
+	).Output()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia-smi: %w", err)
+	}
+
+	// unparsed builds the placeholder entry for a line that cannot be decoded.
+	unparsed := func(row string) NvidiaGPUStatus {
+		return NvidiaGPUStatus{RawLine: row, Status: "UNKNOWN", ParseFailure: true}
+	}
+
+	var statuses []NvidiaGPUStatus
+	for _, row := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
+		row = strings.TrimSpace(row)
+		if row == "" {
+			continue
+		}
+		cols := strings.Split(row, ",")
+		if len(cols) < 4 {
+			statuses = append(statuses, unparsed(row))
+			continue
+		}
+		index, convErr := strconv.Atoi(strings.TrimSpace(cols[0]))
+		if convErr != nil {
+			statuses = append(statuses, unparsed(row))
+			continue
+		}
+
+		entry := NvidiaGPUStatus{
+			Index:   index,
+			Name:    strings.TrimSpace(cols[1]),
+			BDF:     normalizeNvidiaBusID(strings.TrimSpace(cols[2])),
+			Serial:  strings.TrimSpace(cols[3]),
+			Status:  "OK",
+			RawLine: row,
+		}
+		if strings.Contains(strings.ToUpper(row), "GPU REQUIRES RESET") {
+			entry.NeedsReset = true
+			entry.Status = "RESET_REQUIRED"
+		}
+		statuses = append(statuses, entry)
+	}
+
+	sort.Slice(statuses, func(a, b int) bool {
+		return statuses[a].Index < statuses[b].Index
+	})
+	return statuses, nil
+}
+
+// normalizeNvidiaBusID lower-cases and trims a PCI bus ID. When the ID has
+// exactly three colon-separated parts and the first part is longer than four
+// characters, only the last four characters of that part are kept, e.g.
+// "00000000:3B:00.0" becomes "0000:3b:00.0". Any other input is returned
+// lower-cased and trimmed.
+func normalizeNvidiaBusID(v string) string {
+	id := strings.TrimSpace(strings.ToLower(v))
+	segments := strings.Split(id, ":")
+	if len(segments) != 3 {
+		return id
+	}
+	domain := segments[0]
+	if len(domain) <= 4 {
+		return id
+	}
+	return domain[len(domain)-4:] + ":" + segments[1] + ":" + segments[2]
+}
+
+// ResetNvidiaGPU runs `nvidia-smi -r -i <index>` and returns its combined
+// stdout/stderr together with the command error. A negative index is rejected
+// before anything is executed. When the command succeeds without printing
+// anything, "GPU reset completed.\n" is returned instead of an empty string.
+func (s *System) ResetNvidiaGPU(index int) (string, error) {
+	if index < 0 {
+		return "", fmt.Errorf("gpu index must be >= 0")
+	}
+	output, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
+	if err == nil && len(output) == 0 {
+		return "GPU reset completed.\n", nil
+	}
+	return string(output), err
+}
+
 // RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
 func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
@@ -274,13 +375,13 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-all-reduce-perf.log", cmd: []string{
+	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
 		}},
-	}, logFunc)
+	), logFunc)
 }

 func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -292,18 +393,18 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
 	if err != nil {
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
-		{
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
+		satJob{
 			name:       "03-dcgmproftester.log",
 			cmd:        profCmd,
 			env:        nvidiaVisibleDevicesEnv(selected),
 			collectGPU: true,
 			gpuIndices: selected,
 		},
-		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -311,16 +412,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
 	if err != nil {
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{
 			name:       "02-dcgmi-targeted-power.log",
 			cmd:        nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
 			collectGPU: true,
 			gpuIndices: selected,
 		},
-		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -328,16 +429,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
 	if err != nil {
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{
 			name:       "02-dcgmi-pulse-test.log",
 			cmd:        nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
 			collectGPU: true,
 			gpuIndices: selected,
 		},
-		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -345,16 +446,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
 	if err != nil {
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{
 			name:       "02-dcgmi-nvbandwidth.log",
 			cmd:        nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
 			collectGPU: true,
 			gpuIndices: selected,
 		},
-		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
@@ -378,16 +479,23 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
 	if err != nil {
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{
 			name:       "02-dcgmi-targeted-stress.log",
 			cmd:        nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
 			collectGPU: true,
 			gpuIndices: selected,
 		},
-		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	), logFunc)
 }

 func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
@@ -404,9 +512,32 @@ func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
 	return all, nil
 }

-func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-	sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
-	passes := envInt("BEE_MEMTESTER_PASSES", 1)
+// memoryStressSizeArg returns the size argument for the memory stress job.
+//
+// A positive BEE_VM_STRESS_SIZE_MB wins and is returned as "<n>M". Otherwise
+// the size is two thirds of the amount reported by satFreeMemBytes, in MiB,
+// rounded down to a multiple of 256 once it reaches 256. If no free-memory
+// figure is available or the result is not positive, "80%" is returned.
+func memoryStressSizeArg() string {
+	const fallback = "80%"
+
+	if overrideMB := envInt("BEE_VM_STRESS_SIZE_MB", 0); overrideMB > 0 {
+		return fmt.Sprintf("%dM", overrideMB)
+	}
+
+	free := satFreeMemBytes()
+	if free <= 0 {
+		return fallback
+	}
+
+	sizeMB := free / (1024 * 1024) * 2 / 3
+	if sizeMB >= 256 {
+		// Drop the remainder so the size is a whole number of 256 MiB steps.
+		sizeMB -= sizeMB % 256
+	}
+	if sizeMB <= 0 {
+		return fallback
+	}
+	return fmt.Sprintf("%dM", sizeMB)
+}
+
+func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
+	if sizeMB <= 0 {
+		sizeMB = 256
+	}
+	if passes <= 0 {
+		passes = 1
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
@@ -419,11 +550,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
 	if seconds <= 0 {
 		seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
 	}
-	// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
-	sizeArg := "80%"
-	if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
-		sizeArg = fmt.Sprintf("%dM", mb)
-	}
+	// Base the default on current MemAvailable and keep headroom for the OS and
+	// concurrent stressors so mixed burn runs do not trip the OOM killer.
+	sizeArg := memoryStressSizeArg()
 	return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-stress-ng-vm.log", cmd: []string{
@@ -465,7 +594,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
 	}, logFunc)
 }

-func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
 	if baseDir == "" {
 		baseDir = "/var/log/bee-sat"
 	}
@@ -497,7 +626,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
 			break
 		}
 		prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
-		commands := storageSATCommands(devPath)
+		commands := storageSATCommands(devPath, extended)
 		for cmdIndex, job := range commands {
 			if ctx.Err() != nil {
 				break
@@ -540,14 +669,24 @@ type satStats struct {
 	Unsupported int
 }

+// withNvidiaPersistenceMode returns a new job list that runs
+// "nvidia-smi -pm 1" as job 00 ahead of the given jobs.
+func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
+	preflight := satJob{
+		name: "00-nvidia-smi-persistence-mode.log",
+		cmd:  []string{"nvidia-smi", "-pm", "1"},
+	}
+	return append([]satJob{preflight}, jobs...)
+}
+
 func nvidiaSATJobs() []satJob {
-	return []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
-		{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
-		{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
-		{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
-	}
+	return withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
+		satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
+		satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
+		satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
+	)
 }

 func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
@@ -562,12 +701,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
 		}
 		diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
 	}
-	return []satJob{
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
-		{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
-		{name: "04-dcgmi-diag.log", cmd: diagArgs},
-	}
+	return withNvidiaPersistenceMode(
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
+		satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
+		satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
+	)
 }

 func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
@@ -592,7 +731,10 @@ func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
 	if len(gpuIndices) == 0 {
 		return nil
 	}
-	return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
+	return []string{
+		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
+		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
+	}
 }

 func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
@@ -611,11 +753,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa

 	var summary strings.Builder
 	stats := satStats{}
+	nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
+	perGPU := map[int]*nvidiaGPUStatusFile{}
+	selectedGPUIndices := map[int]struct{}{}
 	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
 	for _, job := range jobs {
 		if ctx.Err() != nil {
 			break
 		}
+		for _, idx := range job.gpuIndices {
+			selectedGPUIndices[idx] = struct{}{}
+			status := perGPU[idx]
+			if status == nil {
+				status = &nvidiaGPUStatusFile{Index: idx}
+				perGPU[idx] = status
+			}
+			status.Selected = true
+		}
 		cmd := make([]string, 0, len(job.cmd))
 		for _, arg := range job.cmd {
 			cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
@@ -624,17 +778,52 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		var out []byte
 		var err error

-		if job.collectGPU {
-			out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
-		} else {
-			out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
+		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
+			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
+				if logFunc != nil {
+					logFunc(msg)
+				}
+				out = []byte(msg + "\n")
+				err = healthErr
+			}
+		}
+
+		if err == nil {
+			if job.collectGPU {
+				out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
+			} else {
+				out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
+			}
+		}
+
+		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
+			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
+				if logFunc != nil {
+					logFunc(msg)
+				}
+				if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
+					out = append(out, '\n')
+				}
+				out = append(out, []byte(msg+"\n")...)
+				if err == nil {
+					err = healthErr
+				}
+			}
 		}

 		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
 			return "", writeErr
 		}
+		if ctx.Err() != nil {
+			return "", ctx.Err()
+		}
 		status, rc := classifySATResult(job.name, out, err)
 		stats.Add(status)
+		if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
+			for _, idx := range job.gpuIndices {
+				updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
+			}
+		}
 		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
 		fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
 		fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
@@ -643,6 +832,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
 		return "", err
 	}
+	if nvidiaPack {
+		if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
+			return "", err
+		}
+	}

 	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
 	if err := createTarGz(archive, runDir); err != nil {
@@ -651,6 +845,197 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 	return archive, nil
 }

+// updateNvidiaGPUStatus stores a job result for GPU idx unless a more severe one is already recorded.
+// NOTE(review): OK results also fill FailingJob and Reason; confirm that is intended.
+func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
+	if perGPU[idx] == nil {
+		perGPU[idx] = &nvidiaGPUStatusFile{Index: idx}
+	}
+	entry := perGPU[idx]
+	if nvidiaSATStatusSeverity(status) < nvidiaSATStatusSeverity(entry.RunStatus) {
+		return
+	}
+	entry.RunStatus, entry.FailingJob, entry.Reason = status, jobName, firstLine(detail)
+}
+
+// writeNvidiaGPUStatusFiles writes one gpu-<index>-status.txt into runDir for
+// every GPU that appears in perGPU, in selected, or in the nvidia-smi health
+// listing.
+//
+// perGPU carries the per-job results and is updated in place; entries without
+// a run status inherit overall. The health query is best effort: if it fails,
+// health_status is written as UNKNOWN. Only a failed file write is returned as
+// an error.
+func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
+	entryFor := func(idx int) *nvidiaGPUStatusFile {
+		if perGPU[idx] == nil {
+			perGPU[idx] = &nvidiaGPUStatusFile{Index: idx}
+		}
+		return perGPU[idx]
+	}
+	if health, err := readNvidiaGPUHealth(); err == nil {
+		for _, gpu := range health {
+			if gpu.ParseFailure {
+				continue // unparsed lines have no real index; do not merge them into GPU 0
+			}
+			entry := entryFor(gpu.Index)
+			entry.Name, entry.Observed, entry.HealthRaw = gpu.Name, true, gpu.RawLine
+			entry.Health = "OK"
+			if gpu.NeedsReset {
+				// FAILED has the highest severity, so it replaces any earlier run status.
+				entry.Health, entry.RunStatus = "RESET_REQUIRED", "FAILED"
+				if strings.TrimSpace(entry.Reason) == "" {
+					entry.Reason = "GPU requires reset"
+				}
+			}
+		}
+	}
+	for idx := range selected {
+		entryFor(idx).Selected = true
+	}
+	indices := make([]int, 0, len(perGPU))
+	for idx := range perGPU {
+		indices = append(indices, idx)
+	}
+	sort.Ints(indices) // map order is random; write the files in index order
+	for _, idx := range indices {
+		entry := perGPU[idx]
+		if entry.RunStatus == "" {
+			entry.RunStatus = overall
+		}
+		if entry.Health == "" {
+			entry.Health = "UNKNOWN"
+		}
+		if entry.Name == "" {
+			entry.Name = "unknown"
+		}
+		var body strings.Builder
+		fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
+		fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
+		fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
+		fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
+		fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
+		fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
+		// Optional fields are written only when they carry text.
+		optional := [][2]string{{"failing_job", entry.FailingJob}, {"reason", entry.Reason}, {"health_raw", entry.HealthRaw}}
+		for _, kv := range optional {
+			if strings.TrimSpace(kv[1]) != "" {
+				fmt.Fprintf(&body, "%s=%s\n", kv[0], kv[1])
+			}
+		}
+		if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// nvidiaSATStatusSeverity ranks a SAT status for comparison, ignoring letter
+// case and surrounding whitespace; higher means worse.
+func nvidiaSATStatusSeverity(status string) int {
+	ranks := map[string]int{
+		"FAILED":      3,
+		"PARTIAL":     2,
+		"UNSUPPORTED": 2,
+		"OK":          1,
+	}
+	// Anything else, including the empty string, is missing and ranks 0.
+	return ranks[strings.ToUpper(strings.TrimSpace(status))]
+}
+
+// firstLine returns the first line of s with surrounding whitespace removed
+// from both the whole text and that line; a blank s yields "".
+func firstLine(s string) string {
+	trimmed := strings.TrimSpace(s)
+	head, _, multiline := strings.Cut(trimmed, "\n")
+	if !multiline {
+		return trimmed
+	}
+	return strings.TrimSpace(head)
+}
+
+// nvidiaJobNeedsHealthCheck reports whether job collects GPU metrics or has a GPU tool marker in its name.
+func nvidiaJobNeedsHealthCheck(job satJob) bool {
+	name := strings.ToLower(strings.TrimSpace(job.name))
+	for _, marker := range []string{"dcgmi", "gpu-burn", "gpu-stress", "dcgmproftester"} {
+		if strings.Contains(name, marker) {
+			return true
+		}
+	}
+	return job.collectGPU
+}
+
+// checkNvidiaJobHealth reports GPUs that nvidia-smi flags as needing a reset.
+// Only GPUs whose index is in selected are considered; an empty selection
+// considers every reported GPU. When one is found it returns a multi-line
+// report together with a non-nil error; otherwise it returns "" and nil.
+// A failed health query is also answered with "" and nil, so it never fails
+// the job by itself.
+func checkNvidiaJobHealth(selected []int) (string, error) {
+	health, err := readNvidiaGPUHealth()
+	if err != nil {
+		return "", nil
+	}
+	wanted := make(map[int]bool, len(selected))
+	for _, idx := range selected {
+		wanted[idx] = true
+	}
+	// Each offending GPU adds one line below the report header.
+	var findings strings.Builder
+	for _, gpu := range health {
+		if len(wanted) > 0 && !wanted[gpu.Index] {
+			continue
+		}
+		if gpu.NeedsReset {
+			fmt.Fprintf(&findings, "\ngpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine)
+		}
+	}
+	if findings.Len() == 0 {
+		return "", nil
+	}
+	return "NVIDIA GPU health check failed:" + findings.String(), errors.New("nvidia gpu requires reset")
+}
+
+// readNvidiaGPUHealth runs the nvidia-smi per-GPU CSV query through
+// satExecCommand and parses its standard output.
+func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
+	const fields = "index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total"
+	cmd := satExecCommand("nvidia-smi", "--query-gpu="+fields, "--format=csv,noheader,nounits")
+	raw, err := cmd.Output()
+	if err != nil {
+		return nil, fmt.Errorf("nvidia-smi: %w", err)
+	}
+	return parseNvidiaGPUHealth(string(raw)), nil
+}
+
+// parseNvidiaGPUHealth turns nvidia-smi CSV output into one record per
+// non-blank line. The first field must be an integer GPU index and the second
+// is taken as the name; NeedsReset is set when the line contains
+// "GPU requires reset" in any letter case. Lines without a comma or with a
+// non-numeric index are kept as records with only RawLine and ParseFailure
+// set.
+func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
+	var gpus []nvidiaGPUHealth
+	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
+		if line = strings.TrimSpace(line); line == "" {
+			continue
+		}
+		gpu := nvidiaGPUHealth{RawLine: line, ParseFailure: true}
+		indexField, rest, hasName := strings.Cut(line, ",")
+		if idx, err := strconv.Atoi(strings.TrimSpace(indexField)); hasName && err == nil {
+			name, _, _ := strings.Cut(rest, ",")
+			gpu = nvidiaGPUHealth{
+				Index:      idx,
+				Name:       strings.TrimSpace(name),
+				NeedsReset: strings.Contains(strings.ToUpper(line), "GPU REQUIRES RESET"),
+				RawLine:    line,
+			}
+		}
+		gpus = append(gpus, gpu)
+	}
+	return gpus
+}
+
 func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
 	start := time.Now().UTC()
 	resolvedCmd, err := resolveSATCommand(cmd)
@@ -705,17 +1090,25 @@ func listStorageDevices() ([]string, error) {
 	return parseStorageDevices(string(out)), nil
 }

-func storageSATCommands(devPath string) []satJob {
+func storageSATCommands(devPath string, extended bool) []satJob {
 	if strings.Contains(filepath.Base(devPath), "nvme") {
+		selfTestLevel := "1"
+		if extended {
+			selfTestLevel = "2"
+		}
 		return []satJob{
 			{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
 			{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
-			{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
+			{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
 		}
 	}
+	smartTestType := "short"
+	if extended {
+		smartTestType = "long"
+	}
 	return []satJob{
 		{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
-		{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
+		{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
 	}
 }

@@ -774,6 +1167,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
 		// nvidia-smi on a machine with no NVIDIA GPU
 		strings.Contains(text, "couldn't communicate with the nvidia driver") ||
 		strings.Contains(text, "no nvidia gpu") ||
+		// Some NVMe firmwares start self-test but never expose progress to nvme-cli
+		// while waiting, so the CLI stops polling without proving device failure.
+		(strings.Contains(name, "self-test") &&
+			strings.Contains(text, "no progress for") &&
+			strings.Contains(text, "stop waiting")) ||
 		(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
 		return "UNSUPPORTED", rc
 	}
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -20,7 +20,7 @@ type FanStressOptions struct {
 	Phase1DurSec int   // first load phase duration in seconds (default 300)
 	PauseSec     int   // pause between the two load phases (default 60)
 	Phase2DurSec int   // second load phase duration in seconds (default 300)
-	SizeMB       int   // GPU memory to allocate per GPU during stress (default 64)
+	SizeMB       int   // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
 	GPUIndices   []int // which GPU indices to stress (empty = all detected)
 }

@@ -243,9 +243,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
 	if opts.Phase2DurSec <= 0 {
 		opts.Phase2DurSec = 300
 	}
-	if opts.SizeMB <= 0 {
-		opts.SizeMB = 64
-	}
+	// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
+	// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
 }

 // sampleFanStressRow collects all metrics for one telemetry sample.
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -1,23 +1,25 @@
 package platform

 import (
+	"context"
 	"errors"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"testing"
+	"time"
 )

 func TestStorageSATCommands(t *testing.T) {
 	t.Parallel()

-	nvme := storageSATCommands("/dev/nvme0n1")
+	nvme := storageSATCommands("/dev/nvme0n1", false)
 	if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
 		t.Fatalf("unexpected nvme commands: %#v", nvme)
 	}

-	sata := storageSATCommands("/dev/sda")
+	sata := storageSATCommands("/dev/sda", false)
 	if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
 		t.Fatalf("unexpected sata commands: %#v", sata)
 	}
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {

 	jobs := nvidiaSATJobs()

-	if len(jobs) != 5 {
-		t.Fatalf("jobs=%d want 5", len(jobs))
+	if len(jobs) != 6 {
+		t.Fatalf("jobs=%d want 6", len(jobs))
 	}
-	if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
+	if got := jobs[0].cmd[0]; got != "nvidia-smi" {
+		t.Fatalf("preflight command=%q want nvidia-smi", got)
+	}
+	if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
+		t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
+	}
+	if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
 		t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
 	}
-	if got := jobs[3].cmd[1]; got != "--output-file" {
+	if got := jobs[4].cmd[1]; got != "--output-file" {
 		t.Fatalf("bug report flag=%q want --output-file", got)
 	}
 }
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {

 func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
 	jobs := nvidiaSATJobs()
-	got := jobs[4].cmd
+	got := jobs[5].cmd
 	want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
 	if len(got) != len(want) {
 		t.Fatalf("cmd len=%d want %d", len(got), len(want))
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
 	}
 }

+func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
+	jobs := nvidiaDCGMJobs(3, []int{2, 0})
+	if len(jobs) != 5 { // persistence-mode preflight plus the four DCGM pack jobs
+		t.Fatalf("jobs=%d want 5", len(jobs))
+	}
+	if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
+		t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
+	}
+	if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" { // GPU order is expected as passed in
+		t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
+	}
+}
+
 func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
 	t.Parallel()

@@ -195,6 +216,74 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
 	}
 }

+func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
+	t.Parallel()
+	// GPU 1 reports "[GPU requires reset]" where GPU 0 reports a temperature.
+	got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
+	if len(got) != 2 {
+		t.Fatalf("len=%d want 2", len(got))
+	}
+	if got[0].NeedsReset {
+		t.Fatalf("gpu0 unexpectedly marked reset-required")
+	}
+	if !got[1].NeedsReset {
+		t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
+	}
+}
+
+func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+	// Only GPU 1 is selected, and it is the one the stubbed nvidia-smi flags.
+	msg, err := checkNvidiaJobHealth([]int{1})
+	if err == nil {
+		t.Fatal("expected health check error")
+	}
+	if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
+		t.Fatalf("unexpected message: %q", msg)
+	}
+}
+
+func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
+	dir := t.TempDir()
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+	// Pre-recorded job results: GPU 0 passed, GPU 1 failed in the stress job.
+	perGPU := map[int]*nvidiaGPUStatusFile{
+		0: {Index: 0, RunStatus: "OK"},
+		1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
+	}
+	if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
+		t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
+	}
+	raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
+	if err != nil {
+		t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
+	}
+	text := string(raw)
+	if !strings.Contains(text, "run_status=FAILED") {
+		t.Fatalf("missing run status:\n%s", text)
+	}
+	if !strings.Contains(text, "health_status=RESET_REQUIRED") {
+		t.Fatalf("missing health status:\n%s", text)
+	}
+	if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
+		t.Fatalf("missing failing job:\n%s", text)
+	}
+}
+
 func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
 	oldLookPath := satLookPath
 	satLookPath = func(file string) (string, error) {
@@ -234,11 +323,14 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {

 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
-	if len(env) != 1 {
-		t.Fatalf("env len=%d want 1 (%v)", len(env), env)
+	if len(env) != 2 {
+		t.Fatalf("env len=%d want 2 (%v)", len(env), env)
 	}
-	if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
-		t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
+	if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
+		t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
+	}
+	if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
+		t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
 	}
 }

@@ -276,6 +368,37 @@ func TestEnvIntFallback(t *testing.T) {
 	}
 }

+func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
+	oldFreeMemBytes := satFreeMemBytes
+	satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
+	t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
+	// Two thirds of 96 GiB is 65536 MiB, already a multiple of 256.
+	if got := memoryStressSizeArg(); got != "65536M" {
+		t.Fatalf("sizeArg=%q want 65536M", got)
+	}
+}
+
+func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
+	oldFreeMemBytes := satFreeMemBytes
+	satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
+	t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
+	t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
+	// The environment override must win over the stubbed free-memory figure.
+	if got := memoryStressSizeArg(); got != "4096M" {
+		t.Fatalf("sizeArg=%q want 4096M", got)
+	}
+}
+
+func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
+	oldFreeMemBytes := satFreeMemBytes
+	satFreeMemBytes = func() int64 { return 0 }
+	t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
+	// A zero reading is treated as unknown and yields the percentage fallback.
+	if got := memoryStressSizeArg(); got != "80%" {
+		t.Fatalf("sizeArg=%q want 80%%", got)
+	}
+}
+
 func TestClassifySATResult(t *testing.T) {
 	tests := []struct {
 		name   string
@@ -286,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
 	}{
 		{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
 		{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
+		{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 		{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
 		{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 	}
@@ -300,6 +424,38 @@ func TestClassifySATResult(t *testing.T) {
 	}
 }

+func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
+	dir := t.TempDir()
+	ctx, cancel := context.WithCancel(context.Background())
+	t.Cleanup(cancel)
+	// Cancel the context while the pack's only job is still sleeping.
+	done := make(chan struct{})
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		cancel()
+		close(done)
+	}()
+
+	archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
+		{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
+	}, nil)
+	<-done
+	// A cancelled run must report context.Canceled and leave no archive behind.
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("err=%v want context.Canceled", err)
+	}
+	if archive != "" {
+		t.Fatalf("archive=%q want empty", archive)
+	}
+	matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
+	if globErr != nil {
+		t.Fatalf("Glob error: %v", globErr)
+	}
+	if len(matches) != 0 {
+		t.Fatalf("archives=%v want none", matches)
+	}
+}
+
 func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -61,7 +61,9 @@ func (s *System) ServiceState(name string) string {
 }

 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
-	raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
+	// bee-web runs as the bee user; sudo is required to control system services.
+	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
+	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
 	return string(raw), err
 }

--- a/audit/internal/schema/hardware.go
+++ b/audit/internal/schema/hardware.go
@@ -20,6 +20,7 @@ type RuntimeHealth struct {
 	ExportDir     string                 `json:"export_dir,omitempty"`
 	DriverReady   bool                   `json:"driver_ready,omitempty"`
 	CUDAReady     bool                   `json:"cuda_ready,omitempty"`
+	NvidiaGSPMode string                 `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
 	NetworkStatus string                 `json:"network_status,omitempty"`
 	Issues        []RuntimeIssue         `json:"issues,omitempty"`
 	Tools         []RuntimeToolStatus    `json:"tools,omitempty"`
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -11,6 +11,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"regexp"
+	"sort"
 	"strings"
 	"sync/atomic"
 	"syscall"
@@ -21,13 +22,305 @@ import (
 )

 var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
+var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) { // function variable wrapping a.ListNvidiaGPUs
+	if a == nil {
+		return nil, fmt.Errorf("app not configured") // no app to ask
+	}
+	return a.ListNvidiaGPUs()
+}
+var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, error) { // function variable wrapping a.ListNvidiaGPUStatuses
+	if a == nil {
+		return nil, fmt.Errorf("app not configured") // no app to ask
+	}
+	return a.ListNvidiaGPUStatuses()
+}

 // ── Job ID counter ────────────────────────────────────────────────────────────

 var jobCounter atomic.Uint64

-func newJobID(prefix string) string {
-	return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
+// newJobID returns an unused "TASK-NNN" id and moves the counter past it, so ids not yet enqueued are never handed out twice.
+func newJobID(_ string) string {
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	start := jobCounter.Load()
+	for offset := uint64(0); offset < 1000; offset++ {
+		if id := fmt.Sprintf("TASK-%03d", (start+offset)%1000); !taskIDInUseLocked(id) {
+			jobCounter.Store(start + offset + 1)
+			return id
+		}
+	}
+	return fmt.Sprintf("TASK-%03d", (jobCounter.Add(1)-1)%1000)
+}
+
+func taskIDInUseLocked(id string) bool { // reports whether a task in globalQueue.tasks already has this id
+	for _, t := range globalQueue.tasks {
+		if t != nil && t.ID == id { // nil entries are skipped
+			return true
+		}
+	}
+	return false // NOTE(review): the "Locked" suffix suggests callers hold globalQueue.mu, as newJobID does
+}
+
+type taskRunResponse struct {
+	TaskID    string   `json:"task_id,omitempty"`    // first reported task id
+	JobID     string   `json:"job_id,omitempty"`     // same value as TaskID
+	TaskIDs   []string `json:"task_ids,omitempty"`   // every reported task id, in order
+	JobIDs    []string `json:"job_ids,omitempty"`    // same list as TaskIDs
+	TaskCount int      `json:"task_count,omitempty"` // number of ids in TaskIDs
+}
+
+type nvidiaTaskSelection struct {
+	GPUIndices []int  // GPU indices covered by one task
+	Label      string // text appended to the task name for this selection
+}
+
+// writeTaskRunResponse replies with the ids of the given tasks via writeJSON.
+// Nil tasks and tasks with a blank id are left out. The first id is sent as
+// both task_id and job_id and the full list as both task_ids and job_ids;
+// task_count is the number of ids. With no usable task the response is the
+// zero taskRunResponse.
+func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
+	var resp taskRunResponse
+	for _, t := range tasks {
+		if t == nil || strings.TrimSpace(t.ID) == "" {
+			continue
+		}
+		resp.TaskIDs = append(resp.TaskIDs, t.ID)
+	}
+	resp.TaskCount = len(resp.TaskIDs)
+	if resp.TaskCount > 0 {
+		resp.TaskID, resp.JobID = resp.TaskIDs[0], resp.TaskIDs[0]
+		// Both list fields share the same slice.
+		resp.JobIDs = resp.TaskIDs
+	}
+	writeJSON(w, resp)
+}
+
+// shouldSplitHomogeneousNvidiaTarget reports whether target, ignoring
+// surrounding whitespace, is one of the NVIDIA targets listed below.
+func shouldSplitHomogeneousNvidiaTarget(target string) bool {
+	splittable := map[string]bool{
+		"nvidia": true, "nvidia-targeted-stress": true, "nvidia-benchmark": true,
+		"nvidia-compute": true, "nvidia-targeted-power": true, "nvidia-pulse": true,
+		"nvidia-interconnect": true, "nvidia-bandwidth": true, "nvidia-stress": true,
+	}
+	return splittable[strings.TrimSpace(target)]
+}
+
+// expandHomogeneousNvidiaSelections resolves the include/exclude filters
+// against gpus and splits the selected GPUs into one selection per model.
+//
+// An empty include list selects every GPU; otherwise only included indices
+// that exist in gpus are kept, each once, and excluded indices are removed
+// afterwards. GPUs are grouped by their trimmed name (a blank name becomes
+// "GPU <index>"). Models with two or more GPUs come first, followed by the
+// single-GPU models; within each class the order follows the lowest GPU
+// index. It fails when gpus is empty or when nothing remains selected.
+func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
+	if len(gpus) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs detected")
+	}
+	byIndex := make(map[int]platform.NvidiaGPU, len(gpus))
+	selected := make([]int, 0, len(gpus))
+	for _, gpu := range gpus {
+		byIndex[gpu.Index] = gpu
+		selected = append(selected, gpu.Index)
+	}
+
+	if len(include) > 0 {
+		// Start over from the include list; the "all GPUs" default is discarded.
+		selected = selected[:0]
+		taken := make(map[int]bool, len(include))
+		for _, idx := range include {
+			if _, known := byIndex[idx]; known && !taken[idx] {
+				taken[idx] = true
+				selected = append(selected, idx)
+			}
+		}
+	}
+	sort.Ints(selected)
+	if len(exclude) > 0 {
+		dropped := make(map[int]bool, len(exclude))
+		for _, idx := range exclude {
+			dropped[idx] = true
+		}
+		// Filter in place, reusing the backing array of selected.
+		kept := selected[:0]
+		for _, idx := range selected {
+			if !dropped[idx] {
+				kept = append(kept, idx)
+			}
+		}
+		selected = kept
+	}
+	if len(selected) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs selected")
+	}
+
+	// Group by model, remembering the order in which models first appear.
+	byModel := make(map[string][]platform.NvidiaGPU)
+	var models []string
+	for _, idx := range selected {
+		gpu := byIndex[idx]
+		model := strings.TrimSpace(gpu.Name)
+		if model == "" {
+			model = fmt.Sprintf("GPU %d", gpu.Index)
+		}
+		if len(byModel[model]) == 0 {
+			models = append(models, model)
+		}
+		byModel[model] = append(byModel[model], gpu)
+	}
+	// Every listed model has at least one GPU, so indexing [0] is safe.
+	sort.Slice(models, func(i, j int) bool {
+		return byModel[models[i]][0].Index < byModel[models[j]][0].Index
+	})
+
+	var groups, singles []nvidiaTaskSelection
+	for _, model := range models {
+		members := byModel[model]
+		sort.Slice(members, func(i, j int) bool { return members[i].Index < members[j].Index })
+		indices := make([]int, len(members))
+		for i, gpu := range members {
+			indices[i] = gpu.Index
+		}
+		// A model with a single GPU gets its own per-GPU label.
+		if len(indices) == 1 {
+			singles = append(singles, nvidiaTaskSelection{
+				GPUIndices: indices,
+				Label:      fmt.Sprintf("GPU %d — %s", indices[0], model),
+			})
+			continue
+		}
+		groups = append(groups, nvidiaTaskSelection{
+			GPUIndices: indices,
+			Label:      fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
+		})
+	}
+	return append(groups, singles...), nil
+}
+
+// joinTaskIndices renders indices as a comma-separated list such as "0,2,4".
+// fmt.Sprint prints an []int as "[0 2 4]", so the brackets are trimmed and
+// the space-separated fields are re-joined with commas; an empty or nil
+// slice yields "".
+func joinTaskIndices(indices []int) string {
+	return strings.Join(strings.Fields(strings.Trim(fmt.Sprint(indices), "[]")), ",")
+}
+
+// formatSplitTaskName returns "baseName (selectionLabel)" after trimming both, or just the non-empty one.
+func formatSplitTaskName(baseName, selectionLabel string) string {
+	baseName, selectionLabel = strings.TrimSpace(baseName), strings.TrimSpace(selectionLabel)
+	switch {
+	case baseName == "":
+		return selectionLabel
+	case selectionLabel == "":
+		return baseName
+	}
+	return baseName + " (" + selectionLabel + ")"
+}
+
+// buildNvidiaTaskSet creates the pending task(s) for one run request.
+//
+// Targets that are not split per GPU model yield a single task that carries
+// params unchanged, without querying the GPU list. For splittable targets
+// the GPU list comes from apiListNvidiaGPUs: with params.ParallelGPUs set,
+// the include/exclude filters are resolved into one explicit GPU list for a
+// single task; otherwise one task is created per selection returned by
+// expandHomogeneousNvidiaSelections, named via formatSplitTaskName. Resolved
+// tasks always carry explicit GPUIndices and no ExcludeGPUIndices.
+// Errors from the GPU listing or the selection helpers are returned as is.
+func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
+	// newTask stamps the fields shared by every task of this request.
+	newTask := func(name string, p taskParams) *Task {
+		return &Task{
+			ID:        newJobID(idPrefix),
+			Name:      name,
+			Target:    target,
+			Priority:  priority,
+			Status:    TaskPending,
+			CreatedAt: createdAt,
+			params:    p,
+		}
+	}
+	// Non-splittable targets never touch the GPU list, parallel or not.
+	if !shouldSplitHomogeneousNvidiaTarget(target) {
+		return []*Task{newTask(baseName, params)}, nil
+	}
+	gpus, err := apiListNvidiaGPUs(appRef)
+	if err != nil {
+		return nil, err
+	}
+	if params.ParallelGPUs {
+		resolved, err := expandSelectedGPUIndices(gpus, params.GPUIndices, params.ExcludeGPUIndices)
+		if err != nil {
+			return nil, err
+		}
+		params.GPUIndices, params.ExcludeGPUIndices = resolved, nil
+		return []*Task{newTask(baseName, params)}, nil
+	}
+	selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
+	if err != nil {
+		return nil, err
+	}
+	tasks := make([]*Task, 0, len(selections))
+	// Each selection gets its own copy of params and of the index slice.
+	for _, selection := range selections {
+		p := params
+		p.GPUIndices = append([]int(nil), selection.GPUIndices...)
+		p.ExcludeGPUIndices = nil
+		p.DisplayName = formatSplitTaskName(baseName, selection.Label)
+		tasks = append(tasks, newTask(p.DisplayName, p))
+	}
+	return tasks, nil
+}
+
+// expandSelectedGPUIndices returns the sorted list of selected GPU indices after
+// applying include/exclude filters, without splitting by model.
+//
+// An empty include list selects every GPU in gpus; otherwise only included
+// indices that exist in gpus are kept, each once. Excluded indices are then
+// removed, and an empty result is reported as an error.
+func expandSelectedGPUIndices(gpus []platform.NvidiaGPU, include, exclude []int) ([]int, error) {
+	known := make(map[int]bool, len(gpus))
+	selected := make([]int, 0, len(gpus))
+	for _, gpu := range gpus {
+		known[gpu.Index] = true
+		selected = append(selected, gpu.Index)
+	}
+
+	if len(include) > 0 {
+		// Start over from the include list; the "all GPUs" default is discarded.
+		selected = selected[:0]
+		taken := make(map[int]bool, len(include))
+		for _, idx := range include {
+			if known[idx] && !taken[idx] {
+				taken[idx] = true
+				selected = append(selected, idx)
+			}
+		}
+	}
+	sort.Ints(selected)
+
+	if len(exclude) > 0 {
+		dropped := make(map[int]bool, len(exclude))
+		for _, idx := range exclude {
+			dropped[idx] = true
+		}
+		// Filter in place, reusing the backing array of selected.
+		kept := selected[:0]
+		for _, idx := range selected {
+			if !dropped[idx] {
+				kept = append(kept, idx)
+			}
+		}
+		selected = kept
+	}
+	if len(selected) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs selected")
+	}
+	return selected, nil
 }

 // ── SSE helpers ───────────────────────────────────────────────────────────────
@@ -191,7 +484,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {

 		var body struct {
 			Duration           int      `json:"duration"`
-			DiagLevel          int      `json:"diag_level"`
+			StressMode         bool     `json:"stress_mode"`
 			GPUIndices         []int    `json:"gpu_indices"`
 			ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
 			Loader             string   `json:"loader"`
@@ -207,28 +500,28 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 		}

 		name := taskDisplayName(target, body.Profile, body.Loader)
-		t := &Task{
-			ID:        newJobID("sat-" + target),
-			Name:      name,
-			Target:    target,
-			Status:    TaskPending,
-			CreatedAt: time.Now(),
-			params: taskParams{
-				Duration:           body.Duration,
-				DiagLevel:          body.DiagLevel,
-				GPUIndices:         body.GPUIndices,
-				ExcludeGPUIndices:  body.ExcludeGPUIndices,
-				Loader:             body.Loader,
-				BurnProfile:        body.Profile,
-				DisplayName:        body.DisplayName,
-				PlatformComponents: body.PlatformComponents,
-			},
-		}
 		if strings.TrimSpace(body.DisplayName) != "" {
-			t.Name = body.DisplayName
+			name = body.DisplayName
 		}
-		globalQueue.enqueue(t)
-		writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
+		params := taskParams{
+			Duration:           body.Duration,
+			StressMode:         body.StressMode,
+			GPUIndices:         body.GPUIndices,
+			ExcludeGPUIndices:  body.ExcludeGPUIndices,
+			Loader:             body.Loader,
+			BurnProfile:        body.Profile,
+			DisplayName:        body.DisplayName,
+			PlatformComponents: body.PlatformComponents,
+		}
+		tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
+		if err != nil {
+			writeError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		for _, t := range tasks {
+			globalQueue.enqueue(t)
+		}
+		writeTaskRunResponse(w, tasks)
 	}
 }

@@ -244,6 +537,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 		GPUIndices        []int  `json:"gpu_indices"`
 		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
 		RunNCCL           *bool  `json:"run_nccl"`
+		ParallelGPUs      *bool  `json:"parallel_gpus"`
 		DisplayName       string `json:"display_name"`
 	}
 	if r.Body != nil {
@@ -257,27 +551,31 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 	if body.RunNCCL != nil {
 		runNCCL = *body.RunNCCL
 	}
-	t := &Task{
-		ID:        newJobID("benchmark-nvidia"),
-		Name:      taskDisplayName("nvidia-benchmark", "", ""),
-		Target:    "nvidia-benchmark",
-		Priority:  15,
-		Status:    TaskPending,
-		CreatedAt: time.Now(),
-		params: taskParams{
-			GPUIndices:        body.GPUIndices,
-			ExcludeGPUIndices: body.ExcludeGPUIndices,
-			SizeMB:            body.SizeMB,
-			BenchmarkProfile:  body.Profile,
-			RunNCCL:           runNCCL,
-			DisplayName:       body.DisplayName,
-		},
+	parallelGPUs := false
+	if body.ParallelGPUs != nil {
+		parallelGPUs = *body.ParallelGPUs
 	}
+	name := taskDisplayName("nvidia-benchmark", "", "")
 	if strings.TrimSpace(body.DisplayName) != "" {
-		t.Name = body.DisplayName
+		name = body.DisplayName
 	}
-	globalQueue.enqueue(t)
-	writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
+	tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
+		GPUIndices:        body.GPUIndices,
+		ExcludeGPUIndices: body.ExcludeGPUIndices,
+		SizeMB:            body.SizeMB,
+		BenchmarkProfile:  body.Profile,
+		RunNCCL:           runNCCL,
+		ParallelGPUs:      parallelGPUs,
+		DisplayName:       body.DisplayName,
+	}, name, h.opts.App, "benchmark-nvidia")
+	if err != nil {
+		writeError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+	for _, t := range tasks {
+		globalQueue.enqueue(t)
+	}
+	writeTaskRunResponse(w, tasks)
 }

 func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
@@ -383,11 +681,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
 		return
 	}
 	result, err := h.opts.App.ServiceActionResult(req.Name, action)
+	status := "ok"
 	if err != nil {
-		writeError(w, http.StatusInternalServerError, err.Error())
-		return
+		status = "error"
 	}
-	writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
+	// Always return 200 with output so the frontend can display the actual
+	// systemctl error message instead of a generic "exit status 1".
+	writeJSON(w, map[string]string{"status": status, "output": result.Body})
 }

 // ── Network ───────────────────────────────────────────────────────────────────
@@ -555,6 +855,42 @@ func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
 	writeJSON(w, gpus)
 }

+// handleAPIGNVIDIAGPUStatuses responds with the NVIDIA GPU status list obtained
+// from apiListNvidiaGPUStatuses. It reports 503 when no app is configured and
+// 500 when the lookup fails; a nil list is replaced by an empty slice before it
+// is handed to writeJSON.
+func (h *handler) handleAPIGNVIDIAGPUStatuses(w http.ResponseWriter, _ *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	statuses, err := apiListNvidiaGPUStatuses(h.opts.App)
+	switch {
+	case err != nil:
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	case statuses == nil:
+		statuses = []platform.NvidiaGPUStatus{}
+	}
+	writeJSON(w, statuses)
+}
+
+// handleAPIGNVIDIAReset decodes a JSON body of the form {"index": N} and asks
+// the app to reset that NVIDIA GPU. A missing app yields 503 and an undecodable
+// body yields 400. A failed reset is not reported through writeError: the JSON
+// payload carries "status" ("ok" or "error") and "output" (result.Body).
+func (h *handler) handleAPIGNVIDIAReset(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	var payload struct {
+		Index int `json:"index"`
+	}
+	decodeErr := json.NewDecoder(r.Body).Decode(&payload)
+	if decodeErr != nil {
+		writeError(w, http.StatusBadRequest, "invalid request body")
+		return
+	}
+	outcome := "error"
+	result, err := h.opts.App.ResetNvidiaGPU(payload.Index)
+	if err == nil {
+		outcome = "ok"
+	}
+	writeJSON(w, map[string]string{"status": outcome, "output": result.Body})
+}
+
 func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -1,6 +1,7 @@
 package webui

 import (
+	"encoding/json"
 	"net/http/httptest"
 	"strings"
 	"testing"
@@ -74,6 +75,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 		globalQueue.tasks = originalTasks
 		globalQueue.mu.Unlock()
 	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 3, Name: "NVIDIA H100 PCIe"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })

 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
 	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
@@ -101,6 +110,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 	}
 }

+// TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels checks that a benchmark
+// request selecting GPUs of two different models (two "H100 PCIe" and one
+// "H200 NVL") is answered with two task IDs and queued as two tasks, the first
+// with GPU indices [0 1] and the second with [2].
+func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
+	// Start from an empty global queue and restore the previous tasks afterwards.
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	// Stub the GPU listing so the handler sees a fixed mixed-model inventory.
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H200 NVL"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBenchmarkNvidiaRun(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	var resp taskRunResponse
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode response: %v", err)
+	}
+	if len(resp.TaskIDs) != 2 {
+		t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
+	}
+	// Inspect the queued tasks under the queue lock.
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
+		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
+	}
+	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
+		t.Fatalf("task[1] gpu indices=%v want [2]", got)
+	}
+}
+
+// TestHandleAPISATRunSplitsMixedNvidiaTaskSet checks that a SAT run request for
+// the "nvidia-targeted-power" target, selecting GPUs of two different models,
+// is queued as two tasks: the first with GPU indices [0 1], the second with [2].
+func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
+	// Start from an empty global queue and restore the previous tasks afterwards.
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	// Stub the GPU listing so the handler sees a fixed mixed-model inventory.
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H200 NVL"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	// Inspect the queued tasks under the queue lock.
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 2 {
+		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
+	}
+	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
+		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
+	}
+	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
+		t.Fatalf("task[1] gpu indices=%v want [2]", got)
+	}
+}
+
 func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
 	h := &handler{}
 	h.pushFanRings([]platform.FanReading{
--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -6,6 +6,7 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
 	"time"

 	"bee/audit/internal/platform"
@@ -52,6 +53,12 @@ var metricChartPalette = []string{
 	"#ffbe5c",
 }

+// gpuLabelCache holds the GPU index to model name map used when building chart
+// labels. mu guards both loadedAt (the time of the last reload) and byIndex;
+// gpuModelNameByIndex refreshes the map when it is nil or older than 30 seconds.
+var gpuLabelCache struct {
+	mu       sync.Mutex
+	loadedAt time.Time
+	byIndex  map[int]string
+}
+
 func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
 	pointCount := len(labels)
 	if len(times) > pointCount {
@@ -76,15 +83,7 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
 		}
 	}

-	mn, avg, mx := globalStats(datasets)
-	if mx > 0 {
-		title = fmt.Sprintf("%s    ↓%s  ~%s  ↑%s",
-			title,
-			chartLegendNumber(mn),
-			chartLegendNumber(avg),
-			chartLegendNumber(mx),
-		)
-	}
+	statsLabel := chartStatsLabel(datasets)

 	legendItems := []metricChartSeries{}
 	for i, name := range names {
@@ -106,7 +105,7 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data

 	var b strings.Builder
 	writeSVGOpen(&b, layout.Width, layout.Height)
-	writeChartFrame(&b, title, layout.Width, layout.Height)
+	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
 	writeTimelineIdleSpans(&b, layout, start, end, timeline)
 	writeVerticalGrid(&b, layout, times, pointCount, 8)
 	writeHorizontalGrid(&b, layout, scale)
@@ -126,21 +125,19 @@ func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, tim
 	temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
 	power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
 	coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
-	memClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
-	if temp == nil && power == nil && coreClock == nil && memClock == nil {
+	if temp == nil && power == nil && coreClock == nil {
 		return nil, false, nil
 	}
 	labels := sampleTimeLabels(samples)
 	times := sampleTimes(samples)
 	svg, err := drawGPUOverviewChartSVG(
-		fmt.Sprintf("GPU %d Overview", idx),
+		gpuDisplayLabel(idx)+" Overview",
 		labels,
 		times,
 		[]metricChartSeries{
 			{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
 			{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
 			{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
-			{Name: "Memory Clock MHz", Values: coalesceDataset(memClock, len(labels)), Color: "#5794f2", AxisTitle: "Memory MHz"},
 		},
 		timeline,
 	)
@@ -151,8 +148,8 @@ func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, tim
 }

 func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
-	if len(series) != 4 {
-		return nil, fmt.Errorf("gpu overview requires 4 series, got %d", len(series))
+	if len(series) != 3 {
+		return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
 	}
 	const (
 		width      = 1400
@@ -166,7 +163,6 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
 		leftOuterAxis  = 72
 		leftInnerAxis  = 132
 		rightInnerAxis = 1268
-		rightOuterAxis = 1328
 	)
 	layout := chartLayout{
 		Width:      width,
@@ -176,7 +172,7 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
 		PlotTop:    plotTop,
 		PlotBottom: plotBottom,
 	}
-	axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis, rightOuterAxis}
+	axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
 	pointCount := len(labels)
 	if len(times) > pointCount {
 		pointCount = len(times)
@@ -214,7 +210,7 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s

 	var b strings.Builder
 	writeSVGOpen(&b, width, height)
-	writeChartFrame(&b, title, width, height)
+	writeChartFrame(&b, title, "", width, height)
 	writeTimelineIdleSpans(&b, layout, start, end, timeline)
 	writeVerticalGrid(&b, layout, times, pointCount, 8)
 	writeHorizontalGrid(&b, layout, scales[0])
@@ -457,10 +453,14 @@ func writeSVGClose(b *strings.Builder) {
 	b.WriteString("</svg>\n")
 }

+// writeChartFrame writes the chart's rounded background rectangle and its
+// title, anchored at the horizontal middle of the canvas (y=30). When subtitle
+// is not blank, it is written as a second, smaller text line at y=50. Both
+// texts are passed through sanitizeChartText before being embedded.
-func writeChartFrame(b *strings.Builder, title string, width, height int) {
+func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
 	fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
 	fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
 		width/2, sanitizeChartText(title))
+	if strings.TrimSpace(subtitle) != "" {
+		fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
+			width/2, sanitizeChartText(subtitle))
+	}
 }

 func writePlotBorder(b *strings.Builder, layout chartLayout) {
@@ -545,7 +545,21 @@ func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Ti
 		x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
 		y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
 		fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
+		return
 	}
+	peakIdx := 0
+	peakValue := values[0]
+	for idx, value := range values[1:] {
+		if value >= peakValue {
+			peakIdx = idx + 1
+			peakValue = value
+		}
+	}
+	x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
+	y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
+	fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
+	fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
+		x, y-10, x-5, y-18, x+5, y-18, color)
 }

 func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
@@ -711,3 +725,49 @@ func valueClamp(value float64, scale chartScale) float64 {
 	}
 	return value
 }
+
+// chartStatsLabel formats the minimum, average and maximum reported by
+// globalStats for the given datasets as "min … avg … max …". It returns ""
+// when all three values are less than or equal to zero.
+func chartStatsLabel(datasets [][]float64) string {
+	lo, mean, hi := globalStats(datasets)
+	nothingToShow := lo <= 0 && mean <= 0 && hi <= 0
+	if nothingToShow {
+		return ""
+	}
+	loText := chartLegendNumber(lo)
+	meanText := chartLegendNumber(mean)
+	hiText := chartLegendNumber(hi)
+	return fmt.Sprintf("min %s   avg %s   max %s", loText, meanText, hiText)
+}
+
+// gpuDisplayLabel returns the label used for GPU idx in chart titles and
+// legends: "GPU <idx> — <model>" when gpuModelNameByIndex yields a non-empty
+// name, otherwise plain "GPU <idx>".
+func gpuDisplayLabel(idx int) string {
+	model := gpuModelNameByIndex(idx)
+	if model == "" {
+		return fmt.Sprintf("GPU %d", idx)
+	}
+	return fmt.Sprintf("GPU %d — %s", idx, model)
+}
+
+// gpuModelNameByIndex returns the trimmed model name cached for GPU idx, or ""
+// when the cache has no entry for it. The cache is reloaded through
+// loadGPUModelNames when it has never been filled or was loaded more than 30
+// seconds ago; the reload runs while gpuLabelCache.mu is held.
+func gpuModelNameByIndex(idx int) string {
+	now := time.Now()
+	gpuLabelCache.mu.Lock()
+	defer gpuLabelCache.mu.Unlock()
+
+	expired := now.Sub(gpuLabelCache.loadedAt) > 30*time.Second
+	if expired || gpuLabelCache.byIndex == nil {
+		gpuLabelCache.loadedAt = now
+		gpuLabelCache.byIndex = loadGPUModelNames()
+	}
+	return strings.TrimSpace(gpuLabelCache.byIndex[idx])
+}
+
+// loadGPUModelNames lists NVIDIA GPUs through platform.New().ListNvidiaGPUs and
+// returns a map from GPU index to trimmed model name. GPUs whose trimmed name
+// is empty are left out, and a failed listing yields an empty, non-nil map.
+func loadGPUModelNames() map[int]string {
+	names := map[int]string{}
+	gpus, err := platform.New().ListNvidiaGPUs()
+	if err == nil {
+		for _, gpu := range gpus {
+			if model := strings.TrimSpace(gpu.Name); model != "" {
+				names[gpu.Index] = model
+			}
+		}
+	}
+	return names
+}
--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -9,13 +9,14 @@ import (

 // jobState holds the output lines and completion status of an async job.
 type jobState struct {
-	lines   []string
-	done    bool
-	err     string
-	mu      sync.Mutex
-	subs    []chan string
-	cancel  func() // optional cancel function; nil if job is not cancellable
-	logPath string
+	lines        []string
+	done         bool
+	err          string
+	mu           sync.Mutex
+	subs         []chan string
+	cancel       func() // optional cancel function; nil if job is not cancellable
+	logPath      string // optional; when non-empty, append also passes each line to appendJobLog
+	serialPrefix string // optional; when non-empty, append also sends serialPrefix+line to taskSerialWriteLine
 }

 // abort cancels the job if it has a cancel function and is not yet done.
@@ -36,6 +37,9 @@ func (j *jobState) append(line string) {
 	if j.logPath != "" {
 		appendJobLog(j.logPath, line)
 	}
+	if j.serialPrefix != "" {
+		taskSerialWriteLine(j.serialPrefix + line)
+	}
 	for _, ch := range j.subs {
 		select {
 		case ch <- line:
@@ -107,8 +111,11 @@ func (m *jobManager) get(id string) (*jobState, bool) {
 	return j, ok
 }

-func newTaskJobState(logPath string) *jobState {
+func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
 	j := &jobState{logPath: logPath}
+	if len(serialPrefix) > 0 {
+		j.serialPrefix = serialPrefix[0]
+	}
 	if logPath == "" {
 		return j
 	}
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -22,6 +22,13 @@ type MetricsDB struct {
 	db *sql.DB
 }

+// Close closes the underlying database handle and returns its error. It is a
+// no-op returning nil when the receiver or its handle is nil.
+func (m *MetricsDB) Close() error {
+	if m != nil && m.db != nil {
+		return m.db.Close()
+	}
+	return nil
+}
+
 // openMetricsDB opens (or creates) the metrics database at the given path.
 func openMetricsDB(path string) (*MetricsDB, error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
@@ -164,6 +171,23 @@ func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
 	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
 }

+// LoadBetween returns the samples whose timestamps fall inside the inclusive
+// window [start, end], in chronological order. A nil receiver or a zero-valued
+// bound yields (nil, nil); bounds given in reverse order are swapped before
+// querying. The bounds are passed to the query as Unix seconds.
+func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
+	if m == nil || start.IsZero() || end.IsZero() {
+		return nil, nil
+	}
+	from, to := start, end
+	if to.Before(from) {
+		from, to = to, from
+	}
+	const query = `SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`
+	return m.loadSamples(query, from.Unix(), to.Unix())
+}
+
 // loadSamples reconstructs LiveMetricSample rows from the normalized tables.
 func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
 	rows, err := m.db.Query(query, args...)
@@ -364,9 +388,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	return cw.Error()
 }

-// Close closes the database.
-func (m *MetricsDB) Close() { _ = m.db.Close() }
-
 func nullFloat(v float64) sql.NullFloat64 {
 	return sql.NullFloat64{Float64: v, Valid: true}
 }
--- a/audit/internal/webui/metricsdb_test.go
+++ b/audit/internal/webui/metricsdb_test.go
@@ -143,3 +143,32 @@ CREATE TABLE temp_metrics (
 		t.Fatalf("MemClockMHz=%v want 2600", got)
 	}
 }
+
+// TestMetricsDBLoadBetweenFiltersWindow writes five samples one minute apart
+// and checks that LoadBetween over minutes 1..3 returns exactly the three
+// samples inside that window, with both boundary samples included.
+func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
+	db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	defer db.Close()
+
+	// Samples at base+0m .. base+4m; CPULoadPct carries the sample number.
+	base := time.Unix(1_700_000_000, 0).UTC()
+	for i := 0; i < 5; i++ {
+		if err := db.Write(platform.LiveMetricSample{
+			Timestamp:  base.Add(time.Duration(i) * time.Minute),
+			CPULoadPct: float64(i),
+		}); err != nil {
+			t.Fatalf("Write(%d): %v", i, err)
+		}
+	}
+
+	got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
+	if err != nil {
+		t.Fatalf("LoadBetween: %v", err)
+	}
+	if len(got) != 3 {
+		t.Fatalf("LoadBetween len=%d want 3", len(got))
+	}
+	if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
+		t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
+	}
+}
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
--- a/audit/internal/webui/serial_console.go
+++ b/audit/internal/webui/serial_console.go
@@ -0,0 +1,41 @@
+package webui
+
+import (
+	"fmt"
+	"os"
+	"strings"
+	"time"
+)
+
+// taskSerialWriteLine is the function used to emit task lines to the serial
+// console; it defaults to writeTaskSerialLine. NOTE(review): presumably kept as
+// a package variable so tests can substitute it — confirm against the tests.
+var taskSerialWriteLine = writeTaskSerialLine
+
+// writeTaskSerialLine emits one UTC-timestamped line to the first usable
+// console device among /dev/ttyS0, /dev/ttyS1 and /dev/console. The line is
+// trimmed first and blank lines are dropped.
+//
+// Output is best-effort: a device that cannot be opened, or that opens but
+// rejects the write, is skipped and the next candidate is tried. If no device
+// accepts the line it is silently discarded.
+func writeTaskSerialLine(line string) {
+	line = strings.TrimSpace(line)
+	if line == "" {
+		return
+	}
+	payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
+	for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
+		f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
+		if err != nil {
+			continue
+		}
+		_, writeErr := f.WriteString(payload)
+		// A close error is ignored: nothing useful can be done with it here.
+		_ = f.Close()
+		if writeErr == nil {
+			return
+		}
+		// The device node opened but the write failed; try the next candidate
+		// instead of losing the line.
+	}
+}
+
+// taskSerialPrefix builds the tag prepended to serial-console lines for a task:
+// "[task <ID> <Name>] ", or the generic "[task] " when t is nil.
+func taskSerialPrefix(t *Task) string {
+	if t != nil {
+		return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
+	}
+	return "[task] "
+}
+
+// taskSerialEvent sends event, trimmed and tagged with the task's serial
+// prefix, through taskSerialWriteLine. It does nothing when t is nil.
+func taskSerialEvent(t *Task, event string) {
+	if t == nil {
+		return
+	}
+	message := taskSerialPrefix(t) + strings.TrimSpace(event)
+	taskSerialWriteLine(message)
+}
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -221,6 +221,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// ── Infrastructure ──────────────────────────────────────────────────────
 	mux.HandleFunc("GET /healthz", h.handleHealthz)
 	mux.HandleFunc("GET /api/ready", h.handleReady)
+	mux.HandleFunc("GET /loading", func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Cache-Control", "no-store")
+		w.Header().Set("Content-Type", "text/html; charset=utf-8")
+		_, _ = w.Write([]byte(loadingPageHTML))
+	})

 	// ── Existing read-only endpoints (preserved for compatibility) ──────────
 	mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
@@ -265,6 +270,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
 	mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
 	mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
+	mux.HandleFunc("GET /api/tasks/{id}/charts", h.handleAPITaskChartsIndex)
+	mux.HandleFunc("GET /api/tasks/{id}/chart/", h.handleAPITaskChartSVG)
+	mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)

 	// Services
 	mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
@@ -294,6 +302,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
 	mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
+	mux.HandleFunc("GET /api/gpu/nvidia-status", h.handleAPIGNVIDIAGPUStatuses)
+	mux.HandleFunc("POST /api/gpu/nvidia-reset", h.handleAPIGNVIDIAReset)
 	mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)

 	// System
@@ -703,7 +713,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 		}
 		switch sub {
 		case "load":
-			title = fmt.Sprintf("GPU %d Load", idx)
+			title = gpuDisplayLabel(idx) + " Load"
 			util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
 			mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
 			if util == nil && mem == nil {
@@ -714,7 +724,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			yMin = floatPtr(0)
 			yMax = floatPtr(100)
 		case "temp":
-			title = fmt.Sprintf("GPU %d Temperature", idx)
+			title = gpuDisplayLabel(idx) + " Temperature"
 			temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
 			if temp == nil {
 				return nil, nil, nil, "", nil, nil, false
@@ -724,7 +734,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			yMin = floatPtr(0)
 			yMax = autoMax120(temp)
 		case "clock":
-			title = fmt.Sprintf("GPU %d Core Clock", idx)
+			title = gpuDisplayLabel(idx) + " Core Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
 			if clock == nil {
 				return nil, nil, nil, "", nil, nil, false
@@ -733,7 +743,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			names = []string{"Core Clock MHz"}
 			yMin, yMax = autoBounds120(clock)
 		case "memclock":
-			title = fmt.Sprintf("GPU %d Memory Clock", idx)
+			title = gpuDisplayLabel(idx) + " Memory Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
 			if clock == nil {
 				return nil, nil, nil, "", nil, nil, false
@@ -742,7 +752,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			names = []string{"Memory Clock MHz"}
 			yMin, yMax = autoBounds120(clock)
 		default:
-			title = fmt.Sprintf("GPU %d Power", idx)
+			title = gpuDisplayLabel(idx) + " Power"
 			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
 			if power == nil {
 				return nil, nil, nil, "", nil, nil, false
@@ -871,7 +881,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
 			continue
 		}
 		datasets = append(datasets, ds)
-		names = append(names, fmt.Sprintf("GPU %d", idx))
+		names = append(names, gpuDisplayLabel(idx))
 	}
 	return datasets, names
 }
@@ -1206,37 +1216,106 @@ const loadingPageHTML = `<!DOCTYPE html>
 <html lang="en">
 <head>
 <meta charset="UTF-8">
-<title>EASY-BEE</title>
+<title>EASY-BEE — Starting</title>
 <style>
 *{margin:0;padding:0;box-sizing:border-box}
 html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
-.logo{font-size:13px;line-height:1.4;color:#f6c90e;margin-bottom:48px;white-space:pre}
-.spinner{width:48px;height:48px;border:4px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 24px}
+.wrap{text-align:center;width:420px}
+.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
+.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
+.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
+.spinner.hidden{display:none}
@keyframes spin{to{transform:rotate(360deg)}}
-.status{font-size:14px;color:#a0aec0;letter-spacing:.05em}
+.status{font-size:13px;color:#a0aec0;margin-bottom:20px;min-height:18px}
+table{width:100%;border-collapse:collapse;font-size:12px;margin-bottom:20px;display:none}
+td{padding:3px 6px;text-align:left}
+td:first-child{color:#718096;width:55%}
+.ok{color:#68d391}
+.run{color:#f6c90e}
+.fail{color:#fc8181}
+.dim{color:#4a5568}
+.btn{background:#1a202c;color:#a0aec0;border:1px solid #2d3748;padding:7px 18px;font-size:12px;cursor:pointer;font-family:inherit;display:none}
+.btn:hover{border-color:#718096;color:#e2e8f0}
 </style>
 </head>
 <body>
-<div style="text-align:center">
+<div class="wrap">
  <div class="logo">  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗
  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝
  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗
  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝
  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗
  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝</div>
-  <div class="spinner"></div>
-  <div class="status" id="s">Starting up...</div>
+  <div class="subtitle">Hardware Audit LiveCD</div>
+  <div class="spinner" id="spin"></div>
+  <div class="status" id="st">Connecting to bee-web...</div>
+  <table id="tbl"></table>
+  <button class="btn" id="btn" onclick="go()">Open app now</button>
 </div>
 <script>
-function probe(){
-  fetch('/api/ready',{cache:'no-store'})
-    .then(function(r){
-      if(r.ok){window.location.replace('/');}
-      else{setTimeout(probe,1000);}
+(function(){
+var gone = false;
+// go navigates to the app root at most once (guarded by the gone flag).
+function go(){ if(!gone){gone=true;window.location.replace('/');} }
+// The button's inline onclick="go()" cannot see names local to this IIFE, so expose go globally.
+window.go=go;
+
+function icon(s){
+  if(s==='active')   return '<span class="ok">&#9679; active</span>';
+  if(s==='failed')   return '<span class="fail">&#10005; failed</span>';
+  if(s==='activating'||s==='reloading') return '<span class="run">&#9675; starting</span>';
+  if(s==='inactive') return '<span class="dim">&#9675; inactive</span>';
+  return '<span class="dim">'+s+'</span>';
+}
+
+function allSettled(svcs){
+  for(var i=0;i<svcs.length;i++){
+    var s=svcs[i].state;
+    if(s!=='active'&&s!=='failed'&&s!=='inactive') return false;
+  }
+  return true;
+}
+
+var pollTimer=null;
+
+function pollServices(){
+  fetch('/api/services',{cache:'no-store'})
+    .then(function(r){return r.json();})
+    .then(function(svcs){
+      if(!svcs||!svcs.length) return;
+      var tbl=document.getElementById('tbl');
+      tbl.style.display='';
+      var html='';
+      for(var i=0;i<svcs.length;i++)
+        html+='<tr><td>'+svcs[i].name+'</td><td>'+icon(svcs[i].state)+'</td></tr>';
+      tbl.innerHTML=html;
+      if(allSettled(svcs)){
+        clearInterval(pollTimer);
+        document.getElementById('spin').className='spinner hidden';
+        document.getElementById('st').textContent='Ready \u2014 opening...';
+        setTimeout(go,800);
+      }
    })
-    .catch(function(){setTimeout(probe,1000);});
+    .catch(function(){});
+}
+
+function probe(){
+  fetch('/healthz',{cache:'no-store'})
+    .then(function(r){
+      if(r.ok){
+        document.getElementById('st').textContent='bee-web running \u2014 checking services...';
+        document.getElementById('btn').style.display='';
+        pollServices();
+        pollTimer=setInterval(pollServices,1500);
+      } else {
+        document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
+        setTimeout(probe,500);
+      }
+    })
+    .catch(function(){
+      document.getElementById('st').textContent='Waiting for bee-web to start...';
+      setTimeout(probe,500);
+    });
 }
 probe();
+})();
 </script>
 </body>
 </html>`
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -1,6 +1,7 @@
 package webui

 import (
+	"encoding/json"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -184,15 +185,15 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
 		{
 			Timestamp: time.Now().Add(-2 * time.Minute),
 			GPUs: []platform.GPUMetricRow{
-				{GPUIndex: 0, ClockMHz: 1400, MemClockMHz: 2600},
-				{GPUIndex: 3, ClockMHz: 1500, MemClockMHz: 2800},
+				{GPUIndex: 0, ClockMHz: 1400},
+				{GPUIndex: 3, ClockMHz: 1500},
 			},
 		},
 		{
 			Timestamp: time.Now().Add(-1 * time.Minute),
 			GPUs: []platform.GPUMetricRow{
-				{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2610},
-				{GPUIndex: 3, ClockMHz: 1510, MemClockMHz: 2810},
+				{GPUIndex: 0, ClockMHz: 1410},
+				{GPUIndex: 3, ClockMHz: 1510},
 			},
 		},
 	}
@@ -210,20 +211,6 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
 	if got := datasets[1][1]; got != 1510 {
 		t.Fatalf("GPU 3 core clock=%v want 1510", got)
 	}
-
-	datasets, names, _, title, _, _, ok = chartDataFromSamples("gpu-all-memclock", samples)
-	if !ok {
-		t.Fatal("gpu-all-memclock returned ok=false")
-	}
-	if title != "GPU Memory Clock" {
-		t.Fatalf("title=%q", title)
-	}
-	if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
-		t.Fatalf("names=%v", names)
-	}
-	if got := datasets[0][0]; got != 2600 {
-		t.Fatalf("GPU 0 memory clock=%v want 2600", got)
-	}
 }

 func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
@@ -256,10 +243,10 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
 	if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
 		t.Fatalf("metrics page should include GPU core clock chart: %s", body)
 	}
-	if !strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
-		t.Fatalf("metrics page should include GPU memory clock chart: %s", body)
+	if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
+		t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
 	}
-	if !strings.Contains(body, `renderGPUOverviewCards(indices)`) {
+	if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
 		t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
 	}
 }
@@ -543,7 +530,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
-	if !strings.Contains(body, `Run Audit`) {
+	if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
 		t.Fatalf("dashboard missing run audit button: %s", body)
 	}
 	if strings.Contains(body, `No audit data`) {
@@ -585,7 +572,7 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
 	}
 }

-func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
+func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
@@ -593,8 +580,8 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
-	if !strings.Contains(body, `id="task-log-overlay"`) {
-		t.Fatalf("tasks page missing log modal overlay: %s", body)
+	if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
+		t.Fatalf("tasks page missing task report hint: %s", body)
 	}
 	if !strings.Contains(body, `_taskPageSize = 50`) {
 		t.Fatalf("tasks page missing pagination size config: %s", body)
@@ -604,7 +591,7 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
 	}
 }

-func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
+func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
@@ -612,11 +599,20 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
+	if !strings.Contains(body, `NVIDIA Self Heal`) {
+		t.Fatalf("tools page missing nvidia self heal section: %s", body)
+	}
 	if !strings.Contains(body, `Restart GPU Drivers`) {
 		t.Fatalf("tools page missing restart gpu drivers button: %s", body)
 	}
-	if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
-		t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
+	if !strings.Contains(body, `nvidiaRestartDrivers()`) {
+		t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
+	}
+	if !strings.Contains(body, `/api/gpu/nvidia-status`) {
+		t.Fatalf("tools page missing nvidia status api usage: %s", body)
+	}
+	if !strings.Contains(body, `nvidiaResetGPU(`) {
+		t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
 	}
 	if !strings.Contains(body, `id="boot-source-text"`) {
 		t.Fatalf("tools page missing boot source field: %s", body)
@@ -650,7 +646,67 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 	}
 }

-func TestValidatePageRendersNvidiaTargetedStressAction(t *testing.T) {
+// TestBenchmarkPageRendersSavedResultsTable stores one benchmark result.json
+// under <export>/bee-benchmark/<run>/ and expects GET /benchmark to show the
+// run time, both GPUs and their composite scores.
+func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
+	exportDir := filepath.Join(t.TempDir(), "export")
+	runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	if mkErr := os.MkdirAll(runDir, 0755); mkErr != nil {
+		t.Fatal(mkErr)
+	}
+
+	generatedAt := time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC)
+	saved := platform.NvidiaBenchmarkResult{
+		GeneratedAt:      generatedAt,
+		BenchmarkProfile: "standard",
+		OverallStatus:    "OK",
+		GPUs: []platform.BenchmarkGPUResult{
+			{Index: 0, Name: "NVIDIA H100 PCIe", Scores: platform.BenchmarkScorecard{CompositeScore: 1176.25}},
+			{Index: 1, Name: "NVIDIA H100 PCIe", Scores: platform.BenchmarkScorecard{CompositeScore: 1168.50}},
+		},
+	}
+	payload, err := json.Marshal(saved)
+	if err != nil {
+		t.Fatal(err)
+	}
+	resultPath := filepath.Join(runDir, "result.json")
+	if err = os.WriteFile(resultPath, payload, 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	srv := NewHandler(HandlerOptions{ExportDir: exportDir})
+	rec := httptest.NewRecorder()
+	srv.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+
+	page := rec.Body.String()
+	// The page is expected to show the timestamp in local time.
+	expected := []string{
+		`Benchmark Results`,
+		`Composite score by saved benchmark run and GPU.`,
+		`GPU #0 — NVIDIA H100 PCIe`,
+		`GPU #1 — NVIDIA H100 PCIe`,
+		`#1`,
+		generatedAt.Local().Format("2006-01-02 15:04:05"),
+		`1176.25`,
+		`1168.50`,
+	}
+	for i := range expected {
+		if !strings.Contains(page, expected[i]) {
+			t.Fatalf("benchmark page missing %q: %s", expected[i], page)
+		}
+	}
+}
+
+func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
@@ -659,9 +715,14 @@ func TestValidatePageRendersNvidiaTargetedStressAction(t *testing.T) {
 	}
 	body := rec.Body.String()
 	for _, needle := range []string{
-		`Targeted Stress`,
+		`NVIDIA GPU Targeted Stress`,
 		`nvidia-targeted-stress`,
-		`Official DCGM`,
+		`controlled NVIDIA DCGM load`,
+		`<code>dcgmi diag targeted_stress</code>`,
+		`NVIDIA GPU Selection`,
+		`All NVIDIA validate tasks use only the GPUs selected here.`,
+		`Select All`,
+		`id="sat-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("validate page missing %q: %s", needle, body)
@@ -680,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	for _, needle := range []string{
 		`NVIDIA Max Compute Load`,
 		`dcgmproftester`,
-		`targeted_stress remain in <a href="/validate">Validate</a>`,
-		`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
+		`NCCL`,
+		`Validate → Stress mode`,
 		`id="burn-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
@@ -690,37 +751,154 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	}
 }

-func TestTasksPageRendersScrollableLogModal(t *testing.T) {
+func TestTaskDetailPageRendersSavedReport(t *testing.T) {
 	dir := t.TempDir()
-	path := filepath.Join(dir, "audit.json")
 	exportDir := filepath.Join(dir, "export")
-	if err := os.MkdirAll(exportDir, 0755); err != nil {
+	reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
+	if err := os.MkdirAll(reportDir, 0755); err != nil {
 		t.Fatal(err)
 	}
-	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
+	reportPath := filepath.Join(reportDir, "report.html")
+	if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
 		t.Fatal(err)
 	}

-	handler := NewHandler(HandlerOptions{
-		Title:     "Bee Hardware Audit",
-		AuditPath: path,
-		ExportDir: exportDir,
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:             "task-1",
+		Name:           "CPU SAT",
+		Target:         "cpu",
+		Status:         TaskDone,
+		CreatedAt:      time.Now(),
+		ArtifactsDir:   reportDir,
+		ReportHTMLPath: reportPath,
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
 	})

+	handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
+
 	rec := httptest.NewRecorder()
-	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
-	if !strings.Contains(body, `height:calc(100vh - 32px)`) {
-		t.Fatalf("tasks page missing bounded log modal height: %s", body)
+	if !strings.Contains(body, `saved report`) {
+		t.Fatalf("task detail page missing saved report: %s", body)
 	}
-	if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
-		t.Fatalf("tasks page missing log modal overflow guard: %s", body)
+	if !strings.Contains(body, `Back to Tasks`) {
+		t.Fatalf("task detail page missing back link: %s", body)
 	}
-	if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
-		t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
+}
+
+// TestTaskDetailPageRendersCancelForRunningTask puts a single running task in
+// the global queue and expects its detail page to carry the cancel button,
+// the cancel handler and endpoint, and the live-charts container and index call.
+func TestTaskDetailPageRendersCancelForRunningTask(t *testing.T) {
+	globalQueue.mu.Lock()
+	previous := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "task-live-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+	}}
+	globalQueue.mu.Unlock()
+	// Put the original queue contents back once the test is over.
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = previous
+		globalQueue.mu.Unlock()
+	})
+
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/tasks/task-live-1", nil)
+	NewHandler(HandlerOptions{Title: "Bee Hardware Audit"}).ServeHTTP(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+
+	page := rec.Body.String()
+	checks := []struct {
+		needle  string
+		missing string
+	}{
+		{`Cancel</button>`, "cancel button"},
+		{`function cancelTaskDetail(id)`, "cancel handler"},
+		{`/api/tasks/' + id + '/cancel`, "cancel endpoint"},
+		{`id="task-live-charts"`, "live charts container"},
+		{`/api/tasks/' + taskId + '/charts`, "live charts index endpoint"},
+	}
+	for _, c := range checks {
+		if !strings.Contains(page, c.needle) {
+			t.Fatalf("task detail page missing %s: %s", c.missing, page)
+		}
+	}
+}
+
+// TestTaskChartSVGUsesTaskTimeWindow writes three power samples (100, 200 and
+// 300 W at -3, -2 and -1 minutes) to a temporary metrics DB, registers a
+// finished task whose start/done window covers only the last two, and expects
+// the task's server-power chart to report "min 200" and never "min 100".
+func TestTaskChartSVGUsesTaskTimeWindow(t *testing.T) {
+	dir := t.TempDir()
+	metricsPath := filepath.Join(dir, "metrics.db")
+	// Redirect task charts to the temporary DB for the duration of the test.
+	prevMetricsPath := taskReportMetricsDBPath
+	taskReportMetricsDBPath = metricsPath
+	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
+
+	db, err := openMetricsDB(metricsPath)
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	base := time.Now().UTC()
+	samples := []platform.LiveMetricSample{
+		{Timestamp: base.Add(-3 * time.Minute), PowerW: 100},
+		{Timestamp: base.Add(-2 * time.Minute), PowerW: 200},
+		{Timestamp: base.Add(-1 * time.Minute), PowerW: 300},
+	}
+	for _, sample := range samples {
+		if err := db.Write(sample); err != nil {
+			t.Fatalf("Write: %v", err)
+		}
+	}
+	// The handler re-reads the DB from disk, so a failed close must fail the
+	// test here instead of surfacing later as a confusing chart assertion.
+	if err := db.Close(); err != nil {
+		t.Fatalf("Close: %v", err)
+	}
+
+	// Window: 5s before the second sample to 5s after the third one.
+	started := base.Add(-2*time.Minute - 5*time.Second)
+	done := base.Add(-1*time.Minute + 5*time.Second)
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "task-chart-1",
+		Name:      "Power Window",
+		Target:    "cpu",
+		Status:    TaskDone,
+		CreatedAt: started.Add(-10 * time.Second),
+		StartedAt: &started,
+		DoneAt:    &done,
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+	})
+
+	handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
+	req := httptest.NewRequest(http.MethodGet, "/api/tasks/task-chart-1/chart/server-power.svg", nil)
+	req.SetPathValue("id", "task-chart-1")
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, "System Power") {
+		t.Fatalf("task chart missing expected title: %s", body)
+	}
+	if !strings.Contains(body, "min 200") {
+		t.Fatalf("task chart stats should start from in-window sample: %s", body)
+	}
+	if strings.Contains(body, "min 100") {
+		t.Fatalf("task chart should not include pre-task sample in stats: %s", body)
 	}
 }

@@ -845,3 +1023,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
 		t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
 	}
 }
+
+func TestDashboardRendersRuntimeHealthTable(t *testing.T) { // GET / must render the Runtime Health table from the fixtures written below
+	dir := t.TempDir()
+	path := filepath.Join(dir, "audit.json")
+	exportDir := filepath.Join(dir, "export")
+	if err := os.MkdirAll(exportDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil { // minimal audit snapshot
+		t.Fatal(err)
+	}
+	health := `{
+  "status":"PARTIAL",
+  "checked_at":"2026-03-16T10:00:00Z",
+  "export_dir":"/tmp/export",
+  "driver_ready":true,
+  "cuda_ready":false,
+  "network_status":"PARTIAL",
+  "issues":[
+    {"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
+    {"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
+  ],
+  "tools":[
+    {"name":"dmidecode","ok":true},
+    {"name":"nvidia-smi","ok":false}
+  ],
+  "services":[
+    {"name":"bee-web","status":"active"},
+    {"name":"bee-nvidia","status":"inactive"}
+  ]
+}`
+	if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil { // health fixture: CUDA not ready, nvidia-smi missing, bee-nvidia inactive
+		t.Fatal(err)
+	}
+	componentStatus := `[
+  {
+    "component_key":"cpu:all",
+    "status":"Warning",
+    "error_summary":"cpu SAT: FAILED",
+    "history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
+  },
+  {
+    "component_key":"memory:all",
+    "status":"OK",
+    "history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
+  },
+  {
+    "component_key":"storage:nvme0n1",
+    "status":"Critical",
+    "error_summary":"storage SAT: FAILED",
+    "history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
+  },
+  {
+    "component_key":"pcie:gpu:nvidia",
+    "status":"Warning",
+    "error_summary":"nvidia SAT: FAILED",
+    "history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
+  }
+]`
+	if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil { // component fixture: cpu and gpu Warning, memory OK, storage Critical
+		t.Fatal(err)
+	}
+
+	handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	body := rec.Body.String()
+	for _, needle := range []string{ // every fragment below must appear in the rendered dashboard
+		`Runtime Health`,
+		`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
+		`Export Directory`,
+		`Network`,
+		`NVIDIA/AMD Driver`,
+		`CUDA / ROCm`,
+		`Required Utilities`,
+		`Bee Services`,
+		`<td>CPU</td>`,
+		`<td>Memory</td>`,
+		`<td>Storage</td>`,
+		`<td>GPU</td>`,
+		`CUDA runtime is not ready for GPU SAT.`,
+		`Missing: nvidia-smi`,
+		`bee-nvidia=inactive`,
+		`cpu SAT: FAILED`,
+		`storage SAT: FAILED`,
+		`sat:nvidia`,
+	} {
+		if !strings.Contains(body, needle) {
+			t.Fatalf("dashboard missing %q: %s", needle, body)
+		}
+	}
+}
--- a/audit/internal/webui/task_page.go
+++ b/audit/internal/webui/task_page.go
@@ -0,0 +1,267 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+// handleTaskPage serves the HTML detail page for the task named by the "id"
+// path value and answers 404 when the queue has no such task.
+func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
+	found, exists := globalQueue.findByID(r.PathValue("id"))
+	if !exists {
+		http.NotFound(w, r)
+		return
+	}
+	// Render from a value copy of the task rather than the queue's pointer.
+	page := renderTaskDetailPage(h.opts, *found)
+
+	hdr := w.Header()
+	hdr.Set("Cache-Control", "no-store")
+	hdr.Set("Content-Type", "text/html; charset=utf-8")
+	_, _ = w.Write([]byte(page))
+}
+
+// handleAPITaskChartsIndex responds with a JSON array of {title, file}
+// entries, one for each task chart that renderTaskChartSVG reports as
+// renderable for the task's samples. Unknown tasks get a 404.
+func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
+	task, samples, _, _, ok := h.taskSamplesForRequest(r)
+	if !ok {
+		http.NotFound(w, r)
+		return
+	}
+	type taskChartIndexEntry struct {
+		Title string `json:"title"`
+		File  string `json:"file"`
+	}
+	specs := taskChartSpecsForSamples(samples)
+	// The timeline depends only on the task, so build it once and give every
+	// chart the same window instead of recomputing it per chart.
+	timeline := taskTimelineForTask(task)
+	// Keep the slice non-nil so an empty index is encoded as [] and not null.
+	entries := make([]taskChartIndexEntry, 0, len(specs))
+	for _, spec := range specs {
+		title, _, rendered := renderTaskChartSVG(spec.Path, samples, timeline)
+		if !rendered {
+			continue
+		}
+		entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
+	}
+	w.Header().Set("Cache-Control", "no-store")
+	w.Header().Set("Content-Type", "application/json; charset=utf-8")
+	// Best effort: the status line is already committed once encoding starts,
+	// so an encode/write failure cannot be reported to the client.
+	_ = json.NewEncoder(w).Encode(entries)
+}
+
+// handleAPITaskChartSVG serves one task chart as SVG. The chart file name is
+// whatever follows /api/tasks/<id>/chart/ in the request path. Unknown tasks
+// or file names get a 404; a chart without data, bytes or title gets a 503.
+func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
+	task, samples, _, _, found := h.taskSamplesForRequest(r)
+	if !found {
+		http.NotFound(w, r)
+		return
+	}
+
+	prefix := "/api/tasks/" + task.ID + "/chart/"
+	chartPath, known := taskChartPathFromFile(strings.TrimPrefix(r.URL.Path, prefix))
+	if !known {
+		http.NotFound(w, r)
+		return
+	}
+
+	title, svg, hasData := renderTaskChartSVG(chartPath, samples, taskTimelineForTask(task))
+	usable := hasData && len(svg) != 0 && strings.TrimSpace(title) != ""
+	if !usable {
+		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
+		return
+	}
+
+	hdr := w.Header()
+	hdr.Set("Content-Type", "image/svg+xml")
+	hdr.Set("Cache-Control", "no-store")
+	_, _ = w.Write(svg)
+}
+
+// renderTaskDetailPage builds the full HTML page for one task.
+//
+// The page always has a "Back to Tasks" link. It shows the saved report
+// fragment when loadTaskReportFragment returns one, and a small summary card
+// (title, status badge, error message) otherwise. Running tasks also get a
+// live-charts card. Running and pending tasks get a Cancel button, a live-log
+// card and the inline script that streams logs and refreshes the charts.
+//
+// NOTE(review): the task ID is HTML-escaped before being placed inside
+// JavaScript string literals (the onclick attribute and the <script> block).
+// That looks safe only while IDs never contain quotes, backslashes or
+// ampersands — confirm how IDs are generated.
+func renderTaskDetailPage(opts HandlerOptions, task Task) string {
+	heading := task.Name
+	if strings.TrimSpace(heading) == "" {
+		heading = task.ID
+	}
+	escapedID := html.EscapeString(task.ID)
+	running := task.Status == TaskRunning
+	live := running || task.Status == TaskPending
+
+	var page strings.Builder
+
+	// Toolbar: back link, optional cancel button, artifacts hint.
+	page.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
+	page.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
+	if live {
+		page.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('`)
+		page.WriteString(escapedID)
+		page.WriteString(`')">Cancel</button>`)
+	}
+	page.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
+	page.WriteString(`</div>`)
+
+	// Saved report when present, otherwise a fallback summary card.
+	report := loadTaskReportFragment(task)
+	if report == "" {
+		page.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
+		page.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(heading) + `</div>`)
+		page.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
+		if strings.TrimSpace(task.ErrMsg) != "" {
+			page.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
+		}
+		page.WriteString(`</div></div>`)
+	} else {
+		page.WriteString(report)
+	}
+
+	if running {
+		page.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
+		page.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
+		page.WriteString(`</div></div>`)
+	}
+
+	if live {
+		page.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
+		page.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
+		page.WriteString(`</div></div>`)
+		// The script text below is emitted verbatim; only the task ID is spliced in.
+		page.WriteString(`<script>
+function cancelTaskDetail(id) {
+  fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
+    var term = document.getElementById('task-live-log');
+    if (term) {
+      term.textContent += '\nCancel requested.\n';
+      term.scrollTop = term.scrollHeight;
+    }
+  });
+}
+function renderTaskLiveCharts(taskId, charts) {
+  const host = document.getElementById('task-live-charts');
+  if (!host) return;
+  if (!Array.isArray(charts) || charts.length === 0) {
+    host.innerHTML = 'Waiting for metric samples...';
+    return;
+  }
+  const seen = {};
+  charts.forEach(function(chart) {
+    seen[chart.file] = true;
+    let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
+    if (img) {
+      const card = img.closest('.card');
+      if (card) {
+        const title = card.querySelector('.card-head');
+        if (title) title.textContent = chart.title;
+      }
+      return;
+    }
+    const card = document.createElement('div');
+    card.className = 'card';
+    card.style.margin = '0';
+    card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
+    card.querySelector('.card-head').textContent = chart.title;
+    const body = card.querySelector('.card-body');
+    img = document.createElement('img');
+    img.setAttribute('data-task-chart', '1');
+    img.setAttribute('data-chart-file', chart.file);
+    img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
+    img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
+    img.style.width = '100%';
+    img.style.display = 'block';
+    img.style.borderRadius = '6px';
+    img.alt = chart.title;
+    body.appendChild(img);
+    host.appendChild(card);
+  });
+  Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
+    const file = img.getAttribute('data-chart-file') || '';
+    if (seen[file]) return;
+    const card = img.closest('.card');
+    if (card) card.remove();
+  });
+}
+function loadTaskLiveCharts(taskId) {
+  fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
+    renderTaskLiveCharts(taskId, charts);
+  }).catch(function(){
+    const host = document.getElementById('task-live-charts');
+    if (host) host.innerHTML = 'Task charts are unavailable.';
+  });
+}
+function refreshTaskLiveCharts() {
+  document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
+    const base = img.dataset.baseSrc;
+    if (!base) return;
+    img.src = base + '?t=' + Date.now();
+  });
+}
+var _taskDetailES = new EventSource('/api/tasks/` + escapedID + `/stream');
+var _taskDetailTerm = document.getElementById('task-live-log');
+var _taskChartTimer = null;
+var _taskChartsFrozen = false;
+_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
+_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
+_taskDetailES.addEventListener('done', function(e){
+  if (_taskChartTimer) clearInterval(_taskChartTimer);
+  _taskDetailES.close();
+  _taskDetailES = null;
+  _taskChartsFrozen = true;
+  _taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
+  _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
+  refreshTaskLiveCharts();
+});
+_taskDetailES.onerror = function(){
+  if (_taskChartTimer) clearInterval(_taskChartTimer);
+  if (_taskDetailES) {
+    _taskDetailES.close();
+    _taskDetailES = null;
+  }
+};
+loadTaskLiveCharts('` + escapedID + `');
+_taskChartTimer = setInterval(function(){
+  if (_taskChartsFrozen) return;
+  loadTaskLiveCharts('` + escapedID + `');
+  refreshTaskLiveCharts();
+}, 2000);
+</script>`)
+	}
+
+	head := layoutHead(opts.Title + " — " + heading)
+	nav := layoutNav("tasks", opts.BuildLabel)
+	topbar := `<div class="main"><div class="topbar"><h1>` + html.EscapeString(heading) + `</h1></div><div class="content">`
+	return head + nav + topbar + page.String() + `</div></div></body></html>`
+}
+
+// loadTaskReportFragment returns the contents of the task's saved report
+// file as a string. It returns "" when the task has no report path, the file
+// cannot be read, or the file is empty.
+func loadTaskReportFragment(task Task) string {
+	if strings.TrimSpace(task.ReportHTMLPath) != "" {
+		if raw, err := os.ReadFile(task.ReportHTMLPath); err == nil && len(raw) > 0 {
+			return string(raw)
+		}
+	}
+	return ""
+}
+
+// taskArtifactDownloadLink returns the /export/file download URL for the
+// artifact at absPath, or "" when absPath is blank.
+//
+// The path is query-escaped so characters such as '&', '#', '+', '%' and
+// spaces stay part of the "path" parameter instead of corrupting the URL.
+// The task parameter is not used by this function; it is kept so the
+// signature stays unchanged for callers.
+func taskArtifactDownloadLink(task Task, absPath string) string {
+	if strings.TrimSpace(absPath) == "" {
+		return ""
+	}
+	return fmt.Sprintf(`/export/file?path=%s`, url.QueryEscape(absPath))
+}
+
+// taskSamplesForRequest resolves the task named by the request's "id" path
+// value and loads the metric samples for that task's time window.
+//
+// It returns a copy of the task, the samples, the window start and end, and
+// whether the task exists. A failure to load samples is not an error for the
+// caller: the task is still reported as found, with nil samples.
+func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
+	queued, exists := globalQueue.findByID(r.PathValue("id"))
+	if !exists {
+		return Task{}, nil, time.Time{}, time.Time{}, false
+	}
+	snapshot := *queued
+	from, to := taskTimeWindow(&snapshot)
+	loaded, err := loadTaskMetricSamples(from, to)
+	if err != nil {
+		loaded = nil
+	}
+	return snapshot, loaded, from, to, true
+}
+
+// taskTimelineForTask returns a one-segment chart timeline that marks the
+// task's whole time window as active.
+func taskTimelineForTask(task Task) []chartTimelineSegment {
+	from, to := taskTimeWindow(&task)
+	window := chartTimelineSegment{Start: from, End: to, Active: true}
+	return []chartTimelineSegment{window}
+}
+
+// taskChartPathFromFile maps a chart file name from a task chart URL to the
+// chart path passed to renderTaskChartSVG.
+//
+// Names listed in taskDashboardChartSpecs map to their configured path.
+// Otherwise only "gpu-<n>-overview.svg" with a non-empty decimal <n> is
+// accepted and maps to "gpu/<n>-overview". Everything else returns
+// ("", false). This includes names such as "gpu-overview.svg", where the
+// prefix and suffix overlap, and non-numeric ids.
+func taskChartPathFromFile(file string) (string, bool) {
+	file = strings.TrimSpace(file)
+	for _, spec := range taskDashboardChartSpecs {
+		if spec.File == file {
+			return spec.Path, true
+		}
+	}
+	// Cut the prefix first and the suffix second so the two cannot overlap.
+	id, ok := strings.CutPrefix(file, "gpu-")
+	if !ok {
+		return "", false
+	}
+	id, ok = strings.CutSuffix(id, "-overview.svg")
+	if !ok || id == "" {
+		return "", false
+	}
+	// Per-GPU specs are generated from integer indices, so reject anything else.
+	for i := 0; i < len(id); i++ {
+		if id[i] < '0' || id[i] > '9' {
+			return "", false
+		}
+	}
+	return "gpu/" + id + "-overview", true
+}
--- a/audit/internal/webui/task_report.go
+++ b/audit/internal/webui/task_report.go
@@ -0,0 +1,343 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+// taskReportMetricsDBPath is the metrics database read when building task
+// reports. It defaults to metricsDBPath and is a variable so tests can point
+// it at a temporary database.
+var taskReportMetricsDBPath = metricsDBPath
+
+// taskReport is the JSON document written to a task's report.json. It
+// captures the task's identity, status and timing, the base name of its log
+// file, the charts rendered for it, and when the report was generated.
+type taskReport struct {
+	ID          string            `json:"id"`
+	Name        string            `json:"name"`
+	Target      string            `json:"target"`
+	Status      string            `json:"status"`
+	CreatedAt   time.Time         `json:"created_at"`
+	StartedAt   *time.Time        `json:"started_at,omitempty"`
+	DoneAt      *time.Time        `json:"done_at,omitempty"`
+	DurationSec int               `json:"duration_sec,omitempty"`
+	Error       string            `json:"error,omitempty"`
+	LogFile     string            `json:"log_file,omitempty"`
+	Charts      []taskReportChart `json:"charts,omitempty"`
+	GeneratedAt time.Time         `json:"generated_at"`
+}
+
+// taskReportChart records one chart included in a task report: its display
+// title and the SVG file name inside the task's artifacts directory.
+type taskReportChart struct {
+	Title string `json:"title"`
+	File  string `json:"file"`
+}
+
+// taskChartSpec pairs a chart path (the identifier passed to the chart
+// renderer) with the file name the rendered SVG is saved under.
+type taskChartSpec struct {
+	Path string
+	File string
+}
+
+// taskDashboardChartSpecs lists the fixed server-wide and all-GPU charts that
+// are attempted for every task report. Per-GPU overview charts are appended
+// separately by taskChartSpecsForSamples.
+var taskDashboardChartSpecs = []taskChartSpec{
+	{Path: "server-load", File: "server-load.svg"},
+	{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
+	{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
+	{Path: "server-power", File: "server-power.svg"},
+	{Path: "server-fans", File: "server-fans.svg"},
+	{Path: "gpu-all-load", File: "gpu-all-load.svg"},
+	{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
+	{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
+	{Path: "gpu-all-power", File: "gpu-all-power.svg"},
+	{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
+}
+
+// taskChartSpecsForSamples returns the chart specs to render for samples: the
+// fixed dashboard charts followed by one "gpu/<index>-overview" chart for
+// every GPU index that appears in the samples, in ascending index order.
+func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
+	// Collect the GPU indices once; the helper scans every sample and sorts.
+	gpuIndices := taskGPUIndices(samples)
+	specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(gpuIndices))
+	specs = append(specs, taskDashboardChartSpecs...)
+	for _, idx := range gpuIndices {
+		specs = append(specs, taskChartSpec{
+			Path: fmt.Sprintf("gpu/%d-overview", idx),
+			File: fmt.Sprintf("gpu-%d-overview.svg", idx),
+		})
+	}
+	return specs
+}
+
+// writeTaskReportArtifacts renders the report for t into its artifacts
+// directory: chart SVGs, report.json and the report.html fragment.
+//
+// It is a no-op (nil error) when t is nil or has no artifacts directory.
+// An error is returned only when the directory cannot be created or one of
+// the two report files cannot be written.
+func writeTaskReportArtifacts(t *Task) error {
+	if t == nil {
+		return nil
+	}
+	ensureTaskReportPaths(t)
+	if strings.TrimSpace(t.ArtifactsDir) == "" {
+		return nil
+	}
+	if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
+		return err
+	}
+
+	start, end := taskTimeWindow(t)
+	// The load error is ignored on purpose of this call site: with no samples
+	// writeTaskCharts returns nothing and the report simply has no charts.
+	samples, _ := loadTaskMetricSamples(start, end)
+	charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
+
+	// An unreadable log leaves logText empty rather than failing the report.
+	logText := ""
+	if data, err := os.ReadFile(t.LogPath); err == nil {
+		logText = string(data)
+	}
+
+	report := taskReport{
+		ID:          t.ID,
+		Name:        t.Name,
+		Target:      t.Target,
+		Status:      t.Status,
+		CreatedAt:   t.CreatedAt,
+		StartedAt:   t.StartedAt,
+		DoneAt:      t.DoneAt,
+		DurationSec: taskElapsedSec(t, reportDoneTime(t)),
+		Error:       t.ErrMsg,
+		LogFile:     filepath.Base(t.LogPath),
+		Charts:      charts,
+		GeneratedAt: time.Now().UTC(),
+	}
+	if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
+		return err
+	}
+	return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
+}
+
+// reportDoneTime returns the task's completion time, falling back to the
+// current time when t is nil or DoneAt is unset or zero.
+func reportDoneTime(t *Task) time.Time {
+	if t == nil || t.DoneAt == nil || t.DoneAt.IsZero() {
+		return time.Now()
+	}
+	return *t.DoneAt
+}
+
+// taskTimeWindow returns the UTC [start, end] interval covered by t.
+//
+// start is StartedAt when set, otherwise CreatedAt. end is DoneAt when set,
+// otherwise the current time. end is never earlier than start. A nil task
+// yields the current time for both bounds.
+func taskTimeWindow(t *Task) (time.Time, time.Time) {
+	if t == nil {
+		now := time.Now().UTC()
+		return now, now
+	}
+
+	begin := t.CreatedAt.UTC()
+	if started := t.StartedAt; started != nil && !started.IsZero() {
+		begin = started.UTC()
+	}
+
+	var finish time.Time
+	if done := t.DoneAt; done != nil && !done.IsZero() {
+		finish = done.UTC()
+	} else {
+		finish = time.Now().UTC()
+	}
+
+	// Clamp so the window never has a negative length.
+	if finish.Before(begin) {
+		finish = begin
+	}
+	return begin, finish
+}
+
+// loadTaskMetricSamples opens the metrics database at taskReportMetricsDBPath
+// and returns the samples it holds between start and end. The database is
+// closed again before returning.
+func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
+	store, openErr := openMetricsDB(taskReportMetricsDBPath)
+	if openErr != nil {
+		return nil, openErr
+	}
+	defer store.Close()
+
+	return store.LoadBetween(start, end)
+}
+
+// writeTaskCharts renders every applicable chart for samples into dir and
+// returns the list of charts written plus their SVG markup keyed by file
+// name. Charts that fail to render, render empty, or cannot be written to
+// disk are skipped. With no samples both results are nil.
+func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
+	if len(samples) == 0 {
+		return nil, nil
+	}
+
+	window := []chartTimelineSegment{{Start: start, End: end, Active: true}}
+	svgByFile := make(map[string]string)
+	var written []taskReportChart
+
+	for _, spec := range taskChartSpecsForSamples(samples) {
+		title, svg, ok := renderTaskChartSVG(spec.Path, samples, window)
+		if !ok || len(svg) == 0 {
+			continue
+		}
+		if err := os.WriteFile(filepath.Join(dir, spec.File), svg, 0644); err != nil {
+			continue
+		}
+		svgByFile[spec.File] = string(svg)
+		written = append(written, taskReportChart{Title: title, File: spec.File})
+	}
+	return written, svgByFile
+}
+
+// renderTaskChartSVG renders the chart identified by path from samples and
+// returns its title, the SVG bytes, and whether rendering succeeded.
+//
+// Paths that parseGPUChartPath recognises with the "overview" sub-chart are
+// rendered as a per-GPU overview; every other path goes through
+// chartDataFromSamples and renderMetricChartSVG. Any failure (unknown path,
+// no data, render error) yields ("", nil, false).
+func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
+	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
+		buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
+		if err != nil || !hasData {
+			return "", nil, false
+		}
+		return gpuDisplayLabel(idx) + " Overview", buf, true
+	}
+	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	if !ok {
+		return "", nil, false
+	}
+	buf, err := renderMetricChartSVG(
+		title,
+		labels,
+		sampleTimes(samples),
+		datasets,
+		names,
+		yMin,
+		yMax,
+		chartCanvasHeightForPath(path, len(names)),
+		timeline,
+	)
+	if err != nil {
+		return "", nil, false
+	}
+	return title, buf, true
+}
+
+// taskGPUIndices returns the distinct GPU indices that occur anywhere in
+// samples, sorted in ascending order. The result is nil when no sample
+// carries GPU data.
+func taskGPUIndices(samples []platform.LiveMetricSample) []int {
+	known := make(map[int]struct{})
+	var indices []int
+	for i := range samples {
+		for _, gpu := range samples[i].GPUs {
+			if _, dup := known[gpu.GPUIndex]; dup {
+				continue
+			}
+			known[gpu.GPUIndex] = struct{}{}
+			indices = append(indices, gpu.GPUIndex)
+		}
+	}
+	sort.Ints(indices)
+	return indices
+}
+
+// writeJSONFile encodes v as JSON indented by two spaces and writes it to
+// path with mode 0644. It returns the first encoding or write error.
+func writeJSONFile(path string, v any) error {
+	encoded, err := json.MarshalIndent(v, "", "  ")
+	if err == nil {
+		err = os.WriteFile(path, encoded, 0644)
+	}
+	return err
+}
+
+// renderTaskReportFragment builds the HTML fragment for a task report: a
+// summary card, an optional benchmark results card, one card per chart (or an
+// informational alert when there are none), and a card with the task log.
+//
+// charts maps chart file names to SVG markup, which is embedded as-is without
+// escaping. Text taken from the report and the log is HTML-escaped.
+func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
+	var b strings.Builder
+	// Summary card: task name/target on the left, status and error on the right.
+	b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
+	b.WriteString(`<div class="grid2">`)
+	b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
+	b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
+	b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
+	if strings.TrimSpace(report.Error) != "" {
+		b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
+	}
+	b.WriteString(`</div></div>`)
+	b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
+	// Started falls back to the creation time; Finished has no fallback.
+	b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
+	b.WriteString(`</div></div></div>`)
+	if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
+		b.WriteString(benchmarkCard)
+	}
+
+	if len(report.Charts) > 0 {
+		for _, chart := range report.Charts {
+			b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
+			// Inline SVG markup; a missing map entry yields an empty card body.
+			b.WriteString(charts[chart.File])
+			b.WriteString(`</div></div>`)
+		}
+	} else {
+		b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
+	}
+
+	b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
+	b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
+	b.WriteString(`</div></div>`)
+	return b.String()
+}
+
+// renderTaskBenchmarkResultsCard returns the benchmark results card for an
+// "nvidia-benchmark" task, using the result file located via the archive path
+// found in logText. It returns "" for other targets, when no result path can
+// be derived from the log, or when no runs could be loaded from it.
+func renderTaskBenchmarkResultsCard(target, logText string) string {
+	const (
+		cardTitle    = "Benchmark Results"
+		cardSubtitle = "Composite score for this benchmark task."
+		cardEmpty    = "No benchmark results were saved for this task."
+	)
+	if strings.TrimSpace(target) != "nvidia-benchmark" {
+		return ""
+	}
+	resultFile := taskBenchmarkResultPath(logText)
+	if strings.TrimSpace(resultFile) == "" {
+		return ""
+	}
+	cols, runs := loadBenchmarkHistoryFromPaths([]string{resultFile})
+	if len(runs) == 0 {
+		return ""
+	}
+	return renderBenchmarkResultsCardFromRuns(cardTitle, cardSubtitle, cardEmpty, cols, runs)
+}
+
+// taskBenchmarkResultPath derives the result.json location from the archive
+// path reported in logText: the ".tar.gz" suffix is dropped and "result.json"
+// is joined onto the remaining directory. It returns "" when the log names no
+// archive or the archive path lacks the ".tar.gz" suffix.
+func taskBenchmarkResultPath(logText string) string {
+	archive := taskArchivePathFromLog(logText)
+	if archive == "" {
+		return ""
+	}
+	runDir, hadSuffix := strings.CutSuffix(archive, ".tar.gz")
+	if !hadSuffix {
+		return ""
+	}
+	return filepath.Join(runDir, "result.json")
+}
+
+// taskArchivePathFromLog scans logText from the last line upwards for a line
+// of the form "Archive: <path>" (optionally "Archive: Archive written to
+// <path>") and returns the first such path that ends in ".tar.gz". It
+// returns "" when no line qualifies.
+func taskArchivePathFromLog(logText string) string {
+	const (
+		marker  = "Archive:"
+		verbose = "Archive written to "
+	)
+	lines := strings.Split(logText, "\n")
+	for i := len(lines) - 1; i >= 0; i-- {
+		rest, found := strings.CutPrefix(strings.TrimSpace(lines[i]), marker)
+		if !found {
+			continue
+		}
+		candidate := strings.TrimSpace(rest)
+		if tail, ok := strings.CutPrefix(candidate, verbose); ok {
+			candidate = strings.TrimSpace(tail)
+		}
+		if strings.HasSuffix(candidate, ".tar.gz") {
+			return candidate
+		}
+	}
+	return ""
+}
+
+// renderTaskStatusBadge returns a <span> badge for status. Running and done
+// tasks get "badge-ok", failed tasks "badge-err", and everything else
+// (pending, cancelled, unrecognised) "badge-unknown". The label is the
+// trimmed, HTML-escaped status, or "unknown" when it is blank.
+func renderTaskStatusBadge(status string) string {
+	var class string
+	switch status {
+	case TaskRunning, TaskDone:
+		class = "badge-ok"
+	case TaskFailed:
+		class = "badge-err"
+	default:
+		class = "badge-unknown"
+	}
+
+	text := strings.TrimSpace(status)
+	if text == "" {
+		text = "unknown"
+	}
+	return `<span class="badge ` + class + `">` + html.EscapeString(text) + `</span>`
+}
+
+// formatTaskTime formats ts in local time as "2006-01-02 15:04:05". When ts
+// is nil or zero it formats fallback instead, and when fallback is zero too
+// it returns "n/a".
+func formatTaskTime(ts *time.Time, fallback time.Time) string {
+	const layout = "2006-01-02 15:04:05"
+	switch {
+	case ts != nil && !ts.IsZero():
+		return ts.Local().Format(layout)
+	case !fallback.IsZero():
+		return fallback.Local().Format(layout)
+	default:
+		return "n/a"
+	}
+}
+
+// formatTaskDuration renders a duration given in whole seconds as "Ns",
+// "Nm SSs" or "Nh MMm SSs" depending on its size. Zero and negative values
+// are reported as "n/a".
+func formatTaskDuration(sec int) string {
+	hours, minutes, seconds := sec/3600, (sec%3600)/60, sec%60
+	switch {
+	case sec <= 0:
+		return "n/a"
+	case sec < 60:
+		return fmt.Sprintf("%ds", sec)
+	case sec < 3600:
+		return fmt.Sprintf("%dm %02ds", minutes, seconds)
+	default:
+		return fmt.Sprintf("%dh %02dm %02ds", hours, minutes, seconds)
+	}
+}
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -39,6 +39,7 @@ var taskNames = map[string]string{
 	"nvidia-interconnect":    "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
 	"nvidia-bandwidth":       "NVIDIA Bandwidth Test (NVBandwidth)",
 	"nvidia-stress":          "NVIDIA GPU Stress",
+	"hpl":                    "LINPACK (HPL)",
 	"memory":                 "Memory SAT",
 	"storage":                "Storage SAT",
 	"cpu":                    "CPU SAT",
@@ -92,17 +93,20 @@ func taskDisplayName(target, profile, loader string) string {

 // Task represents one unit of work in the queue.
 type Task struct {
-	ID         string     `json:"id"`
-	Name       string     `json:"name"`
-	Target     string     `json:"target"`
-	Priority   int        `json:"priority"`
-	Status     string     `json:"status"`
-	CreatedAt  time.Time  `json:"created_at"`
-	StartedAt  *time.Time `json:"started_at,omitempty"`
-	DoneAt     *time.Time `json:"done_at,omitempty"`
-	ElapsedSec int        `json:"elapsed_sec,omitempty"`
-	ErrMsg     string     `json:"error,omitempty"`
-	LogPath    string     `json:"log_path,omitempty"`
+	ID             string     `json:"id"`
+	Name           string     `json:"name"`
+	Target         string     `json:"target"`
+	Priority       int        `json:"priority"`
+	Status         string     `json:"status"`
+	CreatedAt      time.Time  `json:"created_at"`
+	StartedAt      *time.Time `json:"started_at,omitempty"`
+	DoneAt         *time.Time `json:"done_at,omitempty"`
+	ElapsedSec     int        `json:"elapsed_sec,omitempty"`
+	ErrMsg         string     `json:"error,omitempty"`
+	LogPath        string     `json:"log_path,omitempty"`
+	ArtifactsDir   string     `json:"artifacts_dir,omitempty"`
+	ReportJSONPath string     `json:"report_json_path,omitempty"`
+	ReportHTMLPath string     `json:"report_html_path,omitempty"`

 	// runtime fields (not serialised)
 	job    *jobState
@@ -112,31 +116,36 @@ type Task struct {
 // taskParams holds optional parameters parsed from the run request.
 type taskParams struct {
 	Duration           int      `json:"duration,omitempty"`
-	DiagLevel          int      `json:"diag_level,omitempty"`
+	StressMode         bool     `json:"stress_mode,omitempty"`
 	GPUIndices         []int    `json:"gpu_indices,omitempty"`
 	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
 	SizeMB             int      `json:"size_mb,omitempty"`
+	Passes             int      `json:"passes,omitempty"`
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
+	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
 	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
 }

 type persistedTask struct {
-	ID        string     `json:"id"`
-	Name      string     `json:"name"`
-	Target    string     `json:"target"`
-	Priority  int        `json:"priority"`
-	Status    string     `json:"status"`
-	CreatedAt time.Time  `json:"created_at"`
-	StartedAt *time.Time `json:"started_at,omitempty"`
-	DoneAt    *time.Time `json:"done_at,omitempty"`
-	ErrMsg    string     `json:"error,omitempty"`
-	LogPath   string     `json:"log_path,omitempty"`
-	Params    taskParams `json:"params,omitempty"`
+	ID             string     `json:"id"`
+	Name           string     `json:"name"`
+	Target         string     `json:"target"`
+	Priority       int        `json:"priority"`
+	Status         string     `json:"status"`
+	CreatedAt      time.Time  `json:"created_at"`
+	StartedAt      *time.Time `json:"started_at,omitempty"`
+	DoneAt         *time.Time `json:"done_at,omitempty"`
+	ErrMsg         string     `json:"error,omitempty"`
+	LogPath        string     `json:"log_path,omitempty"`
+	ArtifactsDir   string     `json:"artifacts_dir,omitempty"`
+	ReportJSONPath string     `json:"report_json_path,omitempty"`
+	ReportHTMLPath string     `json:"report_html_path,omitempty"`
+	Params         taskParams `json:"params,omitempty"`
 }

 type burnPreset struct {
@@ -208,11 +217,11 @@ var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
 const maxTaskHistory = 50

 var (
-	runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-		return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
+	runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
+		return a.RunMemoryAcceptancePackCtx(ctx, baseDir, sizeMB, passes, logFunc)
 	}
-	runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-		return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
+	runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
+		return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
 	}
 	runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
 		return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
@@ -252,6 +261,7 @@ func (q *taskQueue) enqueue(t *Task) {
 	q.prune()
 	q.persistLocked()
 	q.mu.Unlock()
+	taskSerialEvent(t, "queued")
 	select {
 	case q.trigger <- struct{}{}:
 	default:
@@ -416,44 +426,30 @@ func (q *taskQueue) worker() {
 			setCPUGovernor("performance")
 			defer setCPUGovernor("powersave")

-			// Drain all pending tasks and start them in parallel.
-			q.mu.Lock()
-			var batch []*Task
 			for {
+				q.mu.Lock()
 				t := q.nextPending()
 				if t == nil {
-					break
+					q.prune()
+					q.persistLocked()
+					q.mu.Unlock()
+					return
 				}
 				now := time.Now()
 				t.Status = TaskRunning
 				t.StartedAt = &now
 				t.DoneAt = nil
 				t.ErrMsg = ""
-				j := newTaskJobState(t.LogPath)
+				j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
 				t.job = j
-				batch = append(batch, t)
-			}
-			if len(batch) > 0 {
 				q.persistLocked()
-			}
-			q.mu.Unlock()
+				q.mu.Unlock()

-			var wg sync.WaitGroup
-			for _, t := range batch {
-				t := t
-				j := t.job
 				taskCtx, taskCancel := context.WithCancel(context.Background())
 				j.cancel = taskCancel
-				wg.Add(1)
-				goRecoverOnce("task "+t.Target, func() {
-					defer wg.Done()
-					defer taskCancel()
-					q.executeTask(t, j, taskCtx)
-				})
-			}
-			wg.Wait()
+				q.executeTask(t, j, taskCtx)
+				taskCancel()

-			if len(batch) > 0 {
 				q.mu.Lock()
 				q.prune()
 				q.persistLocked()
@@ -496,8 +492,6 @@ func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {

 func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
 	q.mu.Lock()
-	defer q.mu.Unlock()
-
 	now := time.Now()
 	t.DoneAt = &now
 	if t.Status == TaskRunning {
@@ -509,7 +503,18 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
 			t.ErrMsg = ""
 		}
 	}
+	q.finalizeTaskArtifactPathsLocked(t)
 	q.persistLocked()
+	q.mu.Unlock()
+
+	if err := writeTaskReportArtifacts(t); err != nil {
+		appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
+	}
+	if t.ErrMsg != "" {
+		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
+		return
+	}
+	taskSerialEvent(t, "finished with status="+t.Status)
 }

 // setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
@@ -549,7 +554,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		diagLevel := t.params.DiagLevel
+		diagLevel := 2
+		if t.params.StressMode {
+			diagLevel = 3
+		}
 		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
 			result, e := a.RunNvidiaAcceptancePackWithOptions(
 				ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -583,6 +591,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			GPUIndices:        t.params.GPUIndices,
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
 			RunNCCL:           t.params.RunNCCL,
+			ParallelGPUs:      t.params.ParallelGPUs,
 		}, j.append)
 	case "nvidia-compute":
 		if a == nil {
@@ -654,13 +663,17 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
+		sizeMB, passes := 256, 1
+		if t.params.StressMode {
+			sizeMB, passes = 1024, 3
+		}
+		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
 	case "storage":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
+		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
 	case "cpu":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -671,7 +684,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		if dur <= 0 {
-			dur = 60
+			if t.params.StressMode {
+				dur = 1800
+			} else {
+				dur = 60
+			}
 		}
 		j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
 		archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
@@ -723,6 +740,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
+	case "hpl":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		opts := platform.HPLOptions{
+			MemFraction: 0.80,
+			NB:          256,
+		}
+		archive, err = func() (string, error) {
+			path, _, runErr := a.RunHPL(ctx, "", opts, j.append)
+			return path, runErr
+		}()
 	case "platform-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -848,6 +878,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
 		now := time.Now()
 		t.DoneAt = &now
 		globalQueue.persistLocked()
+		taskSerialEvent(t, "finished with status="+t.Status)
 		writeJSON(w, map[string]string{"status": "cancelled"})
 	case TaskRunning:
 		if t.job != nil {
@@ -857,6 +888,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
 		now := time.Now()
 		t.DoneAt = &now
 		globalQueue.persistLocked()
+		taskSerialEvent(t, "finished with status="+t.Status)
 		writeJSON(w, map[string]string{"status": "cancelled"})
 	default:
 		writeError(w, http.StatusConflict, "task is not running or pending")
@@ -897,6 +929,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 		case TaskPending:
 			t.Status = TaskCancelled
 			t.DoneAt = &now
+			taskSerialEvent(t, "finished with status="+t.Status)
 			n++
 		case TaskRunning:
 			if t.job != nil {
@@ -904,6 +937,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
+			taskSerialEvent(t, "finished with status="+t.Status)
 			n++
 		}
 	}
@@ -922,6 +956,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 		case TaskPending:
 			t.Status = TaskCancelled
 			t.DoneAt = &now
+			taskSerialEvent(t, "finished with status="+t.Status)
 			cancelled++
 		case TaskRunning:
 			if t.job != nil {
@@ -929,6 +964,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
+			taskSerialEvent(t, "finished with status="+t.Status)
 			cancelled++
 		}
 	}
@@ -992,10 +1028,10 @@ func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
 }

 func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
-	if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
+	if q.logsDir == "" || t.ID == "" {
 		return
 	}
-	t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
+	q.ensureTaskArtifactPathsLocked(t)
 }

 func (q *taskQueue) loadLocked() {
@@ -1012,17 +1048,20 @@ func (q *taskQueue) loadLocked() {
 	}
 	for _, pt := range persisted {
 		t := &Task{
-			ID:        pt.ID,
-			Name:      pt.Name,
-			Target:    pt.Target,
-			Priority:  pt.Priority,
-			Status:    pt.Status,
-			CreatedAt: pt.CreatedAt,
-			StartedAt: pt.StartedAt,
-			DoneAt:    pt.DoneAt,
-			ErrMsg:    pt.ErrMsg,
-			LogPath:   pt.LogPath,
-			params:    pt.Params,
+			ID:             pt.ID,
+			Name:           pt.Name,
+			Target:         pt.Target,
+			Priority:       pt.Priority,
+			Status:         pt.Status,
+			CreatedAt:      pt.CreatedAt,
+			StartedAt:      pt.StartedAt,
+			DoneAt:         pt.DoneAt,
+			ErrMsg:         pt.ErrMsg,
+			LogPath:        pt.LogPath,
+			ArtifactsDir:   pt.ArtifactsDir,
+			ReportJSONPath: pt.ReportJSONPath,
+			ReportHTMLPath: pt.ReportHTMLPath,
+			params:         pt.Params,
 		}
 		q.assignTaskLogPathLocked(t)
 		if t.Status == TaskRunning {
@@ -1053,17 +1092,20 @@ func (q *taskQueue) persistLocked() {
 	state := make([]persistedTask, 0, len(q.tasks))
 	for _, t := range q.tasks {
 		state = append(state, persistedTask{
-			ID:        t.ID,
-			Name:      t.Name,
-			Target:    t.Target,
-			Priority:  t.Priority,
-			Status:    t.Status,
-			CreatedAt: t.CreatedAt,
-			StartedAt: t.StartedAt,
-			DoneAt:    t.DoneAt,
-			ErrMsg:    t.ErrMsg,
-			LogPath:   t.LogPath,
-			Params:    t.params,
+			ID:             t.ID,
+			Name:           t.Name,
+			Target:         t.Target,
+			Priority:       t.Priority,
+			Status:         t.Status,
+			CreatedAt:      t.CreatedAt,
+			StartedAt:      t.StartedAt,
+			DoneAt:         t.DoneAt,
+			ErrMsg:         t.ErrMsg,
+			LogPath:        t.LogPath,
+			ArtifactsDir:   t.ArtifactsDir,
+			ReportJSONPath: t.ReportJSONPath,
+			ReportHTMLPath: t.ReportHTMLPath,
+			Params:         t.params,
 		})
 	}
 	data, err := json.MarshalIndent(state, "", "  ")
@@ -1094,3 +1136,113 @@ func taskElapsedSec(t *Task, now time.Time) int {
 	}
 	return int(end.Sub(start).Round(time.Second) / time.Second)
 }
+
+// taskFolderStatus normalises status (lower-cased, trimmed) for use in an
+// artifacts folder name. Running, done, failed and cancelled are returned
+// as-is; any other value maps to TaskPending.
+func taskFolderStatus(status string) string {
+	normalized := strings.TrimSpace(strings.ToLower(status))
+	if normalized == TaskRunning || normalized == TaskDone ||
+		normalized == TaskFailed || normalized == TaskCancelled {
+		return normalized
+	}
+	return TaskPending
+}
+
+// sanitizeTaskFolderPart turns s into a folder-name-safe slug: it is
+// lower-cased, every run of characters outside [a-z0-9] collapses into a
+// single '-', and leading/trailing dashes are dropped. Input that contains no
+// usable characters yields "task".
+func sanitizeTaskFolderPart(s string) string {
+	lowered := strings.TrimSpace(strings.ToLower(s))
+	// Splitting on every non-alphanumeric rune discards empty pieces, so
+	// joining the remaining words with '-' collapses and trims separators.
+	words := strings.FieldsFunc(lowered, func(r rune) bool {
+		return !((r >= 'a' && r <= 'z') || (r >= '0' && r <= '9'))
+	})
+	if len(words) == 0 {
+		return "task"
+	}
+	return strings.Join(words, "-")
+}
+
+// taskArtifactsDir returns the artifacts directory for t under root, named
+// "<number-prefix>_<sanitised-name>_<status>". It returns "" when root is
+// blank or t is nil.
+func taskArtifactsDir(root string, t *Task, status string) string {
+	if t == nil || strings.TrimSpace(root) == "" {
+		return ""
+	}
+	folder := taskFolderNumberPrefix(t.ID) + "_" + sanitizeTaskFolderPart(t.Name) + "_" + taskFolderStatus(status)
+	return filepath.Join(root, folder)
+}
+
+// taskFolderNumberPrefix returns the folder-name prefix for taskID.
+//
+// IDs of the form "TASK-NNN" (exactly three ASCII digits after the prefix,
+// surrounding whitespace ignored) yield "NNN". Any other ID falls back to its
+// sanitised form; sanitizeTaskFolderPart never returns an empty string, so no
+// further default is needed.
+func taskFolderNumberPrefix(taskID string) string {
+	taskID = strings.TrimSpace(taskID)
+	if rest, ok := strings.CutPrefix(taskID, "TASK-"); ok {
+		num := strings.TrimSpace(rest)
+		if len(num) == 3 && isASCIIDigits(num) {
+			return num
+		}
+	}
+	return sanitizeTaskFolderPart(taskID)
+}
+
+// isASCIIDigits reports whether every byte of s is an ASCII digit.
+func isASCIIDigits(s string) bool {
+	for i := 0; i < len(s); i++ {
+		if s[i] < '0' || s[i] > '9' {
+			return false
+		}
+	}
+	return true
+}
+
+// ensureTaskReportPaths points the task's report paths into its artifacts
+// directory. ReportJSONPath and ReportHTMLPath are always reset; LogPath is
+// only reset when it is empty or already named "task.log", so a log path with
+// a different file name is left untouched. It does nothing when t is nil or
+// has no artifacts directory.
+func ensureTaskReportPaths(t *Task) {
+	if t == nil {
+		return
+	}
+	dir := t.ArtifactsDir
+	if strings.TrimSpace(dir) == "" {
+		return
+	}
+	if current := t.LogPath; current == "" || filepath.Base(current) == "task.log" {
+		t.LogPath = filepath.Join(dir, "task.log")
+	}
+	t.ReportHTMLPath = filepath.Join(dir, "report.html")
+	t.ReportJSONPath = filepath.Join(dir, "report.json")
+}
+
+// ensureTaskArtifactPathsLocked makes sure t has an artifacts directory under
+// q.logsDir (derived from its current status when none is set), creates that
+// directory on a best-effort basis, and refreshes the report paths. It does
+// nothing when t is nil or when the queue's logs directory or the task ID is
+// blank.
+func (q *taskQueue) ensureTaskArtifactPathsLocked(t *Task) {
+	if t == nil {
+		return
+	}
+	if strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
+		return
+	}
+	if strings.TrimSpace(t.ArtifactsDir) == "" {
+		t.ArtifactsDir = taskArtifactsDir(q.logsDir, t, t.Status)
+	}
+	if dir := t.ArtifactsDir; dir != "" {
+		_ = os.MkdirAll(dir, 0755)
+	}
+	ensureTaskReportPaths(t)
+}
+
+// finalizeTaskArtifactPathsLocked renames the task's artifacts directory so
+// its name reflects the task's current status, then refreshes the report
+// paths. It does nothing when t is nil or when the queue's logs directory or
+// the task ID is blank.
+//
+// The task is only repointed at the new directory when the rename actually
+// succeeded. If the destination already exists, cannot be inspected, or the
+// rename fails, the task keeps its current directory so that LogPath and the
+// report paths continue to refer to files that really exist.
+func (q *taskQueue) finalizeTaskArtifactPathsLocked(t *Task) {
+	if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
+		return
+	}
+	q.ensureTaskArtifactPathsLocked(t)
+	dstDir := taskArtifactsDir(q.logsDir, t, t.Status)
+	if dstDir == "" {
+		return
+	}
+	if t.ArtifactsDir != "" && t.ArtifactsDir != dstDir {
+		// Move only when the destination is confirmed absent, to avoid
+		// merging into (or clobbering) another task's folder.
+		if _, statErr := os.Stat(dstDir); os.IsNotExist(statErr) {
+			if renameErr := os.Rename(t.ArtifactsDir, dstDir); renameErr == nil {
+				t.ArtifactsDir = dstDir
+			}
+		}
+	}
+	ensureTaskReportPaths(t)
+}
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -2,6 +2,7 @@ package webui

 import (
 	"context"
+	"encoding/json"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -12,6 +13,7 @@ import (
 	"time"

 	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
 )

 func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
@@ -161,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
 	}
 }

+// TestNewJobIDUsesTASKPrefixAndZeroPadding checks that, starting from an
+// empty queue and a zeroed job counter, newJobID yields "TASK-000" and then
+// "TASK-001".
+func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
+	// Isolate the test from shared state: clear the global queue and reset
+	// the counter, restoring both afterwards.
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	origCounter := jobCounter.Load()
+	jobCounter.Store(0)
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+		jobCounter.Store(origCounter)
+	})
+
+	if got := newJobID("ignored"); got != "TASK-000" {
+		t.Fatalf("id=%q want TASK-000", got)
+	}
+	if got := newJobID("ignored"); got != "TASK-001" {
+		t.Fatalf("id=%q want TASK-001", got)
+	}
+}
+
+// TestTaskArtifactsDirStartsWithTaskNumber verifies that the artifacts folder
+// of a "TASK-007" task begins with its three-digit number.
+func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
+	task := &Task{ID: "TASK-007", Name: "NVIDIA Benchmark"}
+	dir := taskArtifactsDir(t.TempDir(), task, TaskDone)
+	if got := filepath.Base(dir); !strings.HasPrefix(got, "007_") {
+		t.Fatalf("artifacts dir=%q want prefix 007_", got)
+	}
+}
+
 func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
 	dir := t.TempDir()
 	logPath := filepath.Join(dir, "task.log")
@@ -248,6 +284,196 @@ func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
 	t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
 }

+func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) { // finalizeTaskRun must mark the task done and write report.json/report.html with charts
+	dir := t.TempDir()
+	metricsPath := filepath.Join(dir, "metrics.db")
+	prevMetricsPath := taskReportMetricsDBPath
+	taskReportMetricsDBPath = metricsPath // redirect the package-level metrics DB path to a temp file for this test
+	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
+
+	db, err := openMetricsDB(metricsPath)
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	base := time.Now().UTC().Add(-45 * time.Second) // sample timestamp lies between the task's StartedAt (now-90s) and now
+	if err := db.Write(platform.LiveMetricSample{
+		Timestamp:  base,
+		CPULoadPct: 42,
+		MemLoadPct: 35,
+		PowerW:     510,
+	}); err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+	_ = db.Close() // close error is ignored in this test
+
+	q := &taskQueue{
+		statePath: filepath.Join(dir, "tasks-state.json"),
+		logsDir:   filepath.Join(dir, "tasks"),
+		trigger:   make(chan struct{}, 1),
+	}
+	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+
+	started := time.Now().UTC().Add(-90 * time.Second)
+	task := &Task{
+		ID:        "task-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: started.Add(-10 * time.Second),
+		StartedAt: &started,
+	}
+	q.assignTaskLogPathLocked(task)
+	appendJobLog(task.LogPath, "line-1")
+
+	job := newTaskJobState(task.LogPath)
+	job.finish("") // NOTE(review): presumably an empty message means success — the test expects TaskDone below
+	q.finalizeTaskRun(task, job)
+
+	if task.Status != TaskDone {
+		t.Fatalf("status=%q want %q", task.Status, TaskDone)
+	}
+	if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") { // the final directory name must carry the done status
+		t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
+	}
+	if _, err := os.Stat(task.ReportJSONPath); err != nil {
+		t.Fatalf("report json: %v", err)
+	}
+	if _, err := os.Stat(task.ReportHTMLPath); err != nil {
+		t.Fatalf("report html: %v", err)
+	}
+	var report taskReport
+	data, err := os.ReadFile(task.ReportJSONPath)
+	if err != nil {
+		t.Fatalf("ReadFile(report.json): %v", err)
+	}
+	if err := json.Unmarshal(data, &report); err != nil {
+		t.Fatalf("Unmarshal(report.json): %v", err)
+	}
+	if report.ID != task.ID || report.Status != TaskDone {
+		t.Fatalf("report=%+v", report)
+	}
+	if len(report.Charts) == 0 { // the metric sample written above is expected to produce at least one chart
+		t.Fatalf("expected charts in report, got none")
+	}
+}
+
+func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) { // the HTML report of a benchmark task must show the scores stored in the run's result.json
+	dir := t.TempDir()
+	metricsPath := filepath.Join(dir, "metrics.db")
+	prevMetricsPath := taskReportMetricsDBPath
+	taskReportMetricsDBPath = metricsPath // redirect the package-level metrics DB path; no DB file is created in this test
+	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
+
+	benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	result := platform.NvidiaBenchmarkResult{
+		GeneratedAt:      time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
+		BenchmarkProfile: "standard",
+		OverallStatus:    "OK",
+		GPUs: []platform.BenchmarkGPUResult{
+			{
+				Index: 0,
+				Name:  "NVIDIA H100 PCIe",
+				Scores: platform.BenchmarkScorecard{
+					CompositeScore: 1176.25,
+				},
+			},
+		},
+	}
+	raw, err := json.Marshal(result)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
+	if err := os.MkdirAll(artifactsDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	task := &Task{
+		ID:           "task-bench",
+		Name:         "NVIDIA Benchmark",
+		Target:       "nvidia-benchmark",
+		Status:       TaskDone,
+		CreatedAt:    time.Now().UTC().Add(-time.Minute),
+		ArtifactsDir: artifactsDir,
+	}
+	ensureTaskReportPaths(task)
+	logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n" // NOTE(review): presumably the report finds result.json via this archive path minus ".tar.gz" — confirm
+	if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := writeTaskReportArtifacts(task); err != nil {
+		t.Fatalf("writeTaskReportArtifacts: %v", err)
+	}
+
+	body, err := os.ReadFile(task.ReportHTMLPath)
+	if err != nil {
+		t.Fatalf("ReadFile(report.html): %v", err)
+	}
+	html := string(body)
+	for _, needle := range []string{ // every fragment below must appear verbatim in the rendered HTML
+		`Benchmark Results`,
+		`Composite score for this benchmark task.`,
+		`GPU #0 — NVIDIA H100 PCIe`,
+		`1176.25`,
+	} {
+		if !strings.Contains(html, needle) {
+			t.Fatalf("report missing %q: %s", needle, html)
+		}
+	}
+}
+
+func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) { // enqueue, job log lines and completion must all pass through taskSerialWriteLine
+	var lines []string
+	prev := taskSerialWriteLine
+	taskSerialWriteLine = func(line string) { lines = append(lines, line) } // capture every mirrored line in memory
+	t.Cleanup(func() { taskSerialWriteLine = prev })
+
+	dir := t.TempDir()
+	q := &taskQueue{
+		statePath: filepath.Join(dir, "tasks-state.json"),
+		logsDir:   filepath.Join(dir, "tasks"),
+		trigger:   make(chan struct{}, 1),
+	}
+	task := &Task{
+		ID:        "task-serial-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskPending,
+		CreatedAt: time.Now().UTC(),
+	}
+
+	q.enqueue(task) // expected to emit the "queued" line asserted below
+	started := time.Now().UTC()
+	task.Status = TaskRunning // simulate the worker picking the task up without running it
+	task.StartedAt = &started
+	job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
+	job.append("Starting CPU SAT...")
+	job.append("CPU stress duration: 60s")
+	job.finish("")
+	q.finalizeTaskRun(task, job)
+
+	joined := strings.Join(lines, "\n")
+	for _, needle := range []string{ // each fragment must occur somewhere in the captured output
+		"queued",
+		"Starting CPU SAT...",
+		"CPU stress duration: 60s",
+		"finished with status=done",
+	} {
+		if !strings.Contains(joined, needle) {
+			t.Fatalf("serial mirror missing %q in %q", needle, joined)
+		}
+	}
+}
+
 func TestResolveBurnPreset(t *testing.T) {
 	tests := []struct {
 		profile string
--- a/bible-local/docs/benchmark-clock-calibration.md
+++ b/bible-local/docs/benchmark-clock-calibration.md
@@ -0,0 +1,248 @@
+# Benchmark clock calibration research
+
+## Status
+In progress. Baseline data from production servers pending.
+
+## Background
+
+The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
+before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
+`avg_steady_clock < locked_target * 0.90`.
+
+Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
+even a healthy GPU in a non-ideal server will sustain clocks well below boost.
+The 90% threshold has no empirical basis.
+
+## Key observations (2026-04-06)
+
+### H100 PCIe — new card, server not designed for it
+- avg clock 1384 MHz, P95 1560 MHz (unstable; boost clock 1755 MHz)
+- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
+- Stability: 70.0 — clocks erratic, no equilibrium found
+- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
+
+### H200 NVL — new card, server not designed for it
+- avg clock = P95 = 1635 MHz (perfectly stable)
+- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
+- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
+- Degradation: power_capped, thermal_limited
+- Compute: 989 TOPS — card is computing correctly for its frequency
+
+### Key insight
+The meaningful distinction is not *whether* the card throttles but *how stably*
+it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
+H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
+instability may reflect a more severe thermal mismatch or a card issue.
+
+`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
+`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
+
+## Hypothesis for baseline
+
+After testing on servers designed for their GPUs (proper cooling):
+- Healthy GPU under sustained load will run at a stable fraction of boost
+- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
+- Base clock (`clocks.base.gr`) may be a better reference than boost:
+  a healthy card under real workload should comfortably exceed base clock
+
+## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
+
+Source: external stress test tool, ~90s runs, designed server, adequate power.
+
+### Healthy fingerprint
+
+- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
+- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
+  - Avg steady (visual): **~1580–1620 MHz**
+  - vs boost 1755 MHz: **~91–92%**
+  - Oscillation is NORMAL — this is the boost algorithm balancing under power cap
+  - Stable power + oscillating clocks = healthy power-cap behavior
+- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
+- **Consistency**: all 10 samples within ±20 MHz — very repeatable
+
+### Characteristic pattern
+Flat power line + oscillating/declining clock line = GPU correctly managed by
+power cap algorithm. Do NOT flag this as instability.
+
+### Clock CV implication
+The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
+The current `variance_too_high` threshold (StabilityScore < 85) may fire on
+healthy HBM2e PCIe cards. Needs recalibration.
+
+---
+
+## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
+
+Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
+Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
+
+### GPU clock reference (from nvidia-smi, idle):
+- base_clock_mhz: **1095**
+- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
+- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
+- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
+
+### Observed under 700W sustained load (both samples nearly identical):
+- Power: ~700W flat — SXM slot, adequate power confirmed
+- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
+- vs 1980 MHz (lock target): **72–74%** — severely below
+- vs 1755 MHz (nvidia-smi boost): **81–83%**
+- vs 1095 MHz (base): 130% — above base but far below expected for SXM
+- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
+- Temperature: 38°C → 79–80°C (same rate as HBM2e)
+- Oscillation: present, similar character to HBM2e but at much lower frequency
+
+### Diagnosis
+These restored cards are degraded. A healthy H100 SXM in a designed server
+(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
+The 72–74% result is a clear signal of silicon or VRM degradation from the
+refurbishment process.
+
+### Clock pattern note
+Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
+to images 19/20. Both sample sets show same degraded pattern — same batch.
+
+---
+
+## Baseline matrix (filled where data available)
+
+| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
+|---|---|---|---|---|---|
+| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
+| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
+| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
+| H200 NVL | designed | TBD | TBD | TBD | need baseline |
+
+---
+
+## H100 official spec (from NVIDIA datasheet)
+
+Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
+All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
+
+| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
+|---|---|---|---|---|---|
+| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
+| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
+| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
+| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
+
+Notes:
+- SXM boards do NOT list FP8 peak in this table (field empty)
+- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
+- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
+
+## Observed efficiency (H100 80GB PCIe, throttled server)
+
+From the report in this session (power+thermal throttle throughout steady):
+
+| Precision | Measured | Spec (dense) | % of spec |
+|---|---|---|---|
+| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
+| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
+| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
+
+33–44% of spec is expected given sustained power+thermal throttle (avg clock
+1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
+actual frequency — the low TOPS comes from throttle, not silicon defect.
+
+## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
+
+Format: without sparsity / with sparsity.
+
+| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
+|---|---|---|---|---|---|
+| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
+| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
+
+## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
+
+Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
+
+| Precision | Measured | Spec (dense) | % of spec |
+|---|---|---|---|
+| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
+| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
+| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
+
+Comparable to H100 PCIe efficiency (33–44%) despite different architecture —
+both are throttle-limited. Confirms that % of spec is not a quality signal,
+it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
+
+## Real-world GEMM efficiency reference (2026-04-06, web research)
+
+Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
+worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
+
+### What healthy systems actually achieve:
+- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
+- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
+- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
+
+### Our results vs expectation:
+| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
+|---|---|---|---|---|
+| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
+| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
+
+Our results are roughly **half** of what a healthy system achieves even under throttle.
+This is NOT normal — 30-44% is not the industry baseline.
+
+### Likely causes of the gap (in order of probability):
+1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
+2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
+   Previous user may have set a lower limit via nvidia-smi -pl and it was not
+   reset. Our normalization sets clock locks but does NOT reset power limit.
+   Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
+3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
+   8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
+
+### Power limit gap analysis (H100 PCIe):
+- Avg clock 1384 MHz = 79% of boost 1755 MHz
+- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
+- Actually measured: 329 TOPS = 55% of that estimate
+- Remaining gap after accounting for clock throttle: ~45%
+- Most likely explanation: enforced power limit < 350W TDP, further reducing
+  sustainable clock beyond what sw_thermal alone would cause.
+
+### Action item:
+Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
+so result.json shows if the card was pre-configured with a non-default limit.
+If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
+
+### CPU/RAM impact on GPU FLOPS:
+None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
+CPU core count and host RAM are irrelevant.
+
+## Compute efficiency metric (proposed, no hardcode)
+
+Instead of comparing TOPS to a hardcoded spec, compute:
+  tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
+
+This is model-agnostic. A GPU computing correctly at its actual frequency
+will show a consistent tops_per_sm_per_ghz regardless of throttle level.
+A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
+normal clocks.
+
+SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
+(needs to be added to queryBenchmarkGPUInfo).
+
+Reference values to establish after baseline runs:
+- H100 PCIe fp16_tensor: TBD tops/SM/GHz
+- H100 SXM fp16_tensor: TBD tops/SM/GHz
+
+## Proposed threshold changes (pending more data)
+
+1. **`low_sm_clock_vs_target`**: lower threshold from 90% to 85% based on observed
+   91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
+   capture the root cause.
+
+2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
+   under power cap. Consider suppressing this flag when power is flat and usage
+   is 100% (oscillation is expected). Or lower threshold to 70.
+
+3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
+   ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
+   would have been caught by this).
+
+Decision deferred until baseline on SXM designed servers collected.
--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -19,5 +19,7 @@ ROCRAND_VERSION=3.2.0.60304-76~22.04
 HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
 HIPBLASLT_VERSION=0.10.0.60304-76~22.04
 COMGR_VERSION=2.8.0.60304-76~22.04
+HPL_VERSION=2.3
+HPL_SHA256=32c5c17d22330e6f2337b681aded51637fb6008d3f0eb7c277b163fadd612830
 GO_VERSION=1.24.0
 AUDIT_VERSION=1.0.0
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -32,7 +32,7 @@ lb config noauto \
    --memtest memtest86+ \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
-    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
+    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -36,7 +36,6 @@ typedef void *CUstream;
 #define MAX_CUBLAS_PROFILES 5
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
-#define STRESS_LAUNCH_DEPTH 8

 static const char *ptx_source =
    ".version 6.0\n"
@@ -344,7 +343,6 @@ static int run_ptx_fallback(struct cuda_api *api,
    unsigned long iterations = 0;
    int mp_count = 0;
    int stream_count = 1;
-    int launches_per_wave = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "driver-ptx");
@@ -419,44 +417,42 @@ static int run_ptx_fallback(struct cuda_api *api,

    unsigned int threads = 256;

-    double start = now_seconds();
-    double deadline = start + (double)seconds;
+    double deadline = now_seconds() + (double)seconds;
+    double next_sync = now_seconds() + 1.0;
    while (now_seconds() < deadline) {
-        launches_per_wave = 0;
-        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
-            int launched_this_batch = 0;
-            for (int lane = 0; lane < stream_count; lane++) {
-                unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
-                if (!check_rc(api,
-                              "cuLaunchKernel",
-                              api->cuLaunchKernel(kernel,
-                                                  blocks,
-                                                  1,
-                                                  1,
-                                                  threads,
-                                                  1,
-                                                  1,
-                                                  0,
-                                                  streams[lane],
-                                                  params[lane],
-                                                  NULL))) {
-                    goto fail;
-                }
-                launches_per_wave++;
-                launched_this_batch++;
-            }
-            if (launched_this_batch <= 0) {
-                break;
+        int launched = 0;
+        for (int lane = 0; lane < stream_count; lane++) {
+            unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
+            if (!check_rc(api,
+                          "cuLaunchKernel",
+                          api->cuLaunchKernel(kernel,
+                                              blocks,
+                                              1,
+                                              1,
+                                              threads,
+                                              1,
+                                              1,
+                                              0,
+                                              streams[lane],
+                                              params[lane],
+                                              NULL))) {
+                goto fail;
            }
+            launched++;
+            iterations++;
        }
-        if (launches_per_wave <= 0) {
+        if (launched <= 0) {
            goto fail;
        }
-        if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
-            goto fail;
+        double now = now_seconds();
+        if (now >= next_sync || now >= deadline) {
+            if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
+                goto fail;
+            }
+            next_sync = now + 1.0;
        }
-        iterations += (unsigned long)launches_per_wave;
    }
+    api->cuCtxSynchronize();

    if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
        goto fail;
@@ -468,11 +464,10 @@ static int run_ptx_fallback(struct cuda_api *api,
    report->iterations = iterations;
    snprintf(report->details,
             sizeof(report->details),
-             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
+             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
             size_mb,
             report->buffer_mb,
             report->stream_count,
-             STRESS_LAUNCH_DEPTH,
             bytes_per_stream[0] / (1024u * 1024u),
             iterations);

@@ -606,6 +601,20 @@ struct prepared_profile {
 };

 static const struct profile_desc k_profiles[] = {
+    {
+        "fp64",
+        "fp64",
+        80,
+        1,
+        0,
+        0,
+        8,
+        CUDA_R_64F,
+        CUDA_R_64F,
+        CUDA_R_64F,
+        CUDA_R_64F,
+        CUBLAS_COMPUTE_64F,
+    },
    {
        "fp32_tf32",
        "fp32",
@@ -1126,7 +1135,6 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    int stream_count = 1;
    int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
    int prepared_count = 0;
-    int wave_launches = 0;
    size_t requested_budget = 0;
    size_t total_budget = 0;
    size_t per_profile_budget = 0;
@@ -1193,11 +1201,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details,
                  sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
                  size_mb,
                  report->buffer_mb,
                  report->stream_count,
-                  STRESS_LAUNCH_DEPTH,
                  mp_count,
                  per_profile_budget / (1024u * 1024u));

@@ -1246,50 +1253,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        return 0;
    }

+    /* Keep the GPU queue continuously full by submitting kernels without
+     * synchronizing after every wave.  A sync barrier after each small batch
+     * creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
+     * especially when individual kernels are short.  Instead we sync at most
+     * once per second (for error detection) and once at the very end. */
    double deadline = now_seconds() + (double)seconds;
+    double next_sync = now_seconds() + 1.0;
    while (now_seconds() < deadline) {
-        wave_launches = 0;
-        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
-            int launched_this_batch = 0;
-            for (int i = 0; i < prepared_count; i++) {
-                if (!prepared[i].ready) {
-                    continue;
-                }
-                if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
-                    append_detail(report->details,
-                                  sizeof(report->details),
-                                  "%s=FAILED runtime\n",
-                                  prepared[i].desc.name);
-                    for (int j = 0; j < prepared_count; j++) {
-                        destroy_profile(&cublas, cuda, &prepared[j]);
-                    }
-                    cublas.cublasLtDestroy(handle);
-                    destroy_streams(cuda, streams, stream_count);
-                    cuda->cuCtxDestroy(ctx);
-                    return 0;
-                }
-                prepared[i].iterations++;
-                report->iterations++;
-                wave_launches++;
-                launched_this_batch++;
+        int launched = 0;
+        for (int i = 0; i < prepared_count; i++) {
+            if (!prepared[i].ready) {
+                continue;
            }
-            if (launched_this_batch <= 0) {
-                break;
+            if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
+                append_detail(report->details,
+                              sizeof(report->details),
+                              "%s=FAILED runtime\n",
+                              prepared[i].desc.name);
+                for (int j = 0; j < prepared_count; j++) {
+                    destroy_profile(&cublas, cuda, &prepared[j]);
+                }
+                cublas.cublasLtDestroy(handle);
+                destroy_streams(cuda, streams, stream_count);
+                cuda->cuCtxDestroy(ctx);
+                return 0;
            }
+            prepared[i].iterations++;
+            report->iterations++;
+            launched++;
        }
-        if (wave_launches <= 0) {
+        if (launched <= 0) {
            break;
        }
-        if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
-            for (int i = 0; i < prepared_count; i++) {
-                destroy_profile(&cublas, cuda, &prepared[i]);
+        double now = now_seconds();
+        if (now >= next_sync || now >= deadline) {
+            if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
+                for (int i = 0; i < prepared_count; i++) {
+                    destroy_profile(&cublas, cuda, &prepared[i]);
+                }
+                cublas.cublasLtDestroy(handle);
+                destroy_streams(cuda, streams, stream_count);
+                cuda->cuCtxDestroy(ctx);
+                return 0;
            }
-            cublas.cublasLtDestroy(handle);
-            destroy_streams(cuda, streams, stream_count);
-            cuda->cuCtxDestroy(ctx);
-            return 0;
+            next_sync = now + 1.0;
        }
    }
+    /* Final drain — ensure all queued work finishes before we read results. */
+    cuda->cuCtxSynchronize();

    for (int i = 0; i < prepared_count; i++) {
        if (!prepared[i].ready) {
--- a/iso/builder/build-hpl.sh
+++ b/iso/builder/build-hpl.sh
@@ -0,0 +1,331 @@
+#!/bin/sh
+# build-hpl.sh — build HPL (High Performance LINPACK) for the bee LiveCD.
+#
+# Downloads the HPL release tarball (netlib, then mirrors, then a git fallback),
+# fetches the OpenBLAS runtime from the Debian 12 apt repo, and compiles xhpl
+# with a minimal single-process MPI stub so no MPI package is needed in the ISO.
+#
+# The resulting xhpl binary is a standard HPL binary whose output is compatible
+# with the accepted HPL format (WR... Gflops lines).
+#
+# Output:
+#   $CACHE_DIR/bin/xhpl
+#   $CACHE_DIR/lib/libopenblas.so*   (runtime, injected into ISO /usr/lib/)
+
+set -e
+
+# Positional arguments: HPL release version, expected sha256 of the release
+# tarball, and the dist directory that holds the per-version build cache.
+HPL_VERSION="$1"
+HPL_SHA256="$2"
+DIST_DIR="$3"
+
+# All three arguments are mandatory; report misuse on stderr like the other
+# error paths in this script.
+if [ -z "$HPL_VERSION" ] || [ -z "$HPL_SHA256" ] || [ -z "$DIST_DIR" ]; then
+    echo "usage: $0 <hpl-version> <sha256> <dist-dir>" >&2
+    exit 1
+fi
+
+echo "=== HPL ${HPL_VERSION} ==="
+
+# CACHE_DIR receives the build result for this HPL version; downloads live in a
+# separate cache directory that BEE_CACHE_DIR can relocate.
+CACHE_DIR="${DIST_DIR}/hpl-${HPL_VERSION}"
+CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
+DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/hpl-downloads"
+
+# xhpl is copied into the cache as the very last build step, so an executable
+# there means an earlier run finished and nothing needs rebuilding.
+if [ -x "${CACHE_DIR}/bin/xhpl" ]; then
+    echo "=== HPL cached, skipping build ==="
+    echo "binary: ${CACHE_DIR}/bin/xhpl"
+    exit 0
+fi
+
+mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/bin" "${CACHE_DIR}/lib"
+
+# ── download HPL source ────────────────────────────────────────────────────────
+# Cached location of the HPL source archive (shared by all download paths).
+HPL_TAR="${DOWNLOAD_CACHE_DIR}/hpl-${HPL_VERSION}.tar.gz"
+# Whitespace-separated tarball URLs tried in order; HPL_URLS overrides the list.
+DEFAULT_HPL_URLS="
+https://www.netlib.org/benchmark/hpl/hpl-${HPL_VERSION}.tar.gz
+https://fossies.org/linux/privat/hpl-${HPL_VERSION}.tar.gz
+"
+# Git fallback used when no tarball URL works; HPL_GIT_URL and HPL_GIT_REFS
+# override the repository and the refs tried (in order).
+HPL_GIT_URL="${HPL_GIT_URL:-https://github.com/icl-utk-edu/hpl.git}"
+DEFAULT_HPL_GIT_REFS="v${HPL_VERSION} ${HPL_VERSION} main"
+# "tarball" enables the sha256 check below; the git fallback switches it to "git".
+HPL_SOURCE_MODE="tarball"
+
+# download_to_file URL OUT — fetch URL into OUT with curl, or wget without curl.
+# Uses "$1"/"$2" directly: sh functions share variables with their caller, so
+# assigning e.g. "out" here would overwrite a caller variable of the same name.
+download_to_file() {
+    if command -v curl >/dev/null 2>&1; then
+        curl -fL \
+            --connect-timeout 15 \
+            --max-time 180 \
+            --retry 2 \
+            --retry-delay 2 \
+            --output "$2" \
+            "$1"
+        return $?
+    fi
+
+    wget \
+        --show-progress \
+        --tries=2 \
+        --timeout=30 \
+        -O "$2" \
+        "$1"
+}
+
+# download_hpl_tarball OUT — try each URL from HPL_URLS (default:
+# DEFAULT_HPL_URLS) in order, downloading to OUT.part and moving it to OUT on
+# success. Returns 0 once a download is in place, 1 when every URL failed.
+# Variables carry a function prefix because sh has no local scope: a helper
+# called from here that assigns a generic name would otherwise overwrite them.
+download_hpl_tarball() {
+    hpl_tb_out="$1"
+    hpl_tb_tmp="${hpl_tb_out}.part"
+    hpl_tb_urls="${HPL_URLS:-$DEFAULT_HPL_URLS}"
+
+    rm -f "${hpl_tb_tmp}"
+    for hpl_tb_url in ${hpl_tb_urls}; do
+        [ -n "${hpl_tb_url}" ] || continue
+        echo "=== trying HPL source: ${hpl_tb_url} ==="
+        # Only report success when the file really reached its final location.
+        if download_to_file "${hpl_tb_url}" "${hpl_tb_tmp}" \
+            && mv "${hpl_tb_tmp}" "${hpl_tb_out}"; then
+            return 0
+        fi
+        rm -f "${hpl_tb_tmp}"
+        echo "=== failed: ${hpl_tb_url} ==="
+    done
+
+    echo "ERROR: failed to download HPL ${HPL_VERSION} from all configured URLs" >&2
+    return 1
+}
+
+# download_hpl_from_git_archive OUT — fallback source: shallow-clone HPL_GIT_URL
+# at the first ref from HPL_GIT_REFS (default: DEFAULT_HPL_GIT_REFS) that can be
+# cloned, repack the clone as a tar.gz with top-level directory
+# hpl-${HPL_VERSION}, and store it at OUT.
+# Sets HPL_SOURCE_MODE="git" on success and returns 0; prints a summary of all
+# attempted sources to stderr and returns 1 when no ref could be cloned.
+download_hpl_from_git_archive() {
+    out="$1"
+    refs="${HPL_GIT_REFS:-$DEFAULT_HPL_GIT_REFS}"
+    tmp_root="$(mktemp -d)"
+    repo_dir="${tmp_root}/repo"
+    archive_dir="${tmp_root}/hpl-${HPL_VERSION}"
+    archive_tmp="${out}.part"
+
+    for ref in ${refs}; do
+        [ -n "${ref}" ] || continue
+        echo "=== trying HPL git source: ${HPL_GIT_URL} ref ${ref} ==="
+        # Start every attempt from a clean slate.
+        rm -rf "${repo_dir}" "${archive_dir}" "${archive_tmp}"
+        if git clone --depth 1 --branch "${ref}" "${HPL_GIT_URL}" "${repo_dir}"; then
+            # The directory is renamed to hpl-${HPL_VERSION} whichever ref was
+            # cloned, so the archive layout does not record the ref used.
+            mv "${repo_dir}" "${archive_dir}"
+            tar czf "${archive_tmp}" -C "${tmp_root}" "hpl-${HPL_VERSION}"
+            mv "${archive_tmp}" "${out}"
+            rm -rf "${tmp_root}"
+            HPL_SOURCE_MODE="git"
+            return 0
+        fi
+        echo "=== failed git ref: ${ref} ==="
+    done
+
+    rm -rf "${tmp_root}" "${archive_tmp}"
+    echo "ERROR: failed to obtain HPL ${HPL_VERSION} from all configured sources" >&2
+    echo "  looked for cache: ${out}" >&2
+    echo "  tarball mirrors: ${HPL_URLS:-$DEFAULT_HPL_URLS}" >&2
+    echo "  git fallback: ${HPL_GIT_URL} refs ${refs}" >&2
+    echo "  override mirrors with HPL_URLS=\"https://mirror1/...\"" >&2
+    echo "  override git refs with HPL_GIT_REFS=\"v${HPL_VERSION} ${HPL_VERSION} main\"" >&2
+    return 1
+}
+
+# An archive produced by the git fallback can never match the release sha256.
+# Its origin is recorded in a marker next to the cached file; without it a later
+# run would treat the cached archive as a release tarball, fail the checksum
+# and delete it.
+HPL_GIT_MARKER="${HPL_TAR}.from-git"
+
+if [ ! -f "${HPL_TAR}" ]; then
+    echo "=== downloading HPL ${HPL_VERSION} ==="
+    rm -f "${HPL_GIT_MARKER}"
+    download_hpl_tarball "${HPL_TAR}" || download_hpl_from_git_archive "${HPL_TAR}"
+    if [ "${HPL_SOURCE_MODE}" = "git" ]; then
+        : > "${HPL_GIT_MARKER}"
+    fi
+elif [ -f "${HPL_GIT_MARKER}" ]; then
+    # Cached archive came from the git fallback in an earlier run.
+    HPL_SOURCE_MODE="git"
+fi
+
+if [ "${HPL_SOURCE_MODE}" = "tarball" ]; then
+    # Release tarballs must match the pinned checksum; a bad file is removed so
+    # the next run downloads it again.
+    actual_sha="$(sha256sum "${HPL_TAR}" | awk '{print $1}')"
+    if [ "${actual_sha}" != "${HPL_SHA256}" ]; then
+        echo "ERROR: sha256 mismatch for hpl-${HPL_VERSION}.tar.gz" >&2
+        echo "  expected: ${HPL_SHA256}" >&2
+        echo "  actual:   ${actual_sha}" >&2
+        rm -f "${HPL_TAR}"
+        exit 1
+    fi
+    echo "sha256 OK: hpl-${HPL_VERSION}.tar.gz"
+else
+    echo "=== HPL source obtained from git fallback; skipping tarball sha256 check ==="
+fi
+
+# ── download OpenBLAS from Debian 12 apt repo ─────────────────────────────────
+# NOTE(review): REPO_BASE is not referenced anywhere else in this script; the
+# .deb URL below is built from the Filename field of the package index instead.
+REPO_BASE="https://deb.debian.org/debian/pool/main/o/openblas"
+# Local copy of the bookworm amd64 package index, refreshed on every run.
+PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
+# Binary package that provides the OpenBLAS shared library used by xhpl.
+OPENBLAS_PKG="libopenblas0-openmp"
+
+echo "=== fetching Debian 12 Packages.gz ==="
+wget -q -O "${PACKAGES_GZ}" \
+    "https://deb.debian.org/debian/dists/bookworm/main/binary-amd64/Packages.gz"
+
+# lookup_deb PKG — print one line "<Filename> <SHA256>" for the first stanza of
+# package PKG in ${PACKAGES_GZ}; prints nothing when PKG is not listed.
+lookup_deb() {
+    pkg="$1"
+    gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" '
+        /^Package: / { cur=$2 }
+        /^Filename: / { file=$2 }
+        /^SHA256: /  { sha=$2 }
+        /^$/ {
+            # End of a stanza. On a match only stop reading: awk still runs the
+            # END rule after "exit", so printing here too would emit the result
+            # twice and corrupt the values parsed from it.
+            if (cur == pkg) exit
+            cur=""; file=""; sha=""
+        }
+        END {
+            # Reached after the early exit above, or at EOF when the matching
+            # stanza is the last one and has no trailing blank line.
+            if (cur == pkg) print file " " sha
+        }'
+}
+
+# Resolve the pool path and checksum of the OpenBLAS runtime package. Only the
+# first line of the lookup result is parsed so that a repeated match cannot put
+# embedded newlines into the URL or the expected checksum.
+meta="$(lookup_deb "${OPENBLAS_PKG}")"
+[ -n "$meta" ] || { echo "ERROR: ${OPENBLAS_PKG} not found in Packages.gz"; exit 1; }
+repo_file="$(printf '%s' "$meta" | awk 'NR == 1 {print $1}')"
+repo_sha="$(printf '%s'  "$meta" | awk 'NR == 1 {print $2}')"
+
+# Reuse a cached .deb only if it still matches the index checksum.
+OPENBLAS_DEB="${DOWNLOAD_CACHE_DIR}/$(basename "${repo_file}")"
+if [ -f "${OPENBLAS_DEB}" ]; then
+    actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
+    [ "$actual" = "$repo_sha" ] || rm -f "${OPENBLAS_DEB}"
+fi
+if [ ! -f "${OPENBLAS_DEB}" ]; then
+    echo "=== downloading ${OPENBLAS_PKG} ==="
+    wget --show-progress -O "${OPENBLAS_DEB}" "https://deb.debian.org/debian/${repo_file}"
+    actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
+    [ "$actual" = "$repo_sha" ] || { echo "ERROR: sha256 mismatch for ${OPENBLAS_PKG}"; rm -f "${OPENBLAS_DEB}"; exit 1; }
+fi
+
+# extract libopenblas shared libs
+TMP_DEB=$(mktemp -d)
+# Remove temporary directories on exit. INT/TERM exit explicitly (which then
+# runs the EXIT trap); a cleanup-only signal trap would delete the directories
+# and let the script carry on without them.
+cleanup_tmp() { rm -rf "${TMP_DEB}" "${BUILD_TMP:-}"; }
+trap cleanup_tmp EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+(
+    cd "${TMP_DEB}"
+    ar x "${OPENBLAS_DEB}"
+    tar xf data.tar.*
+)
+find "${TMP_DEB}" \( -name 'libopenblas*.so*' \) \( -type f -o -type l \) \
+    -exec cp -a {} "${CACHE_DIR}/lib/" \;
+echo "=== OpenBLAS libs: $(ls "${CACHE_DIR}/lib/" | wc -l) files ==="
+
+# No -dev package is used: linking only needs an unversioned libopenblas.so
+# name, so point one at the versioned library. The versioned entry may itself
+# be a symlink (cp -a above preserves links), hence both types are accepted.
+OPENBLAS_SO="$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libopenblas.so.*' \( -type f -o -type l \) | sort | head -1)"
+[ -n "${OPENBLAS_SO}" ] || { echo "ERROR: libopenblas.so not extracted"; exit 1; }
+SONAME="$(basename "${OPENBLAS_SO}")"
+ln -sf "${SONAME}" "${CACHE_DIR}/lib/libopenblas.so" 2>/dev/null || true
+ln -sf "${SONAME}" "${CACHE_DIR}/lib/libblas.so" 2>/dev/null || true
+
+# ── build HPL ─────────────────────────────────────────────────────────────────
+# Scratch directory for the build; the exit trap installed earlier removes it.
+BUILD_TMP=$(mktemp -d)
+
+cd "${BUILD_TMP}"
+tar xf "${HPL_TAR}"
+# The archive is expected to unpack into a top-level hpl-* directory; the first
+# match becomes the source root for the rest of the build.
+SRC_DIR="$(find . -maxdepth 1 -type d -name 'hpl-*' | head -1)"
+[ -n "${SRC_DIR}" ] || { echo "ERROR: HPL source dir not found"; exit 1; }
+cd "${SRC_DIR}"
+
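+# Write a minimal single-process MPI stub so we don't need an MPI package.
+# The stub implements only the calls listed below, as no-ops or local copies.
+# NOTE(review): only mpi_stub.c is written to ${BUILD_TMP}; no mpi.h is
+# generated there although Make.bee passes -I${BUILD_TMP} as MPinc — confirm
+# where the HPL sources are expected to find their MPI header and that every
+# MPI symbol HPL references is covered by this stub.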
+cat > "${BUILD_TMP}/mpi_stub.c" <<'MPISTUB'
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+typedef int MPI_Comm;
+typedef int MPI_Datatype;
+typedef int MPI_Op;
+typedef int MPI_Status;
+typedef int MPI_Request;
+
+#define MPI_COMM_WORLD 0
+#define MPI_SUCCESS    0
+#define MPI_DOUBLE     6
+#define MPI_INT        5
+#define MPI_SUM        0
+#define MPI_MAX        1
+#define MPI_MIN        2
+#define MPI_BYTE       1
+#define MPI_ANY_SOURCE -1
+#define MPI_ANY_TAG    -1
+#define MPI_STATUS_IGNORE ((MPI_Status*)0)
+
+int MPI_Init(int *argc, char ***argv)          { (void)argc; (void)argv; return MPI_SUCCESS; }
+int MPI_Finalize(void)                          { return MPI_SUCCESS; }
+int MPI_Comm_rank(MPI_Comm c, int *rank)        { (void)c; *rank = 0; return MPI_SUCCESS; }
+int MPI_Comm_size(MPI_Comm c, int *size)        { (void)c; *size = 1; return MPI_SUCCESS; }
+int MPI_Bcast(void *b, int n, MPI_Datatype t, int r, MPI_Comm c)
+    { (void)b;(void)n;(void)t;(void)r;(void)c; return MPI_SUCCESS; }
+int MPI_Reduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, int root, MPI_Comm c) {
+    (void)op;(void)root;(void)c;
+    size_t sz = (t==MPI_DOUBLE)?sizeof(double):(t==MPI_INT)?sizeof(int):1;
+    memcpy(r, s, (size_t)n * sz);
+    return MPI_SUCCESS;
+}
+int MPI_Allreduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, MPI_Comm c)
+    { return MPI_Reduce(s,r,n,t,op,0,c); }
+int MPI_Send(const void *b, int n, MPI_Datatype t, int d, int tag, MPI_Comm c)
+    { (void)b;(void)n;(void)t;(void)d;(void)tag;(void)c; return MPI_SUCCESS; }
+int MPI_Recv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Status *st)
+    { (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)st; return MPI_SUCCESS; }
+int MPI_Sendrecv(const void *sb, int sn, MPI_Datatype st2, int dest, int stag,
+                 void *rb, int rn, MPI_Datatype rt, int src, int rtag,
+                 MPI_Comm c, MPI_Status *status)
+    { (void)sb;(void)sn;(void)st2;(void)dest;(void)stag;
+      (void)rb;(void)rn;(void)rt;(void)src;(void)rtag;(void)c;(void)status;
+      return MPI_SUCCESS; }
+int MPI_Irecv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Request *req)
+    { (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)req; return MPI_SUCCESS; }
+int MPI_Wait(MPI_Request *req, MPI_Status *st)
+    { (void)req;(void)st; return MPI_SUCCESS; }
+int MPI_Abort(MPI_Comm c, int code) { (void)c; exit(code); }
+double MPI_Wtime(void) {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
+}
+MPISTUB
+
+# Write Make.bee — HPL makefile configuration.
+# HPL's per-directory makefiles include this file and reference HPL_INCLUDES,
+# HPL_LIBS, ARCHIVER, ARFLAGS and RANLIB, so they are defined here the same way
+# as in the stock setup/Make.<arch> templates.
+# TOPdir is expanded by the shell right now (cwd is the HPL source root): the
+# file is also read from build sub-directories, where a make-time "pwd" would
+# resolve to the wrong directory.
+# Host-specific tuning is opt-in through HPL_ARCH_FLAGS (e.g. -march=native):
+# xhpl is built on the build host but executed from the LiveCD on other
+# machines, which may lack the build host's instruction set extensions.
+HPL_ARCH_FLAGS="${HPL_ARCH_FLAGS:-}"
+cat > Make.bee <<MAKEFILE
+SHELL        = /bin/sh
+CD           = cd
+CP           = cp
+LN_S         = ln -s
+MKDIR        = mkdir -p
+RM           = /bin/rm -f
+TOUCH        = touch
+ARCH         = bee
+
+# Directories
+TOPdir       = $(pwd)
+INCdir       = \$(TOPdir)/include
+BINdir       = \$(TOPdir)/bin/\$(ARCH)
+LIBdir       = \$(TOPdir)/lib/\$(ARCH)
+HPLlib       = \$(LIBdir)/libhpl.a
+
+# Compiler
+CC           = gcc
+CCNOOPT      = \$(HPL_DEFS)
+CCFLAGS      = \$(HPL_DEFS) -O3 ${HPL_ARCH_FLAGS} -funroll-loops -fomit-frame-pointer
+
+# Linker
+LINKER       = gcc
+LINKFLAGS    = \$(CCFLAGS)
+
+# Archiver
+ARCHIVER     = ar
+ARFLAGS      = r
+RANLIB       = echo
+
+# MPI (single-process stub — no actual MPI needed)
+MPdir        =
+MPinc        = -I${BUILD_TMP}
+MPlib        = ${BUILD_TMP}/mpi_stub.o
+
+# BLAS (OpenBLAS)
+LAdir        = ${CACHE_DIR}/lib
+LAinc        =
+LAlib        = -L\$(LAdir) -Wl,-rpath,/usr/lib -lopenblas
+
+HPL_INCLUDES = -I\$(INCdir) -I\$(INCdir)/\$(ARCH) \$(LAinc) \$(MPinc)
+HPL_LIBS     = \$(HPLlib) \$(LAlib) \$(MPlib)
+HPL_OPTS     =
+HPL_DEFS     = \$(HPL_OPTS) -DHPL_CALL_CBLAS \$(HPL_INCLUDES)
+MAKEFILE
+echo "=== Make.bee written ==="
+
+# compile MPI stub
+gcc -O2 -c -o "${BUILD_TMP}/mpi_stub.o" "${BUILD_TMP}/mpi_stub.c"
+
+# build HPL
+echo "=== building HPL ${HPL_VERSION} ==="
+# The full log is captured instead of piping make into tail: in plain sh a
+# pipeline reports only tail's status, which hid a failing make and all but
+# its last 20 lines of output.
+# The build runs serially: the classic HPL top-level Makefile does not order
+# its startup/refresh/build phases against each other, so -j can run them
+# concurrently.
+HPL_BUILD_LOG="${BUILD_TMP}/hpl-build.log"
+if ! make arch=bee > "${HPL_BUILD_LOG}" 2>&1; then
+    echo "ERROR: HPL build failed; last 50 lines of output:" >&2
+    tail -50 "${HPL_BUILD_LOG}" >&2
+    exit 1
+fi
+tail -20 "${HPL_BUILD_LOG}"
+
+XHPL_BIN="bin/bee/xhpl"
+[ -x "${XHPL_BIN}" ] || { echo "ERROR: xhpl not found after build"; exit 1; }
+
+# Installing xhpl is the last step: its presence marks the cache as complete.
+cp "${XHPL_BIN}" "${CACHE_DIR}/bin/xhpl"
+chmod +x "${CACHE_DIR}/bin/xhpl"
+echo "=== HPL build complete ==="
+echo "binary: ${CACHE_DIR}/bin/xhpl"
+echo "libs:   $(ls "${CACHE_DIR}/lib/")"
--- a/iso/builder/build-in-container.sh
+++ b/iso/builder/build-in-container.sh
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
            ;;
        *)
            echo "unknown arg: $1" >&2
-            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
+            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
            exit 1
            ;;
    esac
 done

 case "$VARIANT" in
-    nvidia|amd|nogpu|all) ;;
-    *) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
+    nvidia|nvidia-legacy|amd|nogpu|all) ;;
+    *) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
 esac

 if [ "$CLEAN_CACHE" = "1" ]; then
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
           "${CACHE_DIR:?}/lb-packages"
    echo "=== cleaning live-build work dirs ==="
    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
+    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
    rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
    rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
+    rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
+    rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
+    rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
+    rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
    echo "=== caches cleared, proceeding with build ==="
 fi

@@ -180,6 +185,9 @@ case "$VARIANT" in
    nvidia)
        run_variant nvidia
        ;;
+    nvidia-legacy)
+        run_variant nvidia-legacy
+        ;;
    amd)
        run_variant amd
        ;;
@@ -188,6 +196,7 @@ case "$VARIANT" in
        ;;
    all)
        run_variant nvidia
+        run_variant nvidia-legacy
        run_variant amd
        run_variant nogpu
        ;;
--- a/iso/builder/build-nvidia-module.sh
+++ b/iso/builder/build-nvidia-module.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
-# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
+# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
 #
 # Downloads the official NVIDIA .run installer, extracts kernel modules and
-# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
+# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
+#   - open         -> kernel-open/ sources from the .run installer
+#   - proprietary  -> traditional proprietary kernel sources from the .run installer
 #
 # Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
 # are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
@@ -17,10 +19,19 @@ set -e
 NVIDIA_VERSION="$1"
 DIST_DIR="$2"
 DEBIAN_KERNEL_ABI="$3"
+NVIDIA_FLAVOR="${4:-open}"

-[ -n "$NVIDIA_VERSION" ]    || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
-[ -n "$DIST_DIR" ]          || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
-[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
+[ -n "$NVIDIA_VERSION" ]    || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
+[ -n "$DIST_DIR" ]          || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
+[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
+
+case "$NVIDIA_FLAVOR" in
+    open|proprietary) ;;
+    *)
+        echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
+        exit 1
+        ;;
+esac

 KVER="${DEBIAN_KERNEL_ABI}-amd64"
 # On Debian, kernel headers are split into two packages:
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
 KDIR_ARCH="/usr/src/linux-headers-${KVER}"
 KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"

-echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
+echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="

-if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
-    echo "=== installing linux-headers-${KVER} ==="
-    DEBIAN_FRONTEND=noninteractive apt-get install -y \
-        "linux-headers-${KVER}" \
-        gcc make perl
-fi
-echo "kernel headers (arch):   $KDIR_ARCH"
-echo "kernel headers (common): $KDIR_COMMON"
-
-CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
+CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
 EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
-CACHE_LAYOUT_VERSION="2"
+CACHE_LAYOUT_VERSION="3"
 CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
 if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
        && [ -f "$CACHE_LAYOUT_MARKER" ] \
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
    exit 0
 fi

+if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
+    echo "=== installing linux-headers-${KVER} ==="
+    DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        "linux-headers-${KVER}" \
+        gcc make perl
+fi
+echo "kernel headers (arch):   $KDIR_ARCH"
+echo "kernel headers (common): $KDIR_COMMON"
+
 # Download official NVIDIA .run installer with sha256 verification
 BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
 mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
 rm -rf "$EXTRACT_DIR"
 "$RUN_FILE" --extract-only --target "$EXTRACT_DIR"

-# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
+# Find kernel source directory for the selected flavor.
 KERNEL_SRC=""
-for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
-    [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
-done
-[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
+if [ "$NVIDIA_FLAVOR" = "open" ]; then
+    for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
+        [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
+    done
+else
+    for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
+        [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
+    done
+fi
+[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
 echo "kernel source: $KERNEL_SRC"

 # Build kernel modules
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -15,26 +15,46 @@ DIST_DIR="${REPO_ROOT}/dist"
 VENDOR_DIR="${REPO_ROOT}/iso/vendor"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 AUTH_KEYS=""
+BUILD_VARIANT="nvidia"
 BEE_GPU_VENDOR="nvidia"
+BEE_NVIDIA_MODULE_FLAVOR="open"

 # parse args
 while [ $# -gt 0 ]; do
    case "$1" in
        --authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
-        --variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
+        --variant) BUILD_VARIANT="$2"; shift 2 ;;
        *) echo "unknown arg: $1"; exit 1 ;;
    esac
 done

-case "$BEE_GPU_VENDOR" in
-    nvidia|amd|nogpu) ;;
-    *) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
+case "$BUILD_VARIANT" in
+    nvidia)
+        BEE_GPU_VENDOR="nvidia"
+        BEE_NVIDIA_MODULE_FLAVOR="open"
+        ;;
+    nvidia-legacy)
+        BEE_GPU_VENDOR="nvidia"
+        BEE_NVIDIA_MODULE_FLAVOR="proprietary"
+        ;;
+    amd)
+        BEE_GPU_VENDOR="amd"
+        BEE_NVIDIA_MODULE_FLAVOR=""
+        ;;
+    nogpu)
+        BEE_GPU_VENDOR="nogpu"
+        BEE_NVIDIA_MODULE_FLAVOR=""
+        ;;
+    *)
+        echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
+        exit 1
+        ;;
 esac

-BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
-OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
+BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
+OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"

-export BEE_GPU_VENDOR
+export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT

 . "${BUILDER_DIR}/VERSIONS"
 export PATH="$PATH:/usr/local/go/bin"
@@ -627,7 +647,7 @@ recover_iso_memtest() {

 AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
 ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
-ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
+ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
 # Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
 OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
 mkdir -p "${OUT_DIR}"
@@ -801,7 +821,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
    apt-get install -y "linux-headers-${KVER}"
 fi

-echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
+echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
 echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
 echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
 echo ""
@@ -871,7 +891,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    fi
 fi

-echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
+echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
 mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"

 # Sync builder config into variant work dir, preserving lb cache.
@@ -897,6 +917,86 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
    rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
 fi

+if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
+    cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
+source /boot/grub/config.cfg
+
+echo ""
+echo "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗"
+echo "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝"
+echo "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗"
+echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
+echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
+echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
+echo "  Hardware Audit LiveCD"
+echo ""
+
+menuentry "EASY-BEE" {
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+    initrd  @INITRD_LIVE@
+}
+
+submenu "EASY-BEE (advanced options) -->" {
+    menuentry "EASY-BEE — KMS (no nomodeset)" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        initrd  @INITRD_LIVE@
+    }
+
+    menuentry "EASY-BEE — fail-safe" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
+        initrd  @INITRD_LIVE@
+    }
+}
+
+if [ "${grub_platform}" = "efi" ]; then
+    menuentry "Memory Test (memtest86+)" {
+        chainloader /boot/memtest86+x64.efi
+    }
+else
+    menuentry "Memory Test (memtest86+)" {
+        linux16 /boot/memtest86+x64.bin
+    }
+fi
+
+if [ "${grub_platform}" = "efi" ]; then
+    menuentry "UEFI Firmware Settings" {
+        fwsetup
+    }
+fi
+EOF
+
+    cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
+label live-@FLAVOUR@-normal
+    menu label ^EASY-BEE
+    menu default
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@
+
+label live-@FLAVOUR@-kms
+    menu label EASY-BEE (^graphics/KMS)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ bee.display=kms
+
+label live-@FLAVOUR@-toram
+    menu label EASY-BEE (^load to RAM)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ toram
+
+label live-@FLAVOUR@-failsafe
+    menu label EASY-BEE (^fail-safe)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
+
+label memtest
+    menu label ^Memory Test (memtest86+)
+    linux /boot/memtest86+x64.bin
+EOF
+fi
+
 rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
 rm -f \
    "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
@@ -981,10 +1081,10 @@ done
 # --- NVIDIA kernel modules and userspace libs ---
 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
-        sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
+        sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"

    KVER="${DEBIAN_KERNEL_ABI}-amd64"
-    NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
+    NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"

    # Inject .ko files into overlay at /usr/local/lib/nvidia/
    OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
@@ -1048,6 +1148,19 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    echo "=== john injected ==="
 fi

+# --- build HPL (CPU LINPACK) — runs on all variants ---
+run_step "build HPL ${HPL_VERSION}" "80-hpl" \
+    sh "${BUILDER_DIR}/build-hpl.sh" "${HPL_VERSION}" "${HPL_SHA256}" "${DIST_DIR}"
+
+HPL_CACHE="${DIST_DIR}/hpl-${HPL_VERSION}"
+mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee" "${OVERLAY_STAGE_DIR}/usr/lib"
+cp "${HPL_CACHE}/bin/xhpl" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
+chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
+chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-hpl" 2>/dev/null || true
+# Inject OpenBLAS runtime libs needed by xhpl; xhpl cannot start without them, so a failed copy aborts the build.
+cp "${HPL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" || { echo "ERROR: failed to inject OpenBLAS libs for xhpl" >&2; exit 1; }
+echo "=== HPL injected: xhpl + $(ls "${HPL_CACHE}/lib/" | wc -l) OpenBLAS libs ==="
+
 # --- embed build metadata ---
 mkdir -p "${OVERLAY_STAGE_DIR}/etc"
 BUILD_DATE="$(date +%Y-%m-%d)"
@@ -1055,13 +1168,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u

 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
+NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
 NCCL_VERSION=${NCCL_VERSION}
 NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
 CUBLAS_VERSION=${CUBLAS_VERSION}
 CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
 NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
 JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
-    GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
+    GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
 elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
    GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
    GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
@@ -1073,16 +1187,23 @@ fi
 cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
 BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
 BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
+BEE_BUILD_VARIANT=${BUILD_VARIANT}
 BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
 BUILD_DATE=${BUILD_DATE}
 GIT_COMMIT=${GIT_COMMIT}
 DEBIAN_VERSION=${DEBIAN_VERSION}
 DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
+HPL_VERSION=${HPL_VERSION}
 ${GPU_VERSION_LINE}
 EOF

 # Write GPU vendor marker for hooks
 echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
+if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
+    echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
+else
+    rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
+fi

 # Patch motd with build info
 BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
@@ -1153,10 +1274,10 @@ fi

 # --- build ISO using live-build ---
 echo ""
-echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
+echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="

 # Export for auto/config
-BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
+BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
 export BEE_GPU_VENDOR_UPPER

 cd "${LB_DIR}"
@@ -1191,7 +1312,7 @@ if [ -f "$ISO_RAW" ]; then
    validate_iso_nvidia_runtime "$ISO_RAW"
    cp "$ISO_RAW" "$ISO_OUT"
    echo ""
-    echo "=== done (${BEE_GPU_VENDOR}) ==="
+    echo "=== done (${BUILD_VARIANT}) ==="
    echo "ISO: $ISO_OUT"
    if command -v stat >/dev/null 2>&1; then
        ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
--- a/iso/builder/config/bootloaders/grub-pc/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg
@@ -7,6 +7,7 @@ echo "  █████╗  ███████║███████╗ ╚
 echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
 echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
 echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
+echo "  Hardware Audit LiveCD"
 echo ""

 menuentry "EASY-BEE" {
@@ -14,29 +15,21 @@ menuentry "EASY-BEE" {
    initrd  @INITRD_LIVE@
 }

-menuentry "EASY-BEE (graphics/KMS)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
+submenu "EASY-BEE (advanced options) -->" {
+    menuentry "EASY-BEE — GSP=off" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        initrd  @INITRD_LIVE@
+    }

-menuentry "EASY-BEE (load to RAM)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
+    menuentry "EASY-BEE — KMS (no nomodeset)" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        initrd  @INITRD_LIVE@
+    }

-menuentry "EASY-BEE (NVIDIA GSP=off)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
-
-menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
-
-menuentry "EASY-BEE (fail-safe)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
-    initrd  @INITRD_LIVE@
+    menuentry "EASY-BEE — fail-safe" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
+        initrd  @INITRD_LIVE@
+    }
 }

 if [ "${grub_platform}" = "efi" ]; then
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -31,6 +31,7 @@ systemctl enable bee-audit.service
 systemctl enable bee-web.service
 systemctl enable bee-sshsetup.service
 systemctl enable bee-selfheal.timer
+systemctl enable bee-boot-status.service
 systemctl enable ssh.service
 systemctl enable lightdm.service 2>/dev/null || true
 systemctl enable qemu-guest-agent.service 2>/dev/null || true
@@ -59,7 +60,8 @@ chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
-chmod +x /usr/local/bin/bee-selfheal   2>/dev/null || true
+chmod +x /usr/local/bin/bee-selfheal      2>/dev/null || true
+chmod +x /usr/local/bin/bee-boot-status  2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -0,0 +1,111 @@
+#!/bin/sh
+# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
+set -e
+echo "=== generating bee wallpaper ==="
+mkdir -p /usr/share/bee
+
+python3 - <<'PYEOF'
+from PIL import Image, ImageDraw, ImageFont, ImageFilter
+import os
+
+W, H = 1920, 1080
+
+ASCII_ART = [
+    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
+    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
+    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
+    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
+    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
+    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
+]
+SUBTITLE = "  Hardware Audit LiveCD"
+
+FG = (0xF6, 0xD0, 0x47)
+FG_DIM = (0xD4, 0xA9, 0x1C)
+SHADOW = (0x5E, 0x47, 0x05)
+SUB = (0x96, 0x7A, 0x17)
+BG = (0x05, 0x05, 0x05)
+
+MONO_FONT_CANDIDATES = [
+    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
+    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
+]
+SUB_FONT_CANDIDATES = [
+    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
+    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
+]
+
+
+def load_font(candidates, size):
+    """Return the first installed TrueType font from candidates at the given size.
+
+    candidates is an ordered list of font file paths; the first one that
+    exists on disk wins. When none exists, Pillow's built-in default font is
+    returned instead (size is not applied in that case).
+    """
+    installed = next((p for p in candidates if os.path.exists(p)), None)
+    if installed is None:
+        return ImageFont.load_default()
+    return ImageFont.truetype(installed, size)
+
+
+def mono_metrics(font):
+    """Measure one character cell of font and return (char_w, char_h) in pixels.
+
+    char_w is the advance width of "M", rounded to the nearest integer.
+    char_h is the bounding-box height of the string "Mg".
+    """
+    # Scratch canvas: only needed to obtain a drawing context for measuring.
+    probe = Image.new('L', (W, H), 0)
+    draw = ImageDraw.Draw(probe)
+    char_w = int(round(draw.textlength("M", font=font)))
+    bb = draw.textbbox((0, 0), "Mg", font=font)
+    char_h = bb[3] - bb[1]
+    return char_w, char_h
+
+
+def render_ascii_mask(font, lines, char_w, char_h, line_gap):
+    """Rasterise lines onto a fixed character grid and return an 'L' mask.
+
+    Every non-space character is drawn on its own at (col * char_w,
+    row * (char_h + line_gap)), so column positions do not depend on the
+    font's own advance widths. The mask is as wide as the longest line and
+    tall enough for all rows plus the gaps between them.
+    """
+    cols = max(len(line) for line in lines)
+    rows = len(lines)
+    pitch = char_h + line_gap
+    mask = Image.new('L', (cols * char_w, rows * char_h + line_gap * (rows - 1)), 0)
+    pen = ImageDraw.Draw(mask)
+    for row, line in enumerate(lines):
+        for col, ch in enumerate(line):
+            if ch != ' ':
+                pen.text((col * char_w, row * pitch), ch, font=font, fill=255)
+    return mask
+
+
+# Base canvas filled with the background colour.
+img = Image.new('RGB', (W, H), BG)
+# Drawing context on the base canvas; below it is only used to measure the
+# subtitle, because img is rebound to a new image by the glow composite.
+draw = ImageDraw.Draw(img)
+
+# Soft amber glow under the logo without depending on font rendering.
+glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
+glow_draw = ImageDraw.Draw(glow)
+glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
+glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
+glow = glow.filter(ImageFilter.GaussianBlur(60))
+# img is RGBA from here until the final convert('RGB').
+img = Image.alpha_composite(img.convert('RGBA'), glow)
+
+# Render the ASCII logo into a mask and centre it horizontally.
+font_logo = load_font(MONO_FONT_CANDIDATES, 64)
+char_w, char_h = mono_metrics(font_logo)
+logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 8)
+logo_w, logo_h = logo_mask.size
+logo_x = (W - logo_w) // 2
+logo_y = 270
+
+# Layer the logo back to front: blurred shadow, dim offset copy, bright face.
+shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
+img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
+img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
+img.paste(FG, (logo_x, logo_y), logo_mask)
+
+# Subtitle: centred horizontally, 48 px below the logo, with a 2 px drop shadow.
+font_sub = load_font(SUB_FONT_CANDIDATES, 30)
+sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
+sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
+sub_y = logo_y + logo_h + 48
+# Re-create the drawing context so text lands on the composited image.
+draw = ImageDraw.Draw(img)
+draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
+draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
+
+# Drop the alpha channel before writing the PNG.
+img = img.convert('RGB')
+
+img.save('/usr/share/bee/wallpaper.png', optimize=True)
+print('wallpaper written: /usr/share/bee/wallpaper.png')
+PYEOF
+
+echo "=== wallpaper done ==="
--- a/iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
+++ b/iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
@@ -0,0 +1,41 @@
+#!/bin/sh
+# 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
+#
+# live-boot tries "losetup --replace --direct-io=on" when re-associating the
+# loop device to the RAM copy in /dev/shm.  tmpfs does not support O_DIRECT,
+# so the ioctl returns EINVAL and the verification step fails.
+#
+# This hook edits the live-boot script in place:
+#   1. the --direct-io option is removed from every "losetup --replace" call
+#      (it is simply dropped; there is no try-then-retry logic), and
+#   2. the "Task finished with error." message is reworded into a warning.
+set -e
+
+TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"
+
+# Nothing to patch when live-boot does not ship the script; not an error.
+if [ ! -f "${TORAM_SCRIPT}" ]; then
+    echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
+    exit 0
+fi
+
+echo "9010-fix-toram: patching ${TORAM_SCRIPT}"
+
+# 1. Strip --direct-io (bare, or with any =value) from the losetup --replace
+#    call so it works on tmpfs. One expression covers every spelling; running
+#    a "=on" pass followed by a bare "--direct-io" pass would rewrite
+#    "--direct-io=off" into the invalid "losetup --replace=off".
+sed -i 's/losetup --replace --direct-io\(=[a-z]*\)\{0,1\}/losetup --replace/g' "${TORAM_SCRIPT}"
+
+# 2. Reword the hard-error message into a warning.
+#    live-boot prints this exact string when verification fails.
+#    NOTE(review): this changes only the printed text; whether boot actually
+#    continues depends on what live-boot does after the echo — confirm.
+sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"
+
+echo "9010-fix-toram: patch applied"
+# Show the resulting losetup lines in the build log; best effort only.
+grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -60,9 +60,15 @@ qrencode
 # Local desktop (openbox + chromium kiosk)
 openbox
 tint2
+feh
+python3-pil
 xorg
 xterm
 chromium
+mousepad
+pcmanfm
+ristretto
+mupdf
 xserver-xorg-video-fbdev
 xserver-xorg-video-vesa
 lightdm
--- a/iso/builder/smoketest.sh
+++ b/iso/builder/smoketest.sh
@@ -27,6 +27,7 @@ echo ""
 KVER=$(uname -r)
 info "kernel: $KVER"
 NVIDIA_BOOT_MODE="normal"
+NVIDIA_MODULES_FLAVOR="proprietary"
 for arg in $(cat /proc/cmdline 2>/dev/null); do
    case "$arg" in
        bee.nvidia.mode=*)
@@ -34,7 +35,11 @@ for arg in $(cat /proc/cmdline 2>/dev/null); do
            ;;
    esac
 done
+if [ -f /etc/bee-nvidia-modules-flavor ]; then
+    NVIDIA_MODULES_FLAVOR="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null || echo proprietary)"
+fi
 info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
+info "nvidia modules flavor: ${NVIDIA_MODULES_FLAVOR}"

 # --- PATH & binaries ---
 echo "-- PATH & binaries --"
@@ -110,10 +115,12 @@ fi
 for mod in nvidia_modeset nvidia_uvm; do
    if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
        ok "module loaded: $mod"
-    elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
+    elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
        fail "module NOT loaded in normal mode: $mod"
-    else
+    elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
        warn "module not loaded in GSP-off mode: $mod"
+    else
+        fail "module NOT loaded: $mod"
    fi
 done

@@ -129,10 +136,12 @@ done

 if [ -e /dev/nvidia-uvm ]; then
    ok "/dev/nvidia-uvm exists"
-elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
+elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
    fail "/dev/nvidia-uvm missing in normal mode"
-else
+elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
    warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
+else
+    fail "/dev/nvidia-uvm missing"
 fi

 echo ""
--- a/iso/overlay/etc/systemd/system/bee-boot-status.service
+++ b/iso/overlay/etc/systemd/system/bee-boot-status.service
@@ -0,0 +1,18 @@
+# Runs /usr/local/bin/bee-boot-status on tty1 before the tty1 getty starts.
+# NOTE(review): no TimeoutStartSec= is set here; if the script never exits,
+# units ordered After= this one would wait on it — confirm the effective
+# start timeout for Type=oneshot is acceptable.
+[Unit]
+Description=Bee: boot status display
+After=systemd-user-sessions.service
+Before=getty@tty1.service
+
+[Service]
+Type=oneshot
+RemainAfterExit=no
+ExecStart=/usr/local/bin/bee-boot-status
+# Attach stdin/stdout/stderr to tty1 so the status screen is drawn there.
+TTYPath=/dev/tty1
+StandardInput=tty
+StandardOutput=tty
+StandardError=tty
+TTYReset=yes
+TTYVHangup=yes
+
+[Install]
+WantedBy=multi-user.target
--- a/iso/overlay/etc/systemd/system/getty@tty1.service.d/wait-bee.conf
+++ b/iso/overlay/etc/systemd/system/getty@tty1.service.d/wait-bee.conf
@@ -0,0 +1,2 @@
+# Drop-in for getty@tty1.service: order it after bee-boot-status.service.
+[Unit]
+After=bee-boot-status.service
--- a/iso/overlay/etc/systemd/system/lightdm.service.d/bee-display-mode.conf
+++ b/iso/overlay/etc/systemd/system/lightdm.service.d/bee-display-mode.conf
@@ -1,6 +1,4 @@
 [Unit]
-Wants=bee-preflight.service
-After=bee-preflight.service

 [Service]
 ExecStartPre=/usr/local/bin/bee-display-mode
--- a/iso/overlay/usr/local/bin/bee-boot-status
+++ b/iso/overlay/usr/local/bin/bee-boot-status
@@ -0,0 +1,89 @@
+#!/bin/sh
+# bee-boot-status — boot progress display on tty1.
+# Shows live service status until all bee services are done or failed,
+# then exits so getty can show the login prompt.
+
+CRITICAL="bee-preflight bee-nvidia bee-audit"
+ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
+
+# svc_state UNIT — print exactly one state word for UNIT.service.
+#
+# `systemctl is-active` prints the state AND exits non-zero for every state
+# other than "active". A `|| echo inactive` fallback therefore appends a
+# second line (e.g. "failed" + "inactive"), which no `case` pattern in this
+# script matches. Fall back to "inactive" only when systemctl printed nothing.
+svc_state() {
+    _state="$(systemctl is-active "$1.service" 2>/dev/null)" || true
+    printf '%s\n' "${_state:-inactive}"
+}
+
+# svc_icon UNIT — print a coloured, fixed-width status tag for UNIT
+# (green OK, red FAIL, yellow for transitional states, grey otherwise).
+# No trailing newline is printed.
+svc_icon() {
+    local unit_state
+    unit_state="$(svc_state "$1")"
+    case "$unit_state" in
+        active)
+            printf '\033[32m[  OK  ]\033[0m'
+            ;;
+        failed)
+            printf '\033[31m[ FAIL ]\033[0m'
+            ;;
+        activating)
+            printf '\033[33m[  ..  ]\033[0m'
+            ;;
+        deactivating)
+            printf '\033[33m[ stop ]\033[0m'
+            ;;
+        inactive)
+            printf '\033[90m[      ]\033[0m'
+            ;;
+        *)
+            printf '\033[90m[  ?   ]\033[0m'
+            ;;
+    esac
+}
+
+# svc_detail UNIT — print an optional one-line annotation for UNIT.
+#   failed:     the unit's Result property in red, unless it is empty or
+#               "success"
+#   activating: the unit's most recent journal line, cut to 55 characters
+#   otherwise:  nothing
+# No trailing newline is printed.
+svc_detail() {
+    local svc="$1" state
+    state="$(svc_state "$svc")"
+    case "$state" in
+        failed)
+            local res
+            # "Result=<value>" -> "<value>"
+            res="$(systemctl show -p Result "$svc.service" 2>/dev/null | cut -d= -f2)"
+            [ -n "$res" ] && [ "$res" != "success" ] && printf '  \033[31m(%s)\033[0m' "$res"
+            ;;
+        activating)
+            local line
+            line="$(journalctl -u "$svc.service" -n 1 --no-pager --output=cat 2>/dev/null | cut -c1-55)"
+            [ -n "$line" ] && printf '  \033[90m%s\033[0m' "$line"
+            ;;
+    esac
+}
+
+# all_critical_done — return 0 when every unit listed in $CRITICAL reports
+# active, failed or inactive; return 1 as soon as one reports anything else
+# (for example activating).
+# NOTE(review): "inactive" counts as done, so a unit that has not been
+# started yet is indistinguishable from one that already finished — confirm
+# the unit ordering guarantees the critical units are started first.
+all_critical_done() {
+    for svc in $CRITICAL; do
+        case "$(svc_state "$svc")" in
+            active|failed|inactive) ;;
+            *) return 1 ;;
+        esac
+    done
+    return 0
+}
+
+# Main loop: redraw the full status screen every 3 seconds until
+# all_critical_done succeeds, then show the "ready" banner once and exit.
+while true; do
+    # move to top-left and clear screen
+    printf '\033[H\033[2J'
+
+    printf '\n'
+    printf '  \033[33m███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗\033[0m\n'
+    printf '  \033[33m██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝\033[0m\n'
+    printf '  \033[33m█████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗\033[0m\n'
+    printf '  \033[33m██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝\033[0m\n'
+    printf '  \033[33m███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗\033[0m\n'
+    printf '  \033[33m╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝\033[0m\n'
+    printf '  Hardware Audit LiveCD\n'
+    printf '\n'
+
+    # One row per unit in $ALL: status tag, unit name, optional detail.
+    for svc in $ALL; do
+        printf '  %s  %-20s%s\n' "$(svc_icon "$svc")" "$svc" "$(svc_detail "$svc")"
+    done
+    printf '\n'
+
+    # Network
+    # One row per global-scope IPv4 address: the last field of the "inet"
+    # line (normally the interface label) followed by address/prefix.
+    ips="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{printf "  %-16s %s\n", $NF, $2}')"
+    if [ -n "$ips" ]; then
+        printf '  \033[1mNetwork:\033[0m\n'
+        printf '%s\n' "$ips"
+        printf '\n'
+    fi
+
+    if all_critical_done; then
+        printf '  \033[1;32mSystem ready.\033[0m  Audit is running in the background.\n'
+        # Advertise the web UI on the first global IPv4 address, prefix stripped.
+        first_ip="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{print $2}' | cut -d/ -f1 | head -1)"
+        if [ -n "$first_ip" ]; then
+            printf '  Web UI: \033[1mhttp://%s/\033[0m\n' "$first_ip"
+        fi
+        printf '\n'
+        # Keep the final screen visible briefly before the script exits.
+        sleep 3
+        break
+    fi
+
+    printf '  \033[90mStarting up...\033[0m\n'
+    sleep 3
+done
--- a/iso/overlay/usr/local/bin/bee-gpu-burn
+++ b/iso/overlay/usr/local/bin/bee-gpu-burn
@@ -62,6 +62,8 @@ done
 echo "loader=bee-gpu-burn"
 echo "selected_gpus=${FINAL}"

+export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+
 TMP_DIR=$(mktemp -d)
 trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM

@@ -78,7 +80,8 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
        fi
    fi
    echo "starting gpu ${id} size=${gpu_size_mb}MB"
-    "${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
+    CUDA_VISIBLE_DEVICES="${id}" \
+        "${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
 done
--- a/iso/overlay/usr/local/bin/bee-hpl
+++ b/iso/overlay/usr/local/bin/bee-hpl
@@ -0,0 +1,97 @@
+#!/bin/sh
+# bee-hpl — run HPL (High Performance LINPACK) with auto-sized problem.
+#
+# Generates HPL.dat based on total RAM (MemTotal in /proc/meminfo), runs xhpl,
+# and prints standard HPL output. The WR... line with Gflops is parsed by the
+# bee audit tool.
+#
+# Usage: bee-hpl [--mem-fraction 0.80] [--nb 256] [--seconds N]
+#
+# --mem-fraction   fraction of total RAM to use for the matrix (default 0.80);
+#                  must be a plain decimal number with 0 < value <= 1
+# --nb             block size; 256 is good for modern CPUs (default 256);
+#                  must be a positive integer
+# --seconds        ignored — HPL runtime is determined by problem size; kept
+#                  for interface compatibility with other bee stress tools
+#
+# Exit status: 2 on usage/argument errors, 1 when xhpl or MemTotal is
+# unavailable, otherwise the exit status of xhpl.
+
+set -eu
+
+XHPL="/usr/local/lib/bee/xhpl"
+MEM_FRACTION="0.80"
+NB=256
+
+usage() {
+    echo "usage: $0 [--mem-fraction 0.80] [--nb 256] [--seconds N]" >&2
+    exit 2
+}
+
+while [ "$#" -gt 0 ]; do
+    case "$1" in
+        --mem-fraction) [ "$#" -ge 2 ] || usage; MEM_FRACTION="$2"; shift 2 ;;
+        --nb)           [ "$#" -ge 2 ] || usage; NB="$2"; shift 2 ;;
+        --seconds)      [ "$#" -ge 2 ] || usage; shift 2 ;;  # accepted, ignored
+        *) usage ;;
+    esac
+done
+
+# Reject a block size that is not a positive integer: NB=0 would divide by
+# zero in the N computation below, and non-numeric text would silently
+# produce a meaningless problem size.
+case "${NB}" in
+    ''|*[!0-9]*) echo "ERROR: --nb must be a positive integer (got '${NB}')" >&2; exit 2 ;;
+esac
+[ "${NB}" -gt 0 ] || { echo "ERROR: --nb must be a positive integer (got '${NB}')" >&2; exit 2; }
+
+# Reject a memory fraction outside (0, 1]; anything above 1 asks for more than
+# the machine has, and 0 or text collapses the matrix to a single block.
+awk -v f="${MEM_FRACTION}" 'BEGIN { exit !(f ~ /^[0-9]*\.?[0-9]+$/ && f + 0 > 0 && f + 0 <= 1) }' || {
+    echo "ERROR: --mem-fraction must be a number in (0, 1] (got '${MEM_FRACTION}')" >&2
+    exit 2
+}
+
+[ -x "${XHPL}" ] || { echo "ERROR: xhpl not found at ${XHPL}" >&2; exit 1; }
+
+# Detect total RAM in bytes
+TOTAL_KB=$(grep MemTotal /proc/meminfo | awk '{print $2}')
+[ -n "${TOTAL_KB}" ] || { echo "ERROR: cannot read MemTotal from /proc/meminfo" >&2; exit 1; }
+TOTAL_BYTES=$(( TOTAL_KB * 1024 ))
+
+# N = floor(sqrt(fraction * total_bytes / 8)) rounded down to multiple of NB
+# (8 bytes per double-precision matrix element); never smaller than one block.
+# Use awk for floating-point sqrt
+N=$(awk -v total="${TOTAL_BYTES}" -v frac="${MEM_FRACTION}" -v nb="${NB}" '
+BEGIN {
+    raw = int(sqrt(total * frac / 8.0))
+    n   = int(raw / nb) * nb
+    if (n < nb) n = nb
+    print n
+}')
+
+echo "loader=bee-hpl"
+echo "total_ram_mb=$(( TOTAL_KB / 1024 ))"
+echo "matrix_n=${N}"
+echo "block_nb=${NB}"
+echo "mem_fraction=${MEM_FRACTION}"
+
+# Generate HPL.dat in a temp directory and run from there
+RUNDIR=$(mktemp -d)
+trap 'rm -rf "${RUNDIR}"' EXIT INT TERM
+
+# Single problem size, single block size, 1x1 process grid; output to stdout.
+cat > "${RUNDIR}/HPL.dat" <<DAT
+HPLinpack benchmark input file
+Innovative Computing Laboratory, University of Tennessee
+HPL.out        output file name (if any)
+6              device out (6=stdout, 7=stderr, file)
+1              # of problems sizes (N)
+${N}           Ns
+1              # of NBs
+${NB}          NBs
+0              PMAP process mapping (0=Row-,1=Column-major)
+1              # of process grids (P x Q)
+1              Ps
+1              Qs
+16.0           threshold
+1              # of panel fact
+2              PFACTs (0=left, 1=Crout, 2=Right)
+1              # of recursive stopping criterium
+4              NBMINs (>= 1)
+1              # of panels in recursion
+2              NDIVs
+1              # of recursive panel fact.
+1              RFACTs (0=left, 1=Crout, 2=Right)
+1              # of broadcast
+1              BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+1              # of lookahead depth
+1              DEPTHs (>=0)
+2              SWAP (0=bin-exch,1=long,2=mix)
+64             swapping threshold
+0              L1 in (0=transposed,1=no-transposed) form
+0              U  in (0=transposed,1=no-transposed) form
+1              Equilibration (0=no,1=yes)
+8              memory alignment in double (> 0)
+DAT
+
+# xhpl is run from RUNDIR, where HPL.dat was just written.
+cd "${RUNDIR}"
+echo "---"
+"${XHPL}"
--- a/iso/overlay/usr/local/bin/bee-john-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-john-gpu-stress
@@ -152,14 +152,19 @@ done

 [ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

+export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+export CUDA_VISIBLE_DEVICES="${FINAL}"
+
 JOHN_DEVICES=""
+local_id=1
 for id in $(echo "${FINAL}" | tr ',' ' '); do
-    opencl_id=$((id + 1))
+    opencl_id="${local_id}"
    if [ -z "${JOHN_DEVICES}" ]; then
        JOHN_DEVICES="${opencl_id}"
    else
        JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
    fi
+    local_id=$((local_id + 1))
 done

 echo "loader=john"
--- a/iso/overlay/usr/local/bin/bee-nccl-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-nccl-gpu-stress
@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
 echo "range=${MIN_BYTES}..${MAX_BYTES}"
 echo "iters=${ITERS}"

+export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+
 deadline=$(( $(date +%s) + SECONDS ))
 round=0

--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -6,6 +6,19 @@ NVIDIA_KO_DIR="/usr/local/lib/nvidia"

 log() { echo "[bee-nvidia] $*"; }

+# read_nvidia_modules_flavor — print the NVIDIA kernel-module flavor.
+# Reads /etc/bee-nvidia-modules-flavor with all whitespace removed and prints
+# it when it is exactly "open" or "proprietary"; prints "proprietary" when
+# the file is missing or holds anything else.
+read_nvidia_modules_flavor() {
+    _flavor_value=""
+    if [ -f /etc/bee-nvidia-modules-flavor ]; then
+        _flavor_value="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null)"
+    fi
+    case "$_flavor_value" in
+        open|proprietary)
+            echo "$_flavor_value"
+            ;;
+        *)
+            echo "proprietary"
+            ;;
+    esac
+}
+
 log "kernel: $(uname -r)"

 # Skip if no NVIDIA GPU present (PCI vendor 10de)
@@ -40,6 +53,8 @@ if [ -z "$nvidia_mode" ]; then
    nvidia_mode="normal"
 fi
 log "boot mode: $nvidia_mode"
+nvidia_modules_flavor="$(read_nvidia_modules_flavor)"
+log "modules flavor: $nvidia_modules_flavor"

 load_module() {
    mod="$1"
@@ -50,11 +65,93 @@ load_module() {
        log "WARN: not found: $ko"
        return 1
    fi
-    if insmod "$ko" "$@"; then
+    if timeout 90 insmod "$ko" "$@"; then
        log "loaded: $mod $*"
        return 0
    fi
-    log "WARN: failed to load: $mod"
+    log "WARN: failed to load: $mod (exit $?)"
+    dmesg | tail -n 10 | sed 's/^/  dmesg: /' || true
+    return 1
+}
+
+# nvidia_is_functional — succeed when /proc/devices has a line ending in
+# " nvidiactl"; used below as the signal that the nvidia module initialised.
+nvidia_is_functional() {
+    grep -q ' nvidiactl$' /proc/devices 2>/dev/null
+}
+
+# load_module_with_gsp_fallback — load nvidia.ko with GSP firmware enabled and
+# fall back to NVreg_EnableGpuFirmware=0 when initialisation does not complete
+# within 90 seconds.
+#
+# Records the outcome in /run/bee-nvidia-mode:
+#   gsp-on     loaded with GSP enabled
+#   gsp-off    loaded after retrying with GSP disabled
+#   gsp-stuck  timed out and the module could not be unloaded for the retry
+# Returns 0 when nvidiactl is registered in /proc/devices, 1 otherwise.
+load_module_with_gsp_fallback() {
+    ko="$NVIDIA_KO_DIR/nvidia.ko"
+    if [ ! -f "$ko" ]; then
+        log "ERROR: not found: $ko"
+        return 1
+    fi
+
+    # Run insmod in background — on some converted SXM→PCIe cards GSP enters an
+    # infinite crash/reload loop and insmod never returns. We check for successful
+    # initialization by polling /proc/devices for nvidiactl instead of waiting for
+    # insmod to exit.
+    log "loading nvidia (GSP enabled, timeout 90s)"
+    insmod "$ko" &
+    _insmod_pid=$!
+
+    _waited=0
+    while [ $_waited -lt 90 ]; do
+        if nvidia_is_functional; then
+            log "loaded: nvidia (GSP enabled, ${_waited}s)"
+            echo "gsp-on" > /run/bee-nvidia-mode
+            return 0
+        fi
+        # Check if insmod exited with an error before timeout
+        if ! kill -0 "$_insmod_pid" 2>/dev/null; then
+            # Collect the exit status without tripping errexit, should the
+            # calling script have it enabled.
+            _rc=0
+            wait "$_insmod_pid" || _rc=$?
+            if [ $_rc -ne 0 ]; then
+                log "nvidia load failed (exit $_rc)"
+                dmesg | tail -n 10 | sed 's/^/  dmesg: /' || true
+                return 1
+            fi
+            # insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
+            sleep 2
+            if nvidia_is_functional; then
+                log "loaded: nvidia (GSP enabled, ${_waited}s)"
+                # Record the mode here too, like the polling success path above.
+                echo "gsp-on" > /run/bee-nvidia-mode
+                return 0
+            fi
+            log "insmod exited 0 but nvidiactl missing — treating as failure"
+            return 1
+        fi
+        sleep 1
+        _waited=$((_waited + 1))
+    done
+
+    # GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
+    log "nvidia GSP init timed out after 90s"
+    kill "$_insmod_pid" 2>/dev/null || true
+    wait "$_insmod_pid" 2>/dev/null || true
+
+    # Attempt to unload the partially-initialized module
+    if ! rmmod nvidia 2>/dev/null; then
+        # Module is stuck in the kernel — cannot reload with different params.
+        # User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
+        log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
+        log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
+        echo "gsp-stuck" > /run/bee-nvidia-mode
+        return 1
+    fi
+
+    sleep 2
+    log "retrying with NVreg_EnableGpuFirmware=0"
+    log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"
+
+    if insmod "$ko" NVreg_EnableGpuFirmware=0; then
+        if nvidia_is_functional; then
+            log "loaded: nvidia (GSP disabled)"
+            echo "gsp-off" > /run/bee-nvidia-mode
+            return 0
+        fi
+        log "insmod gsp-off exited 0 but nvidiactl missing"
+        return 1
+    fi
+
+    log "nvidia load failed (GSP=off)"
    dmesg | tail -n 10 | sed 's/^/  dmesg: /' || true
    return 1
 }
@@ -68,37 +165,54 @@ load_host_module() {
    return 1
 }

-case "$nvidia_mode" in
-    normal|full)
-        if ! load_module nvidia; then
-            exit 1
-        fi
-        # nvidia-modeset on some server kernels needs ACPI video helper symbols
-        # exported by the generic "video" module. Best-effort only; compute paths
-        # remain functional even if display-related modules stay absent.
-        load_host_module video || true
-        load_module nvidia-modeset || true
-        load_module nvidia-uvm || true
-        ;;
-    gsp-off|safe)
-        # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
-        # be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
-        # conservative path for platforms where full boot-time GSP init is unstable.
-        if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
-            exit 1
-        fi
-        log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
-        ;;
-    nomsi|*)
-        # nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
-        # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
-        # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
-        if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
-            exit 1
-        fi
-        log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
-        ;;
-esac
+if [ "$nvidia_modules_flavor" = "open" ]; then
+    case "$nvidia_mode" in
+        gsp-off|safe|nomsi)
+            log "ignoring boot mode ${nvidia_mode} for open NVIDIA modules"
+            ;;
+    esac
+    if ! load_module nvidia; then
+        exit 1
+    fi
+    # nvidia-modeset on some server kernels needs ACPI video helper symbols
+    # exported by the generic "video" module. Best-effort only; compute paths
+    # remain functional even if display-related modules stay absent.
+    load_host_module video || true
+    load_module nvidia-modeset || true
+    load_module nvidia-uvm || true
+else
+    case "$nvidia_mode" in
+        normal|full)
+            if ! load_module_with_gsp_fallback; then
+                exit 1
+            fi
+            # nvidia-modeset on some server kernels needs ACPI video helper symbols
+            # exported by the generic "video" module. Best-effort only; compute paths
+            # remain functional even if display-related modules stay absent.
+            load_host_module video || true
+            load_module nvidia-modeset || true
+            load_module nvidia-uvm || true
+            ;;
+        gsp-off|safe)
+            # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
+            # be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
+            # conservative path for platforms where full boot-time GSP init is unstable.
+            if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
+                exit 1
+            fi
+            log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
+            ;;
+        nomsi|*)
+            # nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
+            # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
+            # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
+            if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
+                exit 1
+            fi
+            log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
+            ;;
+    esac
+fi

 # Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
 nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
@@ -127,6 +241,18 @@ fi
 ldconfig 2>/dev/null || true
 log "ldconfig refreshed"

+# Keep persistence mode enabled across the session so dcgmi / stress tools do
+# not fail with deployment warnings on otherwise healthy GPUs.
+if command -v nvidia-smi >/dev/null 2>&1; then
+    if nvidia-smi -pm 1 >/dev/null 2>&1; then
+        log "enabled NVIDIA persistence mode"
+    else
+        log "WARN: failed to enable NVIDIA persistence mode"
+    fi
+else
+    log "WARN: nvidia-smi not found — cannot enable persistence mode"
+fi
+
 # Start DCGM host engine so dcgmi can discover GPUs.
 # nv-hostengine must run after the NVIDIA modules and device nodes are ready.
 # If it started too early (for example via systemd before bee-nvidia-load), it can
--- a/iso/overlay/usr/local/bin/bee-openbox-session
+++ b/iso/overlay/usr/local/bin/bee-openbox-session
@@ -7,16 +7,24 @@ xset s off
 xset -dpms
 xset s noblank

+# Set desktop background.
+if [ -f /usr/share/bee/wallpaper.png ]; then
+    feh --bg-fill /usr/share/bee/wallpaper.png
+else
+    xsetroot -solid '#f6c90e'
+fi
+
 tint2 &

-# Wait up to 120s for bee-web to bind. The web server starts immediately now
-# (audit is deferred), so this should succeed in a few seconds on most hardware.
-i=0
-while [ $i -lt 120 ]; do
-    if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
+# Wait up to 60s for bee-web before opening Chromium.
+# Without this Chromium gets connection-refused and shows a blank page.
+_i=0
+while [ $_i -lt 60 ]; do
+    curl -sf http://localhost/healthz >/dev/null 2>&1 && break
    sleep 1
-    i=$((i+1))
+    _i=$((_i+1))
 done
+unset _i

 chromium \
    --disable-infobars \
@@ -24,7 +32,8 @@ chromium \
    --no-first-run \
    --disable-session-crashed-bubble \
    --disable-features=TranslateUI \
+    --user-data-dir=/tmp/bee-chrome \
    --start-maximized \
-    http://localhost/ &
+    http://localhost/loading &

 exec openbox
Author	SHA1	Message	Date
Mikhail Chusavitin	3f41a026ca	Add resilient HPL source fallbacks	2026-04-08 09:25:31 +03:00
Mikhail Chusavitin	0ee4f46537	Restore MOTD-style ASCII wallpaper	2026-04-08 09:14:27 +03:00
Michael Chus	8db40b098a	Update bible submodule Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 07:14:31 +03:00
Michael Chus	16e7ae00e7	Add HPL (LINPACK) benchmark as validate/stress task HPL 2.3 from netlib compiled against OpenBLAS with a minimal single-process MPI stub — no MPI package required in the ISO. Matrix size is auto-sized to 80% of total RAM at runtime. Build: - VERSIONS: HPL_VERSION=2.3, HPL_SHA256=32c5c17d… - build-hpl.sh: downloads HPL + OpenBLAS from Debian 12 repo, compiles xhpl with a self-contained mpi_stub.c - build.sh: step 80-hpl, injects xhpl + libopenblas into overlay Runtime: - bee-hpl: generates HPL.dat (N auto from /proc/meminfo, NB=256, P=1 Q=1), runs xhpl, prints standard WR... Gflops output - platform/hpl.go: RunHPL(), parses WR line → GFlops + PASSED/FAILED - tasks.go: target "hpl" - pages.go: LINPACK (HPL) card in validate/stress grid (stress-only) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 07:08:18 +03:00
Michael Chus	b2f8626fee	Refactor validate modes, fix benchmark report and IPMI power - Replace diag level 1-4 dropdown with Validate/Stress radio buttons - Validate: dcgmi L2, 60s CPU, 256MB/1p memtester, SMART short - Stress: dcgmi L3 + targeted_stress in Run All, 30min CPU, 1GB/3p memtester, SMART long/NVMe extended - Parallel GPU mode: spawn single task for all GPUs instead of splitting per model - Benchmark table: per-GPU columns for sequential runs, server-wide column for parallel - Benchmark report converted to Markdown with server model, GPU model, version in header; only steady-state charts - Fix IPMI power parsing in benchmark (was looking for 'Current Power', correct field is 'Instantaneous power reading') Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:42:12 +03:00
Michael Chus	dd26e03b2d	Add multi-GPU selector option for system-level tests Adds a "Multi-GPU tests — use all GPUs" checkbox to the NVIDIA GPU selector (checked by default). When enabled, PSU Pulse, NCCL, and NVBandwidth tests run on every GPU in the system regardless of the per-GPU selection above — which is required for correct PSU stress testing (synchronous pulses across all GPUs create worst-case transients). When unchecked, only the manually selected GPUs are used. The same logic applies both to Run All (expandSATTarget) and to the individual Run button on each multi-GPU test card. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:25:12 +03:00
Michael Chus	6937a4c6ec	Fix pulse_test: run all GPUs simultaneously, not per-GPU pulse_test is a PSU/power-delivery test, not a per-GPU compute test. Its purpose is to synchronously pulse all GPUs between idle and full load to create worst-case transient spikes on the power supply. Running it one GPU at a time would produce a fraction of the PSU load and miss any PSU-level failures. - Move nvidia-pulse from nvidiaPerGPUTargets to nvidiaAllGPUTargets (same dispatch path as NCCL and NVBandwidth) - Change card onclick to runNvidiaFabricValidate (all selected GPUs at once) - Update card title to "NVIDIA PSU Pulse Test" and description to explain why synchronous multi-GPU execution is required Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:19:11 +03:00
Michael Chus	b9be93c213	Move NCCL interconnect and NVBandwidth tests to validate/stress nvidia-interconnect (NCCL all_reduce_perf) and nvidia-bandwidth (NVBandwidth) verify fabric connectivity and bandwidth — they are not sustained burn loads. Move both from the Burn section to the Validate section under the stress-mode toggle, alongside the other DCGM diagnostic tests moved in the previous commit. - Add sat-card-nvidia-interconnect and sat-card-nvidia-bandwidth validate cards (stress-only, all selected GPUs at once) - Add runNvidiaFabricValidate() for all-GPU-at-once dispatch - Add nvidiaAllGPUTargets handling in expandSATTarget/runAllSAT - Remove Interconnect / Bandwidth card from Burn section - Remove nvidia-interconnect and nvidia-bandwidth from runAllBurnTasks and the gpu/tools availability map Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:16:42 +03:00
Michael Chus	d1a22d782d	Move power diag tests to validate/stress; fix GPU burn power saturation - bee-gpu-stress.c: remove per-wave cuCtxSynchronize barrier in both cuBLASLt and PTX hot loops; sync at most once/sec so the GPU queue stays continuously full — eliminates the CPU↔GPU ping-pong that prevented reaching full TDP - sat_fan_stress.go: default SizeMB 0 (auto = 95% VRAM) instead of hardcoded 64 MB; tiny matrices caused <0.1 ms kernels where CPU re-queue overhead dominated - pages.go: move nvidia-targeted-power and nvidia-pulse from Burn → Validate stress section alongside nvidia-targeted-stress; these are DCGM pass/fail diagnostics, not sustained burn loads; remove the Power Delivery / Power Budget card from Burn entirely Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:13:52 +03:00
Mikhail Chusavitin	0a4bb596f6	Improve install-to-RAM verification for ISO boots	2026-04-07 20:21:06 +03:00
Mikhail Chusavitin	531d1ca366	Add NVIDIA self-heal tools and per-GPU SAT status	2026-04-07 20:20:05 +03:00
Mikhail Chusavitin	93cfa78e8c	Benchmark: parallel GPU mode, resilient inventory query, server model in results - Add parallel GPU mode (checkbox, off by default): runs all selected GPUs simultaneously via a single bee-gpu-burn invocation instead of sequentially; per-GPU telemetry, throttle counters, TOPS, and scoring are preserved - Make queryBenchmarkGPUInfo resilient: falls back to a base field set when extended fields (attribute.multiprocessor_count, power.default_limit) cause exit status 2, preventing lgc normalization from being silently skipped - Log explicit "graphics clock lock skipped" note when inventory is unavailable - Collect server model from DMI (/sys/class/dmi/id/product_name) and store in result JSON; benchmark history columns now show "Server Model (N× GPU Model)" grouped by server+GPU type rather than individual GPU index Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-07 18:32:15 +03:00
Mikhail Chusavitin	1358485f2b	fix logo wallpaper	2026-04-07 10:15:38 +03:00
Michael Chus	8fe20ba678	Fix benchmark scoring: PowerSustain uses default power limit PowerSustainScore now uses DefaultPowerLimitW as reference so a manually reduced power limit does not inflate the score. Falls back to enforced limit if default is unavailable. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 22:30:59 +03:00
Michael Chus	d973231f37	Enhance benchmark: server power via IPMI, efficiency metrics, FP64, power limit check - Sample server power (IPMI dcmi) during baseline+steady phases in parallel; compute delta vs GPU-reported sum; flag ratio < 0.75 as unreliable reporting - Collect base_graphics_clock_mhz, multiprocessor_count, default_power_limit_w from nvidia-smi alongside existing GPU info - Add tops_per_sm_per_ghz efficiency metric (model-agnostic silicon quality signal) - Flag when enforced power limit is below default TDP by >5% - Add fp64 profile to bee-gpu-burn worker (CUDA_R_64F, CUBLAS_COMPUTE_64F, min cc 8.0) - Improve Executive Summary: overall pass count, FAILED GPU finding - Throttle counters now shown as % of steady window instead of raw microseconds - bible-local: clock calibration research, H100/H200 spec, real-world GEMM baselines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 22:26:52 +03:00
Michael Chus	f5d175f488	Fix toram: patch live-boot to not use O_DIRECT when replacing loop to tmpfs losetup --replace --direct-io=on fails with EINVAL when the target file is on tmpfs (/dev/shm), because tmpfs does not support O_DIRECT. Strip the --direct-io flag from the replace call and downgrade the verification failure to a warning so boot continues. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 21:06:21 +03:00
Michael Chus	fa00667750	Refactor NVIDIA GPU Selection into standalone card on validate page Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 21:06:16 +03:00
Mikhail Chusavitin	c7d2816a7f	Limit NVIDIA legacy boot hooks to proprietary ISO	2026-04-06 16:33:16 +03:00
Mikhail Chusavitin	d2eadedff2	Default NVIDIA ISO to open modules and add nvidia-legacy	2026-04-06 16:27:13 +03:00
Mikhail Chusavitin	a98c4d7461	Include terminal charts in benchmark report	2026-04-06 12:34:57 +03:00
Mikhail Chusavitin	2354ae367d	Normalize task IDs and artifact folder prefixes	2026-04-06 12:26:47 +03:00
Mikhail Chusavitin	0d0e1f55a7	Avoid misleading SAT summaries after task cancellation	2026-04-06 12:24:19 +03:00
Mikhail Chusavitin	35f4c53887	Stabilize NVIDIA GPU device mapping across loaders	2026-04-06 12:22:04 +03:00
Mikhail Chusavitin	981315e6fd	Split NVIDIA tasks by homogeneous GPU groups	2026-04-06 11:58:13 +03:00
Mikhail Chusavitin	fc5c100a29	Fix NVIDIA persistence mode and add benchmark results table	2026-04-06 10:47:07 +03:00
Michael Chus	6e94216f3b	Hide task charts while pending	2026-04-05 22:34:34 +03:00
Michael Chus	53455063b9	Stabilize live task detail page	2026-04-05 22:14:52 +03:00
Michael Chus	4602f97836	Enforce sequential task orchestration	2026-04-05 22:10:42 +03:00
Michael Chus	c65d3ae3b1	Add nomodeset to default GRUB entry — fix black screen on headless servers Servers with NVIDIA compute GPUs (H100 etc.) have no display output, so KMS blanks the console. nomodeset disables kernel modesetting and lets the NVIDIA proprietary driver handle display via Xorg. KMS variant moved to advanced submenu for cases where it is needed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 21:40:47 +03:00
Michael Chus	7a21c370e4	Handle NVIDIA GSP firmware init hang with timeout fallback - bee-nvidia-load: run insmod in background, poll /proc/devices for nvidiactl; if GSP init doesn't complete in 90s, kill insmod and retry with NVreg_EnableGpuFirmware=0. Handles EBUSY case with clear error. - Write /run/bee-nvidia-mode (gsp-on/gsp-off/gsp-stuck) for audit layer - Show GSP mode badge in sidebar: yellow for gsp-off, red for gsp-stuck - Report NvidiaGSPMode in RuntimeHealth with issue entries - Simplify GRUB menu: default (KMS+GSP), advanced submenu (GSP=off, nomodeset, fail-safe), remove load-to-RAM entry - Add pcmanfm, ristretto, mupdf, mousepad to desktop packages Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 21:00:43 +03:00
Michael Chus	a493e3ab5b	Fix service control buttons: sudo, real error output, UX feedback - services.go: use sudo systemctl so bee user can control system services - api.go: always return 200 with output field even on error, so the frontend shows the actual systemctl message instead of "exit status 1" - pages.go: button shows "..." while pending then restores label; output panel is full-width under the table with ✓/✗ status indicator; output auto-scrolls to bottom Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 20:25:41 +03:00
Michael Chus	19b4803ec7	Pass exact cycle duration to GPU stress instead of 86400s sentinel bee-gpu-burn now receives --seconds <LoadSec> so it exits naturally when the cycle ends, rather than relying solely on context cancellation to kill it. Process group kill (Setpgid+Cancel) is kept as a safety net for early cancellation (user stop, context timeout). Same fix for AMD RVS which now gets duration_ms = LoadSec * 1000. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 20:22:43 +03:00
Michael Chus	1bdfb1e9ca	Fix nvidia-targeted-stress failing with DCGM_ST_IN_USE (-34) nvvs (DCGM validation suite) survives when dcgmi is killed mid-run, leaving the GPU occupied. The next dcgmi diag invocation then fails with "affected resource is in use". Two-part fix: - Add nvvs and dcgmi to KillTestWorkers patterns so they are cleaned up by the global cancel handler - Call KillTestWorkers at the start of RunNvidiaTargetedStressValidatePack to clear any stale processes before dcgmi diag runs Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 20:21:36 +03:00
Michael Chus	c5d6b30177	Fix platform thermal cycling leaving GPU load running after test ends bee-gpu-burn is a shell script that spawns bee-gpu-burn-worker children. exec.CommandContext default cancel only kills the shell parent; the worker processes survive and keep loading the GPU indefinitely. Fix: set Setpgid=true and a custom Cancel that sends SIGKILL to the entire process group (-pid), same pattern already used in runSATCommandCtx. Applied to Nvidia, AMD, and CPU stress commands for consistency. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 20:19:20 +03:00
Michael Chus	5b9015451e	Add live task charts and fix USB export actions	2026-04-05 20:14:23 +03:00
Michael Chus	d1a6863ceb	Use amber fallback wallpaper color (#f6c90e) instead of black Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:30:41 +03:00
Michael Chus	f9aa05de8e	Add wallpaper: black background with amber EASY-BEE ASCII art logo - Add feh and python3-pil to package list - Add chroot hook that generates /usr/share/bee/wallpaper.png using PIL: black background, EASY-BEE box-drawing logo in amber (#f6c90e), "Hardware Audit LiveCD" subtitle in dim amber — matches motd exactly - bee-openbox-session: set wallpaper with feh --bg-fill, fall back to xsetroot -solid black if wallpaper not found Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:29:42 +03:00
Michael Chus	a9ccea8cca	Fix black desktop and Chromium blank page on startup - Set xsetroot solid background (#12100a, dark amber) so openbox doesn't show bare black before Chromium opens - Re-add healthz wait loop before launching Chromium: without it Chromium opens localhost/loading before bee-web is up and gets connection-refused which renders as a blank white page Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:25:32 +03:00
Michael Chus	fc5c985fb5	Reset tty1 properly when bee-boot-status exits Add TTYReset=yes and TTYVHangup=yes so systemd restores the terminal to a clean state before handing tty1 to getty. Without this the screen went black with no cursor after the status display finished. Also remove DefaultDependencies=no which was too aggressive. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:22:01 +03:00
Michael Chus	5eb3baddb4	Fix bee-boot-status blank screen caused by variable buffering Command substitution in sh strips trailing newlines, so accumulating output in a variable via $(...) lost all line breaks. Reverted to direct printf calls which work correctly. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:21:10 +03:00
Michael Chus	a6ac13b5d3	Improve bee-boot-status: slower refresh, more detail - Refresh every 3s instead of 1s to reduce flicker - Show ssh, bee-sshsetup in service list - Show failure reason for failed services - Show last journal line for activating services - Show IP addresses and web UI URL when network is up - Render frame to variable before printing to reduce flicker Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:20:07 +03:00
Michael Chus	4003cb7676	Lower kernel console loglevel to 3 to reduce boot noise loglevel=6 floods the screen with mpt3sas/scsi/sd informational messages, hiding systemd service status and bee-boot-status display. loglevel=3 shows only kernel errors; all messages still go to serial. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:19:09 +03:00
Michael Chus	2875313ba0	Improve boot UX: status display, faster GUI, loading spinner - Add bee-boot-status service: shows live service status on tty1 with ASCII logo before getty, exits when all bee services settle - Remove lightdm dependency on bee-preflight so GUI starts immediately without waiting for NVIDIA driver load - Replace Chromium blank-page problem with /loading spinner page that polls /api/services and auto-redirects when services are ready; add "Open app now" override button; use fresh --user-data-dir=/tmp/bee-chrome - Unify branding: add "Hardware Audit LiveCD" subtitle to GRUB menu, bee-boot-status (with yellow ASCII logo), and web spinner Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 18:58:24 +03:00
Michael Chus	f1621efee4	Mirror task lifecycle to serial console	2026-04-05 18:34:06 +03:00
Michael Chus	4461249cc3	Make memory stress size follow available RAM	2026-04-05 18:33:26 +03:00
Michael Chus	e609fbbc26	Add task reports and streamline GPU charts	2026-04-05 18:13:58 +03:00
Michael Chus	cc2b49ea41	Improve validate GPU runs and web UI feedback	2026-04-05 17:50:13 +03:00
Michael Chus	33e0a5bef2	Refine validate UI and runtime health table	2026-04-05 16:24:45 +03:00