Improve install-to-RAM verification for ISO boots

Add NVIDIA self-heal tools and per-GPU SAT status
Benchmark: parallel GPU mode, resilient inventory query, server model in results
2026-04-07 20:21:06 +03:00 · 2026-04-07 20:20:05 +03:00 · 2026-04-07 18:32:15 +03:00 · 2026-04-07 10:15:38 +03:00 · 2026-04-06 22:30:59 +03:00 · 2026-04-06 22:26:52 +03:00
26 changed files with 2016 additions and 158 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -122,6 +122,8 @@ type satRunner interface {
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
 	ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
 	ResetNvidiaGPU(index int) (string, error)
 	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
@@ -521,6 +523,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return a.sat.ListNvidiaGPUs()
 }
 // ListNvidiaGPUStatuses returns the per-GPU status list reported by the
 // configured SAT runner, passing its result and error through unchanged.
 func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
 	return a.sat.ListNvidiaGPUStatuses()
 }
 // ResetNvidiaGPU asks the SAT runner to reset the GPU at index and wraps the
 // whitespace-trimmed runner output in an ActionResult. The runner's error is
 // returned alongside the result, so the body is populated even on failure.
 func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
 	output, resetErr := a.sat.ResetNvidiaGPU(index)
 	result := ActionResult{
 		Title: fmt.Sprintf("Reset NVIDIA GPU %d", index),
 		Body:  strings.TrimSpace(output),
 	}
 	return result, resetErr
 }
 func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -135,6 +135,8 @@ type fakeSAT struct {
 	listAMDGPUsFn             func() ([]platform.AMDGPUInfo, error)
 	runAMDPackFn              func(string) (string, error)
 	listNvidiaGPUsFn          func() ([]platform.NvidiaGPU, error)
 	listNvidiaGPUStatusesFn   func() ([]platform.NvidiaGPUStatus, error)
 	resetNvidiaGPUFn          func(int) (string, error)
 }
 func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -201,6 +203,20 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	return nil, nil
 }
 // ListNvidiaGPUStatuses delegates to listNvidiaGPUStatusesFn when the test has
 // set one; otherwise it reports no statuses and no error.
 func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
 	if f.listNvidiaGPUStatusesFn == nil {
 		return nil, nil
 	}
 	return f.listNvidiaGPUStatusesFn()
 }
 // ResetNvidiaGPU delegates to resetNvidiaGPUFn when the test has set one;
 // otherwise it returns empty output and no error.
 func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
 	if f.resetNvidiaGPUFn == nil {
 		return "", nil
 	}
 	return f.resetNvidiaGPUFn(index)
 }
 // RunMemoryAcceptancePack forwards baseDir to runMemoryFn; the context and the
 // log callback are ignored. There is no nil check, so runMemoryFn must be set
 // by any test that reaches this method.
 func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
 	return f.runMemoryFn(baseDir)
 }
@@ -805,6 +821,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	for _, want := range []string{
 		"/system/ip-link.txt",
 		"/system/ip-link-stats.txt",
 		"/system/kernel-aer-nvidia.txt",
 		"/system/lspci-nvidia-bridges-vv.txt",
 		"/system/pcie-aer-sysfs.txt",
 		"/system/ethtool-info.txt",
 		"/system/ethtool-link.txt",
 		"/system/ethtool-module.txt",
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -3,6 +3,7 @@ package app
 import (
 	"os"
 	"path/filepath"
 	"strconv"
 	"sort"
 	"strings"
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
 	}
 	if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
 		applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
 		applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
 	}
 	if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
 		applyMemorySAT(snap.Memory, summary)
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
 	applyComponentStatusDB(snap, db)
 }
 // nvidiaPerGPUStatus holds the run_status and reason values read from one
 // per-GPU status file of an NVIDIA SAT run.
 type nvidiaPerGPUStatus struct {
 	runStatus string
 	reason    string
 }
 // applyNvidiaPerGPUStatus overlays the latest per-GPU NVIDIA SAT results onto
 // the PCIe devices whose telemetry carries a matching "nvidia_gpu_index".
 // Devices without that telemetry key, without a parsable index, or without a
 // status entry for their index are left untouched.
 func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
 	perGPU, runAt, found := loadLatestNvidiaPerGPUStatus(baseDir)
 	if !found {
 		return
 	}
 	for i := range devs {
 		dev := &devs[i]
 		// Reading from a nil map is safe and simply reports a missing key.
 		rawIndex, present := dev.Telemetry["nvidia_gpu_index"]
 		if !present {
 			continue
 		}
 		gpuIndex, valid := telemetryInt(rawIndex)
 		if !valid {
 			continue
 		}
 		entry, known := perGPU[gpuIndex]
 		if !known {
 			continue
 		}
 		// Fall back to a generic label when the status file gave no reason.
 		detail := firstNonEmpty(strings.TrimSpace(entry.reason), "nvidia GPU SAT")
 		status, description, mapped := satKeyStatus(entry.runStatus, detail)
 		if !mapped {
 			continue
 		}
 		mergeComponentStatusPreferDetail(&dev.HardwareComponentStatus, runAt, status, description)
 	}
 }
 // loadLatestNvidiaPerGPUStatus reads the per-GPU status files
 // (gpu-*-status.txt) from the most recent gpu-nvidia-* run directory under
 // baseDir.
 //
 // It returns the statuses keyed by each file's gpu_index value, the
 // run_at_utc value from the run's summary.txt (possibly empty), and whether at
 // least one status was loaded. Files that cannot be read or that lack a
 // numeric gpu_index are skipped.
 func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
 	matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
 	if err != nil || len(matches) == 0 {
 		return nil, "", false
 	}
 	sort.Strings(matches)
 	// The pattern also matches plain files that share the run prefix (for
 	// example an archive written next to the run directory). Such a name can
 	// sort after the directory itself, so take the newest match that is
 	// actually a directory instead of blindly using the last entry.
 	runDir := ""
 	for i := len(matches) - 1; i >= 0; i-- {
 		if fi, statErr := os.Stat(matches[i]); statErr == nil && fi.IsDir() {
 			runDir = matches[i]
 			break
 		}
 	}
 	if runDir == "" {
 		return nil, "", false
 	}
 	summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
 	if err != nil {
 		return nil, "", false
 	}
 	summaryKV := parseKeyValueSummary(string(summaryRaw))
 	runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
 	files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
 	if err != nil || len(files) == 0 {
 		return nil, "", false
 	}
 	out := make(map[int]nvidiaPerGPUStatus, len(files))
 	for _, file := range files {
 		raw, err := os.ReadFile(file)
 		if err != nil {
 			continue
 		}
 		kv := parseKeyValueSummary(string(raw))
 		idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
 		if err != nil {
 			continue
 		}
 		out[idx] = nvidiaPerGPUStatus{
 			// Upper-case so later status matching is case-insensitive.
 			runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
 			reason:    strings.TrimSpace(kv["reason"]),
 		}
 	}
 	if len(out) == 0 {
 		return nil, "", false
 	}
 	return out, runAtUTC, true
 }
 // telemetryInt converts a loosely typed telemetry value to an int.
 //
 // Accepted inputs are int, int32, int64, float64 holding an exact integer
 // (the usual shape after a JSON round trip), and strings containing a decimal
 // integer with optional surrounding whitespace. Anything else — including
 // fractional, NaN, or infinite floats — yields (0, false) so a malformed
 // value can never be mistaken for a valid GPU index.
 func telemetryInt(v any) (int, bool) {
 	switch value := v.(type) {
 	case int:
 		return value, true
 	case int32:
 		return int(value), true
 	case int64:
 		return int(value), true
 	case float64:
 		n := int(value)
 		// Only accept floats that survive the round trip unchanged; this
 		// rejects fractions, NaN (never equal to anything) and infinities.
 		if float64(n) != value {
 			return 0, false
 		}
 		return n, true
 	case string:
 		n, err := strconv.Atoi(strings.TrimSpace(value))
 		if err != nil {
 			return 0, false
 		}
 		return n, true
 	default:
 		return 0, false
 	}
 }
 type satSummary struct {
 	runAtUTC string
 	overall  string
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
 	}
 }
 // mergeComponentStatusPreferDetail applies a SAT status to component.
 //
 // When the component has no status, is "Unknown", or the SAT status is more
 // severe, it defers to mergeComponentStatus. When both have the same severity
 // and a non-blank description is supplied, the status and error description
 // are overwritten so the more detailed text wins; a history entry and
 // StatusChangedAt are recorded only if changedAt is non-blank. A less severe
 // SAT status changes nothing.
 func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
 	if component == nil || satStatus == "" {
 		return
 	}
 	existing := strings.TrimSpace(ptrString(component.Status))
 	incomingRank, existingRank := statusSeverity(satStatus), statusSeverity(existing)
 	switch {
 	case existing == "" || existing == "Unknown" || incomingRank > existingRank:
 		mergeComponentStatus(component, changedAt, satStatus, description)
 	case incomingRank == existingRank && strings.TrimSpace(description) != "":
 		component.Status = appStringPtr(satStatus)
 		component.ErrorDescription = appStringPtr(description)
 		if strings.TrimSpace(changedAt) == "" {
 			return
 		}
 		component.StatusChangedAt = appStringPtr(changedAt)
 		entry := schema.HardwareStatusHistory{
 			Status:    satStatus,
 			ChangedAt: changedAt,
 			Details:   appStringPtr(description),
 		}
 		component.StatusHistory = append(component.StatusHistory, entry)
 	}
 }
 func statusSeverity(status string) int {
 	switch strings.TrimSpace(status) {
 	case "Critical":
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
 		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
 	}
 }
 // TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile verifies that a
 // per-GPU status file in the latest gpu-nvidia-* run directory is applied to
 // the PCIe device whose telemetry carries the matching nvidia_gpu_index.
 func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
 	// Lay out a single failed run containing a status file for GPU 1 only.
 	baseDir := t.TempDir()
 	runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
 	if err := os.MkdirAll(runDir, 0755); err != nil {
 		t.Fatal(err)
 	}
 	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
 		t.Fatal(err)
 	}
 	if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
 		t.Fatal(err)
 	}
 	// Two NVIDIA devices that differ only by BDF and reported GPU index.
 	class := "VideoController"
 	manufacturer := "NVIDIA Corporation"
 	bdf0 := "0000:4b:00.0"
 	bdf1 := "0000:4f:00.0"
 	snap := schema.HardwareSnapshot{
 		PCIeDevices: []schema.HardwarePCIeDevice{
 			{
 				DeviceClass:  &class,
 				Manufacturer: &manufacturer,
 				BDF:          &bdf0,
 				Telemetry:    map[string]any{"nvidia_gpu_index": 0},
 			},
 			{
 				DeviceClass:  &class,
 				Manufacturer: &manufacturer,
 				BDF:          &bdf1,
 				Telemetry:    map[string]any{"nvidia_gpu_index": 1},
 			},
 		},
 	}
 	applyLatestSATStatuses(&snap, baseDir, nil)
 	if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
 		t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
 	}
 	// The expected text is the per-GPU reason followed by " failed"
 	// (presumably appended by the status mapping helper; confirm there).
 	if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
 		got := "<nil>"
 		if snap.PCIeDevices[1].ErrorDescription != nil {
 			got = *snap.PCIeDevices[1].ErrorDescription
 		}
 		t.Fatalf("gpu1 error=%q want per-gpu reason", got)
 	}
 }
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -40,7 +40,36 @@ var supportBundleCommands = []struct {
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
 	{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
 	{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
 if command -v dmesg >/dev/null 2>&1; then
  dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
 else
  echo "dmesg not found"
 fi
 `}},
 	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
 	{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
 if ! command -v lspci >/dev/null 2>&1; then
  echo "lspci not found"
  exit 0
 fi
 found=0
 for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
  found=1
  echo "=== GPU $gpu ==="
  lspci -s "$gpu" -vv 2>&1 || true
  bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
  if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
    echo
    echo "=== UPSTREAM $bridge for $gpu ==="
    lspci -s "$bridge" -vv 2>&1 || true
  fi
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no NVIDIA PCI devices found"
 fi
 `}},
 	{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
 for d in /sys/bus/pci/devices/*/; do
  vendor=$(cat "$d/vendor" 2>/dev/null)
@@ -51,6 +80,30 @@ for d in /sys/bus/pci/devices/*/; do
    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
  done
 done
 `}},
 	{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
 found=0
 for dev in /sys/bus/pci/devices/*; do
  [ -e "$dev" ] || continue
  bdf=$(basename "$dev")
  block=""
  for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
    if [ -r "$dev/$f" ]; then
      if [ -z "$block" ]; then
        block=1
        found=1
        echo "=== $bdf ==="
      fi
      printf "  %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
    fi
  done
  if [ -n "$block" ]; then
    echo
  fi
 done
 if [ "$found" -eq 0 ]; then
  echo "no PCIe AER sysfs counters found"
 fi
 `}},
 	{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
--- a/audit/internal/collector/nvidia.go
+++ b/audit/internal/collector/nvidia.go
@@ -13,6 +13,7 @@ import (
 const nvidiaVendorID = 0x10de
 type nvidiaGPUInfo struct {
 	Index              int
 	BDF                string
 	Serial             string
 	VBIOS              string
@@ -132,6 +133,7 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		}
 		info := nvidiaGPUInfo{
 			Index:              parseRequiredInt(rec[0]),
 			BDF:                bdf,
 			Serial:             strings.TrimSpace(rec[2]),
 			VBIOS:              strings.TrimSpace(rec[3]),
@@ -187,6 +189,14 @@ func parseMaybeInt(v string) *int {
 	return &n
 }
 // parseRequiredInt converts v to an int after trimming surrounding
 // whitespace. Input that is not a valid integer yields 0.
 func parseRequiredInt(v string) int {
 	if parsed, err := strconv.Atoi(strings.TrimSpace(v)); err == nil {
 		return parsed
 	}
 	return 0
 }
 // pcieLinkGenLabel renders a PCIe link generation number as "Gen<N>".
 func pcieLinkGenLabel(gen int) string {
 	const prefix = "Gen"
 	return fmt.Sprintf("%s%d", prefix, gen)
 }
@@ -240,6 +250,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
 }
 func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
 	if dev.Telemetry == nil {
 		dev.Telemetry = map[string]any{}
 	}
 	dev.Telemetry["nvidia_gpu_index"] = info.Index
 	if info.TemperatureC != nil {
 		dev.TemperatureC = info.TemperatureC
 	}
--- a/audit/internal/collector/nvidia_test.go
+++ b/audit/internal/collector/nvidia_test.go
@@ -86,6 +86,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
 	if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
 		t.Fatalf("firmware: got %v", out[0].Firmware)
 	}
 	if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
 		t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
 	}
 	if out[0].Status == nil || *out[0].Status != statusWarning {
 		t.Fatalf("status: got %v", out[0].Status)
 	}
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -27,14 +27,17 @@ type benchmarkProfileSpec struct {
 }
 type benchmarkGPUInfo struct {
-	Index               int
+	Index                int
-	UUID                string
+	UUID                 string
-	Name                string
+	Name                 string
-	BusID               string
+	BusID                string
-	VBIOS               string
+	VBIOS                string
-	PowerLimitW         float64
+	PowerLimitW          float64
-	MaxGraphicsClockMHz float64
+	DefaultPowerLimitW   float64
-	MaxMemoryClockMHz   float64
+	MaxGraphicsClockMHz  float64
 	MaxMemoryClockMHz    float64
 	BaseGraphicsClockMHz float64
 	MultiprocessorCount  int
 }
 type benchmarkBurnProfile struct {
@@ -102,7 +105,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		BenchmarkVersion:   benchmarkVersion,
 		GeneratedAt:        time.Now().UTC(),
 		Hostname:           hostname,
 		ServerModel:        readServerModel(),
 		BenchmarkProfile:   spec.Name,
 		ParallelGPUs:       opts.ParallelGPUs,
 		SelectedGPUIndices: append([]int(nil), selected...),
 		Normalization: BenchmarkNormalization{
 			Status: "full",
@@ -111,6 +116,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
 	// Server power characterization state — populated during per-GPU phases.
 	var serverIdleW, serverLoadedWSum float64
 	var serverIdleOK, serverLoadedOK bool
 	var serverLoadedSamples int
 	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
 	if infoErr != nil {
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
@@ -135,6 +145,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}()
 	if opts.ParallelGPUs {
 		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
 	} else {
 	for _, idx := range selected {
 		gpuResult := BenchmarkGPUResult{
 			Index:  idx,
@@ -146,7 +160,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			gpuResult.BusID = info.BusID
 			gpuResult.VBIOS = info.VBIOS
 			gpuResult.PowerLimitW = info.PowerLimitW
 			gpuResult.MultiprocessorCount = info.MultiprocessorCount
 			gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
 			gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
 			gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
@@ -161,6 +178,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
 		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
 		// Sample server idle power once (first GPU only — server state is global).
 		if !serverIdleOK {
 			if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
 				serverIdleW = w
 				serverIdleOK = true
 				logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
 			}
 		}
 		warmupCmd := []string{
 			"bee-gpu-burn",
 			"--seconds", strconv.Itoa(spec.WarmupSec),
@@ -184,7 +210,50 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			"--devices", strconv.Itoa(idx),
 		}
 		logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
 		// Sample server power via IPMI in parallel with the steady phase.
 		// We collect readings every 5s and average them.
 		ipmiStopCh := make(chan struct{})
 		ipmiResultCh := make(chan float64, 1)
 		go func() {
 			defer close(ipmiResultCh)
 			var samples []float64
 			ticker := time.NewTicker(5 * time.Second)
 			defer ticker.Stop()
 			// First sample after a short warmup delay.
 			select {
 			case <-ipmiStopCh:
 				return
 			case <-time.After(15 * time.Second):
 			}
 			for {
 				if w, err := queryIPMIServerPowerW(); err == nil {
 					samples = append(samples, w)
 				}
 				select {
 				case <-ipmiStopCh:
 					if len(samples) > 0 {
 						var sum float64
 						for _, w := range samples {
 							sum += w
 						}
 						ipmiResultCh <- sum / float64(len(samples))
 					}
 					return
 				case <-ticker.C:
 				}
 			}
 		}()
 		steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
 		close(ipmiStopCh)
 		if loadedW, ok := <-ipmiResultCh; ok {
 			serverLoadedWSum += loadedW
 			serverLoadedSamples++
 			serverLoadedOK = true
 			logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
 		}
 		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
 		afterThrottle, _ := queryThrottleCounters(idx)
 		if steadyErr != nil {
@@ -222,6 +291,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
 	}
 	} // end sequential path
 	if len(selected) > 1 && opts.RunNCCL {
 		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
 		if result.Interconnect != nil && result.Interconnect.Supported {
@@ -232,6 +303,17 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}
 	// Compute server power characterization from accumulated IPMI samples.
 	var gpuReportedSumW float64
 	for _, gpu := range result.GPUs {
 		gpuReportedSumW += gpu.Steady.AvgPowerW
 	}
 	var serverLoadedW float64
 	if serverLoadedSamples > 0 {
 		serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
 	}
 	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
 	result.Findings = buildBenchmarkFindings(result)
 	result.OverallStatus = benchmarkOverallStatus(result)
@@ -288,50 +370,87 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 	}
 }
-func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
+// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
-	args := []string{
+// Fields are tried in order; the first successful query wins. Extended fields
-		"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
+// (attribute.multiprocessor_count, power.default_limit) are not supported on
-		"--format=csv,noheader,nounits",
+// all driver versions, so we fall back to the base set if the full query fails.
-	}
+var benchmarkGPUInfoQueries = []struct {
-	if len(gpuIndices) > 0 {
+	fields   string
-		args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
+	extended bool // whether this query includes optional extended fields
-	}
+}{
-	out, err := satExecCommand("nvidia-smi", args...).Output()
+	{
-	if err != nil {
+		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
-		return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
+		extended: true,
-	}
+	},
-
+	{
-	r := csv.NewReader(strings.NewReader(string(out)))
+		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
-	r.TrimLeadingSpace = true
+		extended: false,
-	r.FieldsPerRecord = -1
+	},
 	rows, err := r.ReadAll()
 	if err != nil {
 		return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
 	}
 	infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
 	for _, row := range rows {
 		if len(row) < 8 {
 			continue
 		}
 		idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
 		if err != nil {
 			continue
 		}
 		infoByIndex[idx] = benchmarkGPUInfo{
 			Index:               idx,
 			UUID:                strings.TrimSpace(row[1]),
 			Name:                strings.TrimSpace(row[2]),
 			BusID:               strings.TrimSpace(row[3]),
 			VBIOS:               strings.TrimSpace(row[4]),
 			PowerLimitW:         parseBenchmarkFloat(row[5]),
 			MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
 			MaxMemoryClockMHz:   parseBenchmarkFloat(row[7]),
 		}
 	}
 	return infoByIndex, nil
 }
 // queryBenchmarkGPUInfo collects GPU inventory via nvidia-smi --query-gpu.
 //
 // Each field set in benchmarkGPUInfoQueries is tried in order; the first one
 // whose command succeeds and whose output parses as CSV determines the result
 // (even if no row is usable). gpuIndices, when non-empty, restricts the query
 // with --id. Rows with fewer than nine columns or a non-numeric index are
 // skipped. If every query fails, the error from the last attempt is returned.
 func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
 	var lastErr error
 	for _, query := range benchmarkGPUInfoQueries {
 		var args []string
 		if len(gpuIndices) > 0 {
 			args = append(args, "--id="+joinIndexList(gpuIndices))
 		}
 		args = append(args, "--query-gpu="+query.fields, "--format=csv,noheader,nounits")
 		raw, err := satExecCommand("nvidia-smi", args...).Output()
 		if err != nil {
 			// Keep the error readable: only the head of the field list is shown.
 			label := query.fields[:min(len(query.fields), 40)]
 			lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", label, err)
 			continue
 		}
 		reader := csv.NewReader(strings.NewReader(string(raw)))
 		reader.TrimLeadingSpace = true
 		reader.FieldsPerRecord = -1
 		records, err := reader.ReadAll()
 		if err != nil {
 			lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
 			continue
 		}
 		inventory := make(map[int]benchmarkGPUInfo, len(records))
 		for _, rec := range records {
 			// Both field sets carry at least nine columns; shorter rows are noise.
 			if len(rec) < 9 {
 				continue
 			}
 			gpuIndex, err := strconv.Atoi(strings.TrimSpace(rec[0]))
 			if err != nil {
 				continue
 			}
 			info := benchmarkGPUInfo{
 				Index:                gpuIndex,
 				UUID:                 strings.TrimSpace(rec[1]),
 				Name:                 strings.TrimSpace(rec[2]),
 				BusID:                strings.TrimSpace(rec[3]),
 				VBIOS:                strings.TrimSpace(rec[4]),
 				PowerLimitW:          parseBenchmarkFloat(rec[5]),
 				MaxGraphicsClockMHz:  parseBenchmarkFloat(rec[6]),
 				MaxMemoryClockMHz:    parseBenchmarkFloat(rec[7]),
 				BaseGraphicsClockMHz: parseBenchmarkFloat(rec[8]),
 			}
 			// Optional columns exist only in the extended field set.
 			if query.extended && len(rec) >= 10 {
 				info.MultiprocessorCount = int(parseBenchmarkFloat(rec[9]))
 				if len(rec) >= 11 {
 					info.DefaultPowerLimitW = parseBenchmarkFloat(rec[10])
 				}
 			}
 			inventory[gpuIndex] = info
 		}
 		return inventory, nil
 	}
 	return nil, lastErr
 }
 func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
 	if os.Geteuid() != 0 {
 		result.Normalization.Status = "partial"
@@ -370,6 +489,10 @@ func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndi
 					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
 				}})
 			}
 		} else {
 			rec.GPUClockLockStatus = "skipped"
 			rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
 			result.Normalization.Status = "partial"
 		}
 		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
@@ -551,6 +674,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
 	}
 	category := "other"
 	switch {
 	case strings.HasPrefix(name, "fp64"):
 		category = "fp64"
 	case strings.HasPrefix(name, "fp32"):
 		category = "fp32_tf32"
 	case strings.HasPrefix(name, "fp16"):
@@ -619,14 +744,23 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 			score.ComputeScore += precision.TeraOpsPerSec
 		}
 	}
-	if gpu.PowerLimitW > 0 {
+	// Use default power limit for sustain score so a manually reduced limit
-		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100)
+	// does not inflate the score. Fall back to enforced limit if default unknown.
 	referencePowerW := gpu.DefaultPowerLimitW
 	if referencePowerW <= 0 {
 		referencePowerW = gpu.PowerLimitW
 	}
 	if referencePowerW > 0 {
 		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100)
 	}
 	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
 	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
 	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
 	score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
 	score.CompositeScore = compositeBenchmarkScore(score)
 	if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
 		score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
 	}
 	return score
 }
@@ -798,10 +932,30 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
 func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 	var findings []string
 	passed := 0
 	for _, gpu := range result.GPUs {
 		if gpu.Status == "OK" {
 			passed++
 		}
 	}
 	total := len(result.GPUs)
 	if total > 0 {
 		if passed == total {
 			findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
 		} else {
 			findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
 		}
 	}
 	if result.Normalization.Status != "full" {
 		findings = append(findings, "Environment normalization was partial; compare results with caution.")
 	}
 	for _, gpu := range result.GPUs {
 		if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
 			findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
 			continue
 		}
 		if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
 			findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
 			continue
@@ -825,10 +979,24 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 		if gpu.Backend == "driver-ptx" {
 			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
 		}
 		if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
 			findings = append(findings, fmt.Sprintf(
 				"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
 				gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
 			))
 		}
 	}
 	if result.Interconnect != nil && result.Interconnect.Supported {
 		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
 	}
 	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
 		if sp.ReportingRatio < 0.75 {
 			findings = append(findings, fmt.Sprintf(
 				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
 				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
 			))
 		}
 	}
 	return dedupeStrings(findings)
 }
@@ -1007,3 +1175,319 @@ func maxInt(a, b int) int {
 	}
 	return b
 }
 // queryIPMIServerPowerW runs "ipmitool dcmi power reading" and scans its output
 // for a line containing "Current Power". The text after the first colon on that
 // line, with an optional trailing "Watts" removed, is parsed as a float.
 // The first strictly positive value found is returned. If the command fails, or
 // no line yields a positive number, the result is 0 and a non-nil error.
 func queryIPMIServerPowerW() (float64, error) {
 	raw, runErr := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
 	if runErr != nil {
 		return 0, fmt.Errorf("ipmitool dcmi power reading: %w", runErr)
 	}
 	for _, row := range strings.Split(string(raw), "\n") {
 		if !strings.Contains(row, "Current Power") {
 			continue
 		}
 		sep := strings.Index(row, ":")
 		if sep < 0 {
 			// No "label: value" separator on this line; keep looking.
 			continue
 		}
 		field := strings.TrimSpace(row[sep+1:])
 		field = strings.TrimSpace(strings.TrimSuffix(field, "Watts"))
 		if watts, parseErr := strconv.ParseFloat(field, 64); parseErr == nil && watts > 0 {
 			return watts, nil
 		}
 	}
 	return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
 }
 // sampleIPMIPowerSeries polls queryIPMIServerPowerW about every 2 seconds for
 // durationSec seconds and returns the mean of all successful readings.
 //
 // The first reading is taken immediately. Sampling ends once the deadline has
 // passed or ctx is cancelled, whichever happens first; readings collected up to
 // that point are still averaged. ok is false when durationSec <= 0 or when no
 // reading succeeded (for example because IPMI is unavailable).
 func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
 	if durationSec <= 0 {
 		return 0, false
 	}
 	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
 	var samples []float64
 sampling:
 	for {
 		if w, err := queryIPMIServerPowerW(); err == nil {
 			samples = append(samples, w)
 		}
 		if time.Now().After(deadline) {
 			break
 		}
 		select {
 		case <-ctx.Done():
 			// A bare break would only leave the select: the loop would then
 			// spin, re-running ipmitool back-to-back until the deadline.
 			break sampling
 		case <-time.After(2 * time.Second):
 		}
 	}
 	if len(samples) == 0 {
 		return 0, false
 	}
 	var sum float64
 	for _, w := range samples {
 		sum += w
 	}
 	return sum / float64(len(samples)), true
 }
 // characterizeServerPower builds the BenchmarkServerPower record from the idle
 // and loaded server power values and the GPU-reported power sum.
 //
 // When ipmiAvailable is false the record only carries Available=false and an
 // explanatory note. Otherwise the inputs are stored, DeltaW is loadedW-idleW,
 // and ReportingRatio (DeltaW / gpuReportedSumW) is filled in only when both the
 // delta and the GPU-reported sum are positive.
 func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
 	if !ipmiAvailable {
 		return &BenchmarkServerPower{
 			Available: false,
 			Notes:     []string{"IPMI power reading unavailable; server-side power characterization skipped"},
 		}
 	}
 	delta := loadedW - idleW
 	report := &BenchmarkServerPower{
 		Available:       true,
 		IdleW:           idleW,
 		LoadedW:         loadedW,
 		DeltaW:          delta,
 		GPUReportedSumW: gpuReportedSumW,
 	}
 	if delta > 0 && gpuReportedSumW > 0 {
 		report.ReportingRatio = delta / gpuReportedSumW
 	}
 	return report
 }
 // readServerModel reports the contents of /sys/class/dmi/id/product_name with
 // surrounding whitespace removed. It returns an empty string when the file
 // cannot be read (for example on systems that do not expose that DMI entry).
 func readServerModel() string {
 	const dmiProductNamePath = "/sys/class/dmi/id/product_name"
 	raw, readErr := os.ReadFile(dmiProductNamePath)
 	if readErr == nil {
 		return strings.TrimSpace(string(raw))
 	}
 	return ""
 }
 // filterRowsByGPU selects, in their original order, the metric rows whose
 // GPUIndex equals gpuIndex. The result is nil when no row matches.
 func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
 	var matched []GPUMetricRow
 	for i := range rows {
 		if rows[i].GPUIndex != gpuIndex {
 			continue
 		}
 		matched = append(matched, rows[i])
 	}
 	return matched
 }
 // parseBenchmarkBurnLogByGPU groups the lines of a multi-GPU bee-gpu-burn log by
 // their "[gpu N] " prefix and runs parseBenchmarkBurnLog on each group.
 //
 // Lines are trimmed first; lines without the "[gpu " prefix, without a closing
 // "] ", or with a non-numeric N are ignored. The returned map is keyed by N.
 func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
 	const tag = "[gpu "
 	byGPU := make(map[int][]string)
 	normalized := strings.ReplaceAll(raw, "\r\n", "\n")
 	for _, entry := range strings.Split(normalized, "\n") {
 		entry = strings.TrimSpace(entry)
 		if !strings.HasPrefix(entry, tag) {
 			continue
 		}
 		closeAt := strings.Index(entry, "] ")
 		if closeAt < 0 {
 			continue
 		}
 		idx, convErr := strconv.Atoi(strings.TrimSpace(entry[len(tag):closeAt]))
 		if convErr != nil {
 			continue
 		}
 		// Keep only the payload that follows "[gpu N] ".
 		byGPU[idx] = append(byGPU[idx], entry[closeAt+2:])
 	}
 	parsed := make(map[int]benchmarkBurnParseResult, len(byGPU))
 	for idx, body := range byGPU {
 		// The payload no longer carries the [gpu N] prefix; the prefix stripping
 		// inside parseBenchmarkBurnLog is expected to leave such lines untouched.
 		parsed[idx] = parseBenchmarkBurnLog(strings.Join(body, "\n"))
 	}
 	return parsed
 }
 // runNvidiaBenchmarkParallel runs the baseline, warmup, steady-compute and
 // cooldown phases for all selected GPUs at the same time, using a single
 // bee-gpu-burn invocation per compute phase (warmup and steady).
 //
 // Per-phase telemetry is split per GPU with filterRowsByGPU and written under
 // runDir as gpu-<idx>-<phase> metrics files; the combined command output goes
 // to gpu-all-warmup.log and gpu-all-steady.log.
 //
 // Results are reported through pointers rather than return values:
 //   - result.GPUs receives one finalized BenchmarkGPUResult per entry of
 //     selected, in the order of selected.
 //   - *serverIdleW / *serverIdleOK are set only if *serverIdleOK was false on
 //     entry and an idle IPMI sample succeeded.
 //   - *serverLoadedWSum is increased by the mean loaded power of the steady
 //     phase, *serverLoadedSamples is incremented and *serverLoadedOK is set,
 //     but only when at least one IPMI reading was obtained during that phase.
 //
 // Phase failures do not abort the run; they are recorded as per-GPU notes.
 // Only a steady-phase error (or a PTX fallback) influences the final status.
 func runNvidiaBenchmarkParallel(
 	ctx context.Context,
 	verboseLog, runDir string,
 	selected []int,
 	infoByIndex map[int]benchmarkGPUInfo,
 	opts NvidiaBenchmarkOptions,
 	spec benchmarkProfileSpec,
 	logFunc func(string),
 	result *NvidiaBenchmarkResult,
 	serverIdleW *float64, serverLoadedWSum *float64,
 	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
 ) {
 	allDevices := joinIndexList(selected)
 	// Build per-GPU result stubs. Each stub starts as "FAILED"; the status is
 	// overwritten in the final scoring loop.
 	gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
 	for _, idx := range selected {
 		r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
 		if info, ok := infoByIndex[idx]; ok {
 			r.UUID = info.UUID
 			r.Name = info.Name
 			r.BusID = info.BusID
 			r.VBIOS = info.VBIOS
 			r.PowerLimitW = info.PowerLimitW
 			r.MultiprocessorCount = info.MultiprocessorCount
 			r.DefaultPowerLimitW = info.DefaultPowerLimitW
 			r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
 			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
 			r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
 		}
 		gpuResults[idx] = r
 	}
 	// Baseline: sample all GPUs together. A sampling error other than
 	// context.Canceled is attached to every GPU as a note; whatever rows were
 	// returned are still summarized below.
 	baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
 	if err != nil && err != context.Canceled {
 		for _, idx := range selected {
 			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
 		}
 	}
 	for _, idx := range selected {
 		perGPU := filterRowsByGPU(baselineRows, idx)
 		gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
 		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
 	}
 	// Sample server idle power once: skipped when the caller already has an
 	// idle reading (*serverIdleOK). The sampling window is at least 10 seconds.
 	if !*serverIdleOK {
 		if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
 			*serverIdleW = w
 			*serverIdleOK = true
 			logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
 		}
 	}
 	// Warmup: all GPUs simultaneously.
 	warmupCmd := []string{
 		"bee-gpu-burn",
 		"--seconds", strconv.Itoa(spec.WarmupSec),
 		"--size-mb", strconv.Itoa(opts.SizeMB),
 		"--devices", allDevices,
 	}
 	logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
 	warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc)
 	// Best-effort log write: a failure here is deliberately ignored.
 	_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644)
 	for _, idx := range selected {
 		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
 	}
 	// A warmup failure is only noted; the steady phase still runs and the
 	// final status does not depend on warmupErr.
 	if warmupErr != nil {
 		for _, idx := range selected {
 			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
 		}
 	}
 	// Snapshot throttle counters before steady. Query errors are ignored, in
 	// which case the map holds whatever value queryThrottleCounters returned.
 	beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
 	for _, idx := range selected {
 		beforeThrottle[idx], _ = queryThrottleCounters(idx)
 	}
 	// Steady: all GPUs simultaneously.
 	steadyCmd := []string{
 		"bee-gpu-burn",
 		"--seconds", strconv.Itoa(spec.SteadySec),
 		"--size-mb", strconv.Itoa(opts.SizeMB),
 		"--devices", allDevices,
 	}
 	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
 	// Sample server power via IPMI in parallel with steady phase.
 	// Protocol: closing ipmiStopCh tells the goroutine to finish; it then sends
 	// the mean of its readings on ipmiResultCh (buffered, so the send never
 	// blocks) if it has any, and always closes ipmiResultCh on exit. A receive
 	// with ok == false therefore means "no reading available".
 	ipmiStopCh := make(chan struct{})
 	ipmiResultCh := make(chan float64, 1)
 	go func() {
 		defer close(ipmiResultCh)
 		var samples []float64
 		// NOTE(review): the ticker starts before the 15 s delay below, so a tick
 		// is probably already pending when the sampling loop begins and the
 		// second reading may follow the first immediately — confirm intended.
 		ticker := time.NewTicker(5 * time.Second)
 		defer ticker.Stop()
 		// Skip the first 15 seconds of the steady phase before sampling; if the
 		// phase ends earlier, exit without reporting a value.
 		select {
 		case <-ipmiStopCh:
 			return
 		case <-time.After(15 * time.Second):
 		}
 		for {
 			if w, err := queryIPMIServerPowerW(); err == nil {
 				samples = append(samples, w)
 			}
 			select {
 			case <-ipmiStopCh:
 				if len(samples) > 0 {
 					var sum float64
 					for _, w := range samples {
 						sum += w
 					}
 					ipmiResultCh <- sum / float64(len(samples))
 				}
 				return
 			case <-ticker.C:
 			}
 		}
 	}()
 	steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc)
 	// Stop the sampler and wait for it: the receive below returns once the
 	// goroutine has either sent its mean or closed the channel.
 	close(ipmiStopCh)
 	if loadedW, ok := <-ipmiResultCh; ok {
 		*serverLoadedWSum += loadedW
 		(*serverLoadedSamples)++
 		*serverLoadedOK = true
 		logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
 	}
 	// Best-effort log write: a failure here is deliberately ignored.
 	_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
 	afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
 	for _, idx := range selected {
 		afterThrottle[idx], _ = queryThrottleCounters(idx)
 	}
 	// Split the combined steady log into per-GPU parse results.
 	parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
 	for _, idx := range selected {
 		perGPU := filterRowsByGPU(steadyRows, idx)
 		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
 		gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
 		gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
 		// A GPU with no "[gpu N]" lines in the log keeps empty compute fields.
 		if pr, ok := parseResults[idx]; ok {
 			gpuResults[idx].ComputeCapability = pr.ComputeCapability
 			gpuResults[idx].Backend = pr.Backend
 			gpuResults[idx].PrecisionResults = pr.Profiles
 			if pr.Fallback {
 				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
 			}
 		}
 		// The steady command is shared, so its error is noted on every GPU.
 		if steadyErr != nil {
 			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
 		}
 	}
 	// Cooldown: all GPUs together.
 	cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
 	if err != nil && err != context.Canceled {
 		for _, idx := range selected {
 			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
 		}
 	}
 	for _, idx := range selected {
 		perGPU := filterRowsByGPU(cooldownRows, idx)
 		gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
 		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
 	}
 	// Score and finalize each GPU.
 	// NOTE(review): scores and degradation reasons are computed while r.Status
 	// is still the initial "FAILED"; the real status is assigned afterwards.
 	for _, idx := range selected {
 		r := gpuResults[idx]
 		r.Scores = scoreBenchmarkGPUResult(*r)
 		r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
 		// Zero value when the GPU produced no parsable output (Fallback false).
 		pr := parseResults[idx]
 		switch {
 		case steadyErr != nil:
 			r.Status = classifySATErrorStatus(steadyOut, steadyErr)
 		case pr.Fallback:
 			r.Status = "PARTIAL"
 		default:
 			r.Status = "OK"
 		}
 		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
 	}
 }
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -56,6 +56,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 		fmt.Fprintf(&b, "  Status: %s\n", gpu.Status)
 		fmt.Fprintf(&b, "  Composite score: %.2f\n", gpu.Scores.CompositeScore)
 		fmt.Fprintf(&b, "  Compute score: %.2f\n", gpu.Scores.ComputeScore)
 		if gpu.Scores.TOPSPerSMPerGHz > 0 {
 			fmt.Fprintf(&b, "  Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
 		}
 		fmt.Fprintf(&b, "  Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
 		fmt.Fprintf(&b, "  Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
 		fmt.Fprintf(&b, "  Stability: %.1f\n", gpu.Scores.StabilityScore)
@@ -77,13 +80,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 				}
 			}
 		}
-		fmt.Fprintf(&b, "  Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
+		fmt.Fprintf(&b, "  Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
 			gpu.Throttle.SWPowerCapUS,
 			gpu.Throttle.SWThermalSlowdownUS,
 			gpu.Throttle.SyncBoostUS,
 			gpu.Throttle.HWThermalSlowdownUS,
 			gpu.Throttle.HWPowerBrakeSlowdownUS,
 		)
 		if len(gpu.Notes) > 0 {
 			fmt.Fprintf(&b, "  Notes:\n")
 			for _, note := range gpu.Notes {
@@ -121,6 +118,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 		}
 	}
 	if sp := result.ServerPower; sp != nil {
 		fmt.Fprintf(&b, "Server Power (IPMI)\n")
 		fmt.Fprintf(&b, "-------------------\n")
 		if !sp.Available {
 			fmt.Fprintf(&b, "Unavailable\n")
 		} else {
 			fmt.Fprintf(&b, "  Server idle:         %.0f W\n", sp.IdleW)
 			fmt.Fprintf(&b, "  Server under load:   %.0f W\n", sp.LoadedW)
 			fmt.Fprintf(&b, "  Server delta:        %.0f W\n", sp.DeltaW)
 			fmt.Fprintf(&b, "  GPU reported (sum):  %.0f W\n", sp.GPUReportedSumW)
 			if sp.ReportingRatio > 0 {
 				fmt.Fprintf(&b, "  Reporting ratio:     %.2f  (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
 			}
 		}
 		for _, note := range sp.Notes {
 			fmt.Fprintf(&b, "  Note: %s\n", note)
 		}
 		b.WriteString("\n")
 	}
 	fmt.Fprintf(&b, "Methodology\n")
 	fmt.Fprintf(&b, "-----------\n")
 	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
@@ -175,6 +192,42 @@ func stripANSIEscapeSequences(raw string) string {
 	return ansiEscapePattern.ReplaceAllString(raw, "")
 }
 // formatThrottleLine turns the throttle counters (microseconds) into a single
 // human-readable line. Counters that are zero are omitted; if all are zero the
 // result is "none". With a positive steadyDurationSec each counter is shown as
 // a percentage of that window plus whole seconds; otherwise it is shown as
 // milliseconds (below one second) or seconds. Entries are joined by two spaces.
 func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
 	labels := [...]string{"sw_power", "sw_thermal", "sync_boost", "hw_thermal", "hw_power_brake"}
 	values := [...]uint64{
 		t.SWPowerCapUS,
 		t.SWThermalSlowdownUS,
 		t.SyncBoostUS,
 		t.HWThermalSlowdownUS,
 		t.HWPowerBrakeSlowdownUS,
 	}
 	var shown []string
 	for i, us := range values {
 		if us == 0 {
 			continue
 		}
 		seconds := float64(us) / 1e6
 		var item string
 		switch {
 		case steadyDurationSec > 0:
 			share := seconds / steadyDurationSec * 100
 			item = fmt.Sprintf("%s=%.1f%% (%.0fs)", labels[i], share, seconds)
 		case seconds < 1:
 			item = fmt.Sprintf("%s=%.0fms", labels[i], seconds*1000)
 		default:
 			item = fmt.Sprintf("%s=%.1fs", labels[i], seconds)
 		}
 		shown = append(shown, item)
 	}
 	if len(shown) == 0 {
 		return "none"
 	}
 	return strings.Join(shown, "  ")
 }
 func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
 	var b strings.Builder
 	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -14,13 +14,17 @@ type NvidiaBenchmarkOptions struct {
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	RunNCCL           bool
 	ParallelGPUs      bool // run all selected GPUs simultaneously instead of sequentially
 }
 type NvidiaBenchmarkResult struct {
 	BenchmarkVersion   string                       `json:"benchmark_version"`
 	GeneratedAt        time.Time                    `json:"generated_at"`
 	Hostname           string                       `json:"hostname,omitempty"`
 	ServerModel        string                       `json:"server_model,omitempty"`
 	BenchmarkProfile   string                       `json:"benchmark_profile"`
 	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
 	OverallStatus      string                       `json:"overall_status"`
 	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
 	Findings           []string                     `json:"findings,omitempty"`
@@ -28,6 +32,7 @@ type NvidiaBenchmarkResult struct {
 	Normalization      BenchmarkNormalization       `json:"normalization"`
 	GPUs               []BenchmarkGPUResult         `json:"gpus"`
 	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
 	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
 }
 type BenchmarkNormalization struct {
@@ -56,7 +61,10 @@ type BenchmarkGPUResult struct {
 	Backend                string                     `json:"backend,omitempty"`
 	Status                 string                     `json:"status"`
 	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
 	MultiprocessorCount    int                        `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW     float64                    `json:"default_power_limit_w,omitempty"`
 	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
 	BaseGraphicsClockMHz   float64                    `json:"base_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
 	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
 	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
@@ -117,6 +125,24 @@ type BenchmarkScorecard struct {
 	StabilityScore      float64 `json:"stability_score"`
 	InterconnectScore   float64 `json:"interconnect_score"`
 	CompositeScore      float64 `json:"composite_score"`
 	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
 	// Comparable across throttle levels and GPU generations. Low value at normal
 	// clocks indicates silicon degradation.
 	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }
 // BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
 // power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
 // telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
 // over-reporting its power consumption.
 //
 // All wattage fields are omitted from JSON when zero; only Available is always
 // emitted.
 type BenchmarkServerPower struct {
 	// Available is false when no IPMI power reading could be obtained; the
 	// remaining numeric fields are then left at zero.
 	Available       bool     `json:"available"`
 	// IdleW is the server power measured before the GPUs were loaded.
 	IdleW           float64  `json:"idle_w,omitempty"`
 	// LoadedW is the server power measured while the GPUs were under load.
 	LoadedW         float64  `json:"loaded_w,omitempty"`
 	// DeltaW is LoadedW minus IdleW.
 	DeltaW          float64  `json:"delta_w,omitempty"`
 	// GPUReportedSumW is the power the GPUs themselves reported, summed.
 	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
 	// ReportingRatio is DeltaW / GPUReportedSumW; it stays zero unless both
 	// values are positive.
 	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
 	// Notes holds free-form remarks, e.g. why characterization was skipped.
 	Notes           []string `json:"notes,omitempty"`
 }
 type BenchmarkInterconnectResult struct {
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -116,25 +116,47 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
 	if err := ctx.Err(); err != nil {
 		return err
 	}
-	if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
+
-		log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
+	mediumRebound := false
 	if err := bindMount(dstDir, "/run/live/medium"); err != nil {
 		log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
 	} else {
 		mediumRebound = true
 	}
 	log("Verifying live medium now served from RAM...")
 	status := s.LiveBootSource()
-	if err := verifyInstallToRAMStatus(status); err != nil {
+	if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
 		return err
 	}
-	log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
+	if status.InRAM {
-	log("Done. Installation media can be safely disconnected.")
+		log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
 	}
 	log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
 	return nil
 }
-func verifyInstallToRAMStatus(status LiveBootSource) error {
+func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
 	if status.InRAM {
 		return nil
 	}
-	return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
+
 	// The live medium mount was not redirected to RAM. This is expected when
 	// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
 	// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
 	// because the CD-ROM mount is in use. Check whether files were at least
 	// copied to the tmpfs directory — that is sufficient for safe disconnection
 	// once the kernel has paged in all actively-used data.
 	files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
 	if len(files) > 0 {
 		if !mediumRebound {
 			log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
 			log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
 		}
 		return nil
 	}
 	return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
 }
 func describeLiveBootSource(status LiveBootSource) string {
@@ -247,7 +269,31 @@ func findLoopForFile(backingFile string) (string, error) {
 	return "", fmt.Errorf("no loop device found for %s", backingFile)
 }
 // loopDeviceOffset returns the byte offset configured for the loop device as
 // reported by "losetup --json", or -1 if it cannot be determined (losetup
 // failed, the output was not valid JSON, no device was listed, or the offset
 // is not an integer). A missing or null offset is reported as 0.
 //
 // The offset is decoded through json.Number because losetup does not encode
 // numeric columns uniformly across util-linux releases: some print a bare
 // number (0), others a quoted string ("0"). Decoding straight into int64
 // rejects the quoted form, which would silently hide a non-zero offset.
 func loopDeviceOffset(loopDev string) int64 {
 	out, err := exec.Command("losetup", "--json", loopDev).Output()
 	if err != nil {
 		return -1
 	}
 	var result struct {
 		Loopdevices []struct {
 			Offset json.Number `json:"offset"`
 		} `json:"loopdevices"`
 	}
 	if err := json.Unmarshal(out, &result); err != nil || len(result.Loopdevices) == 0 {
 		return -1
 	}
 	raw := result.Loopdevices[0].Offset
 	if raw == "" {
 		// Field absent or null: same result as decoding into a zero int64.
 		return 0
 	}
 	off, err := raw.Int64()
 	if err != nil {
 		return -1
 	}
 	return off
 }
 func reassociateLoopDevice(loopDev, newFile string) error {
 	// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
 	// typically set up with a non-zero offset (squashfs lives inside the ISO),
 	// so the ioctl returns EINVAL. Detect this early for a clear error message.
 	if off := loopDeviceOffset(loopDev); off > 0 {
 		return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
 	}
 	if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
 		return nil
 	}
--- a/audit/internal/platform/install_to_ram_linux.go
+++ b/audit/internal/platform/install_to_ram_linux.go
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
 	}
 	return nil
 }
 // bindMount bind-mounts src onto dst by calling syscall.Mount with MS_BIND
 // instead of running the mount binary, so it does not depend on PATH lookup.
 // The filesystem type and data arguments are passed as empty strings.
 func bindMount(src, dst string) error {
 	const (
 		fsType = ""
 		data   = ""
 	)
 	var flags uintptr = syscall.MS_BIND
 	return syscall.Mount(src, dst, fsType, flags, data)
 }
--- a/audit/internal/platform/install_to_ram_other.go
+++ b/audit/internal/platform/install_to_ram_other.go
@@ -7,3 +7,7 @@ import "errors"
 // loopChangeFD is the stub used on platforms without LOOP_CHANGE_FD support.
 // It ignores its arguments and always returns an error.
 func loopChangeFD(loopDev, newFile string) error {
 	unsupported := errors.New("LOOP_CHANGE_FD not available on this platform")
 	return unsupported
 }
 // bindMount is the stub used on platforms without bind-mount support.
 // It ignores its arguments and always returns an error.
 func bindMount(src, dst string) error {
 	unsupported := errors.New("bind mount not available on this platform")
 	return unsupported
 }
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -33,14 +33,17 @@ func TestInferLiveBootKind(t *testing.T) {
 func TestVerifyInstallToRAMStatus(t *testing.T) {
 	t.Parallel()
-	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
+	dstDir := t.TempDir()
 	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
 		t.Fatalf("expected success for RAM-backed status, got %v", err)
 	}
-	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
+
 	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
 	if err == nil {
 		t.Fatal("expected verification failure when media is still on USB")
 	}
-	if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
+	if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
 		t.Fatalf("error=%q", got)
 	}
 }
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -88,6 +88,37 @@ type NvidiaGPU struct {
 	MemoryMB int    `json:"memory_mb"`
 }
 // NvidiaGPUStatus is the per-GPU health entry returned by
 // ListNvidiaGPUStatuses, built from one CSV line of nvidia-smi output.
 type NvidiaGPUStatus struct {
 	// Index is the nvidia-smi GPU index; it is 0 for entries with ParseFailure.
 	Index        int    `json:"index"`
 	// Name is the GPU name column.
 	Name         string `json:"name"`
 	// BDF is the PCI bus ID after normalizeNvidiaBusID.
 	BDF          string `json:"bdf,omitempty"`
 	// Serial is the serial column as printed by nvidia-smi.
 	Serial       string `json:"serial,omitempty"`
 	// Status is "OK", "RESET_REQUIRED", or "UNKNOWN" for unparsable lines.
 	Status       string `json:"status"`
 	// RawLine is the trimmed nvidia-smi output line this entry came from.
 	RawLine      string `json:"raw_line,omitempty"`
 	// NeedsReset is true when the line contains "GPU requires reset"
 	// (matched case-insensitively).
 	NeedsReset   bool   `json:"needs_reset"`
 	// ParseFailure marks a line with too few columns or a non-numeric index.
 	ParseFailure bool   `json:"parse_failure,omitempty"`
 }
 // nvidiaGPUHealth is an internal per-GPU health record. In the visible code it
 // is produced by readNvidiaGPUHealth and consumed by writeNvidiaGPUStatusFiles,
 // which copies Name and RawLine and maps NeedsReset to a RESET_REQUIRED health.
 type nvidiaGPUHealth struct {
 	// Index is the GPU index used as the key into the per-GPU status map.
 	Index        int
 	// Name is the GPU name.
 	Name         string
 	// NeedsReset reports that the GPU must be reset.
 	NeedsReset   bool
 	// RawLine is the raw text the record was derived from.
 	RawLine      string
 	// ParseFailure presumably marks a line that could not be parsed — it is not
 	// read in the visible code; confirm against readNvidiaGPUHealth.
 	ParseFailure bool
 }
 // nvidiaGPUStatusFile accumulates, for one GPU, the outcome of an NVIDIA
 // acceptance-pack run. Entries are kept in a map keyed by GPU index and are
 // rendered by writeNvidiaGPUStatusFiles.
 type nvidiaGPUStatusFile struct {
 	// Index is the GPU index.
 	Index       int
 	// Name is the GPU name; writeNvidiaGPUStatusFiles falls back to "unknown".
 	Name        string
 	// RunStatus is the most severe job status recorded for this GPU; when no
 	// job set it, the pack's overall status is used.
 	RunStatus   string
 	// Reason is the first line of the output of the job that set RunStatus,
 	// or "GPU requires reset" when set by the health check.
 	Reason      string
 	// Health is "OK", "RESET_REQUIRED", or "UNKNOWN" when no health data exists.
 	Health      string
 	// HealthRaw is the raw health line for this GPU.
 	HealthRaw   string
 	// Observed is true when the GPU appeared in the health readout.
 	Observed    bool
 	// Selected is true when the GPU was listed in some job's gpuIndices.
 	Selected    bool
 	// FailingJob is the name of the job that last set RunStatus (it is set
 	// for any status, not only failures).
 	FailingJob  string
 }
 // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
 type AMDGPUInfo struct {
 	Index int    `json:"index"`
@@ -269,6 +300,72 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
 	return gpus, nil
 }
 // ListNvidiaGPUStatuses queries nvidia-smi and returns one status entry per
 // non-empty output line, ordered by GPU index.
 //
 // Only the first four CSV columns (index, name, PCI bus ID, serial) are parsed.
 // A line containing "GPU requires reset" (case-insensitive) is flagged
 // NeedsReset with status "RESET_REQUIRED"; otherwise the status is "OK".
 // Lines with fewer than four columns or a non-numeric index are returned with
 // status "UNKNOWN", ParseFailure set and Index 0.
 //
 // The sort is stable, so entries sharing an index (notably parse failures,
 // which tie with GPU 0) keep the order in which nvidia-smi printed them.
 // An error is returned only when nvidia-smi itself fails.
 func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
 	out, err := satExecCommand(
 		"nvidia-smi",
 		"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
 		"--format=csv,noheader,nounits",
 	).Output()
 	if err != nil {
 		return nil, fmt.Errorf("nvidia-smi: %w", err)
 	}
 	var gpus []NvidiaGPUStatus
 	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
 		line = strings.TrimSpace(line)
 		if line == "" {
 			continue
 		}
 		parts := strings.Split(line, ",")
 		if len(parts) < 4 {
 			gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
 			continue
 		}
 		idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
 		if err != nil {
 			gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
 			continue
 		}
 		// The reset marker can appear in any column, so search the whole line.
 		upper := strings.ToUpper(line)
 		needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
 		status := "OK"
 		if needsReset {
 			status = "RESET_REQUIRED"
 		}
 		gpus = append(gpus, NvidiaGPUStatus{
 			Index:      idx,
 			Name:       strings.TrimSpace(parts[1]),
 			BDF:        normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
 			Serial:     strings.TrimSpace(parts[3]),
 			Status:     status,
 			RawLine:    line,
 			NeedsReset: needsReset,
 		})
 	}
 	// sort.Slice gives no guarantee for equal keys; keep ties deterministic.
 	sort.SliceStable(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
 	return gpus, nil
 }
 // normalizeNvidiaBusID lower-cases and trims a PCI bus ID. When the ID has
 // exactly three colon-separated parts and the first part is longer than four
 // characters (e.g. "00000000:3B:00.0"), that part is shortened to its last
 // four characters ("0000:3b:00.0"). Any other input is returned lower-cased
 // and trimmed but otherwise unchanged.
 func normalizeNvidiaBusID(v string) string {
 	id := strings.ToLower(v)
 	id = strings.TrimSpace(id)
 	if strings.Count(id, ":") != 2 {
 		return id
 	}
 	firstColon := strings.Index(id, ":")
 	if firstColon <= 4 {
 		// Domain part is already four characters or shorter.
 		return id
 	}
 	return id[firstColon-4:]
 }
 // ResetNvidiaGPU runs "nvidia-smi -r -i <index>" and returns its combined
 // stdout/stderr together with the command error, if any. A negative index is
 // rejected without running anything. When the command succeeds but prints
 // nothing, the text "GPU reset completed.\n" is returned instead.
 func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	if index < 0 {
 		return "", fmt.Errorf("gpu index must be >= 0")
 	}
 	gpuArg := strconv.Itoa(index)
 	output, runErr := satExecCommand("nvidia-smi", "-r", "-i", gpuArg).CombinedOutput()
 	if runErr == nil && len(output) == 0 {
 		return "GPU reset completed.\n", nil
 	}
 	return string(output), runErr
 }
 // RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
 func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
@@ -604,7 +701,7 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
 		satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
-		satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
+		satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
 	)
 }
@@ -652,11 +749,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 	var summary strings.Builder
 	stats := satStats{}
 	nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
 	perGPU := map[int]*nvidiaGPUStatusFile{}
 	selectedGPUIndices := map[int]struct{}{}
 	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
 	for _, job := range jobs {
 		if ctx.Err() != nil {
 			break
 		}
 		for _, idx := range job.gpuIndices {
 			selectedGPUIndices[idx] = struct{}{}
 			status := perGPU[idx]
 			if status == nil {
 				status = &nvidiaGPUStatusFile{Index: idx}
 				perGPU[idx] = status
 			}
 			status.Selected = true
 		}
 		cmd := make([]string, 0, len(job.cmd))
 		for _, arg := range job.cmd {
 			cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
@@ -665,10 +774,37 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		var out []byte
 		var err error
-		if job.collectGPU {
+		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
-			out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
+			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
-		} else {
+				if logFunc != nil {
-			out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
+					logFunc(msg)
 				}
 				out = []byte(msg + "\n")
 				err = healthErr
 			}
 		}
 		if err == nil {
 			if job.collectGPU {
 				out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
 			} else {
 				out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
 			}
 		}
 		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
 			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
 				if logFunc != nil {
 					logFunc(msg)
 				}
 				if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
 					out = append(out, '\n')
 				}
 				out = append(out, []byte(msg+"\n")...)
 				if err == nil {
 					err = healthErr
 				}
 			}
 		}
 		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
@@ -679,6 +815,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		}
 		status, rc := classifySATResult(job.name, out, err)
 		stats.Add(status)
 		if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
 			for _, idx := range job.gpuIndices {
 				updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
 			}
 		}
 		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
 		fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
 		fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
@@ -687,6 +828,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
 		return "", err
 	}
 	if nvidiaPack {
 		if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
 			return "", err
 		}
 	}
 	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
 	if err := createTarGz(archive, runDir); err != nil {
@@ -695,6 +841,197 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 	return archive, nil
 }
 // updateNvidiaGPUStatus folds the outcome of one job into the record kept for
 // GPU idx, creating the record when the GPU has not been seen before.
 //
 // The record is overwritten only when status is at least as severe as the
 // status already stored (see nvidiaSATStatusSeverity); in that case the job
 // name and the first line of detail are stored alongside it.
 func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
 	rec := perGPU[idx]
 	if rec == nil {
 		rec = &nvidiaGPUStatusFile{Index: idx}
 		perGPU[idx] = rec
 	}
 	// A less severe result never replaces a more severe one.
 	if nvidiaSATStatusSeverity(status) < nvidiaSATStatusSeverity(rec.RunStatus) {
 		return
 	}
 	rec.RunStatus, rec.FailingJob, rec.Reason = status, jobName, firstLine(detail)
 }
 // writeNvidiaGPUStatusFiles writes one gpu-<index>-status.txt file into runDir
 // for every GPU known from perGPU, from the selected set, or from a fresh
 // health query (readNvidiaGPUHealth).
 //
 // perGPU is mutated: missing entries are created and existing ones are
 // enriched with name, health and selection data. GPUs without a recorded run
 // status inherit overall. A failing health query is tolerated; entries that
 // were not observed are written with health_status=UNKNOWN. The first file
 // write error is returned.
 func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
 	// ensure returns the record for idx, creating it on first use.
 	ensure := func(idx int) *nvidiaGPUStatusFile {
 		entry := perGPU[idx]
 		if entry == nil {
 			entry = &nvidiaGPUStatusFile{Index: idx}
 			perGPU[idx] = entry
 		}
 		return entry
 	}
 	// Best effort: the status files are still written when the query fails.
 	if health, err := readNvidiaGPUHealth(); err == nil {
 		for _, gpu := range health {
 			if gpu.ParseFailure {
 				// Unparseable rows carry the zero index; merging them would
 				// overwrite GPU 0 with a bogus name and health state.
 				continue
 			}
 			entry := ensure(gpu.Index)
 			entry.Name = gpu.Name
 			entry.Observed = true
 			entry.HealthRaw = gpu.RawLine
 			if !gpu.NeedsReset {
 				entry.Health = "OK"
 				continue
 			}
 			entry.Health = "RESET_REQUIRED"
 			// FAILED is the highest severity, so a reset-required GPU always
 			// ends up FAILED regardless of what the jobs reported.
 			entry.RunStatus = "FAILED"
 			if strings.TrimSpace(entry.Reason) == "" {
 				entry.Reason = "GPU requires reset"
 			}
 		}
 	}
 	for idx := range selected {
 		ensure(idx).Selected = true
 	}
 	// Map iteration order is random; sort for a deterministic write order.
 	indices := make([]int, 0, len(perGPU))
 	for idx := range perGPU {
 		indices = append(indices, idx)
 	}
 	sort.Ints(indices)
 	for _, idx := range indices {
 		entry := perGPU[idx]
 		if entry.RunStatus == "" {
 			entry.RunStatus = overall
 		}
 		if entry.Health == "" {
 			entry.Health = "UNKNOWN"
 		}
 		if entry.Name == "" {
 			entry.Name = "unknown"
 		}
 		var body strings.Builder
 		fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
 		fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
 		fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
 		fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
 		fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
 		fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
 		// Optional keys are emitted only when they carry information.
 		if strings.TrimSpace(entry.FailingJob) != "" {
 			fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
 		}
 		if strings.TrimSpace(entry.Reason) != "" {
 			fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
 		}
 		if strings.TrimSpace(entry.HealthRaw) != "" {
 			fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
 		}
 		target := filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx))
 		if err := os.WriteFile(target, []byte(body.String()), 0644); err != nil {
 			return err
 		}
 	}
 	return nil
 }
 // nvidiaSATStatusSeverity ranks a SAT status string so results can be compared.
 // Matching ignores case and surrounding whitespace: FAILED is 3, PARTIAL and
 // UNSUPPORTED are 2, OK is 1, and anything else (including "") is 0.
 func nvidiaSATStatusSeverity(status string) int {
 	normalized := strings.ToUpper(strings.TrimSpace(status))
 	if normalized == "FAILED" {
 		return 3
 	}
 	if normalized == "PARTIAL" || normalized == "UNSUPPORTED" {
 		return 2
 	}
 	if normalized == "OK" {
 		return 1
 	}
 	return 0
 }
 // firstLine returns the first line of s after trimming surrounding whitespace
 // from the whole input and then from that line. It returns "" for blank input.
 func firstLine(s string) string {
 	trimmed := strings.TrimSpace(s)
 	// SplitN always yields at least one element, also for the empty string.
 	head := strings.SplitN(trimmed, "\n", 2)[0]
 	return strings.TrimSpace(head)
 }
 // nvidiaJobNeedsHealthCheck reports whether a GPU health check applies to job:
 // either the job collects GPU metrics, or its name (case-insensitive) contains
 // one of the known GPU workload markers.
 func nvidiaJobNeedsHealthCheck(job satJob) bool {
 	if job.collectGPU {
 		return true
 	}
 	lowered := strings.ToLower(strings.TrimSpace(job.name))
 	for _, marker := range [...]string{"dcgmi", "gpu-burn", "gpu-stress", "dcgmproftester"} {
 		if strings.Contains(lowered, marker) {
 			return true
 		}
 	}
 	return false
 }
 // checkNvidiaJobHealth queries GPU health and reports GPUs that require a
 // reset. When selected is non-empty only those indices are considered;
 // otherwise every reported GPU is.
 //
 // It returns a multi-line message plus a non-nil error when at least one
 // considered GPU needs a reset. If the health query itself fails, the check is
 // treated as passed and ("", nil) is returned.
 func checkNvidiaJobHealth(selected []int) (string, error) {
 	health, err := readNvidiaGPUHealth()
 	if err != nil {
 		return "", nil
 	}
 	wanted := make(map[int]bool, len(selected))
 	for _, idx := range selected {
 		wanted[idx] = true
 	}
 	var report strings.Builder
 	for _, gpu := range health {
 		if !gpu.NeedsReset {
 			continue
 		}
 		// An empty selection means "check every GPU".
 		if len(wanted) > 0 && !wanted[gpu.Index] {
 			continue
 		}
 		if report.Len() == 0 {
 			report.WriteString("NVIDIA GPU health check failed:")
 		}
 		fmt.Fprintf(&report, "\ngpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine)
 	}
 	if report.Len() == 0 {
 		return "", nil
 	}
 	return report.String(), errors.New("nvidia gpu requires reset")
 }
 // readNvidiaGPUHealth runs nvidia-smi through satExecCommand with a CSV
 // per-GPU query and returns the rows parsed by parseNvidiaGPUHealth. A command
 // failure is returned wrapped with the "nvidia-smi" prefix.
 func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
 	args := []string{
 		"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
 		"--format=csv,noheader,nounits",
 	}
 	raw, err := satExecCommand("nvidia-smi", args...).Output()
 	if err != nil {
 		return nil, fmt.Errorf("nvidia-smi: %w", err)
 	}
 	return parseNvidiaGPUHealth(string(raw)), nil
 }
 // parseNvidiaGPUHealth turns comma-separated per-GPU lines into health
 // records. Blank lines are skipped. A line with fewer than two fields, or
 // whose first field is not an integer, yields a record with ParseFailure set
 // and only RawLine populated. NeedsReset is set when the line contains
 // "GPU requires reset" in any letter case.
 func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
 	var parsed []nvidiaGPUHealth
 	for _, rawLine := range strings.Split(strings.TrimSpace(raw), "\n") {
 		row := strings.TrimSpace(rawLine)
 		if row == "" {
 			continue
 		}
 		// Assume failure and replace the record once the row parses cleanly.
 		record := nvidiaGPUHealth{RawLine: row, ParseFailure: true}
 		// Only the first two fields are consumed individually.
 		fields := strings.SplitN(row, ",", 3)
 		if len(fields) >= 2 {
 			if idx, err := strconv.Atoi(strings.TrimSpace(fields[0])); err == nil {
 				record = nvidiaGPUHealth{
 					Index:      idx,
 					Name:       strings.TrimSpace(fields[1]),
 					NeedsReset: strings.Contains(strings.ToUpper(row), "GPU REQUIRES RESET"),
 					RawLine:    row,
 				}
 			}
 		}
 		parsed = append(parsed, record)
 	}
 	return parsed
 }
 func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
 	start := time.Now().UTC()
 	resolvedCmd, err := resolveSATCommand(cmd)
@@ -818,6 +1155,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
 		// nvidia-smi on a machine with no NVIDIA GPU
 		strings.Contains(text, "couldn't communicate with the nvidia driver") ||
 		strings.Contains(text, "no nvidia gpu") ||
 		// Some NVMe firmwares start self-test but never expose progress to nvme-cli
 		// while waiting, so the CLI stops polling without proving device failure.
 		(strings.Contains(name, "self-test") &&
 			strings.Contains(text, "no progress for") &&
 			strings.Contains(text, "stop waiting")) ||
 		(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
 		return "UNSUPPORTED", rc
 	}
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -216,6 +216,74 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
 	}
 }
 // TestParseNvidiaGPUHealthDetectsResetRequired feeds two CSV rows, one healthy
 // and one reporting "[GPU requires reset]", and checks the NeedsReset flags.
 func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
 	t.Parallel()
 	const sample = "0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n" +
 		"1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n"
 	gpus := parseNvidiaGPUHealth(sample)
 	if n := len(gpus); n != 2 {
 		t.Fatalf("len=%d want 2", n)
 	}
 	healthy, broken := gpus[0], gpus[1]
 	if healthy.NeedsReset {
 		t.Fatalf("gpu0 unexpectedly marked reset-required")
 	}
 	if !broken.NeedsReset {
 		t.Fatalf("gpu1 should be marked reset-required: %#v", broken)
 	}
 }
 // TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU replaces
 // nvidia-smi with a shell printf that reports GPU 1 as reset-required and
 // expects checkNvidiaJobHealth to fail for that selection.
 func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
 	const fakeSMI = "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'"
 	saved := satExecCommand
 	t.Cleanup(func() { satExecCommand = saved })
 	satExecCommand = func(name string, args ...string) *exec.Cmd {
 		if name != "nvidia-smi" {
 			return exec.Command(name, args...)
 		}
 		return exec.Command("sh", "-c", fakeSMI)
 	}
 	msg, err := checkNvidiaJobHealth([]int{1})
 	if err == nil {
 		t.Fatal("expected health check error")
 	}
 	namesGPU := strings.Contains(msg, "gpu 1")
 	namesCause := strings.Contains(strings.ToLower(msg), "requires reset")
 	if !(namesGPU && namesCause) {
 		t.Fatalf("unexpected message: %q", msg)
 	}
 }
 // TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles replaces nvidia-smi with a
 // shell printf, writes the status files for two GPUs into a temp dir and
 // checks the content of gpu-1-status.txt.
 func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
 	const fakeSMI = "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'"
 	outDir := t.TempDir()
 	saved := satExecCommand
 	t.Cleanup(func() { satExecCommand = saved })
 	satExecCommand = func(name string, args ...string) *exec.Cmd {
 		if name != "nvidia-smi" {
 			return exec.Command(name, args...)
 		}
 		return exec.Command("sh", "-c", fakeSMI)
 	}
 	statuses := map[int]*nvidiaGPUStatusFile{
 		0: {Index: 0, RunStatus: "OK"},
 		1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
 	}
 	chosen := map[int]struct{}{0: {}, 1: {}}
 	if err := writeNvidiaGPUStatusFiles(outDir, "FAILED", statuses, chosen); err != nil {
 		t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
 	}
 	raw, err := os.ReadFile(filepath.Join(outDir, "gpu-1-status.txt"))
 	if err != nil {
 		t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
 	}
 	text := string(raw)
 	// Checked in order; the first missing key aborts the test.
 	expectations := []struct{ needle, missing string }{
 		{"run_status=FAILED", "run status"},
 		{"health_status=RESET_REQUIRED", "health status"},
 		{"failing_job=02-dcgmi-targeted-stress.log", "failing job"},
 	}
 	for _, want := range expectations {
 		if !strings.Contains(text, want.needle) {
 			t.Fatalf("missing %s:\n%s", want.missing, text)
 		}
 	}
 }
 func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
 	oldLookPath := satLookPath
 	satLookPath = func(file string) (string, error) {
@@ -341,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
 	}{
 		{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
 		{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 		{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 		{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
 		{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 	}
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -28,6 +28,12 @@ var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
 	}
 	return a.ListNvidiaGPUs()
 }
 // apiListNvidiaGPUStatuses forwards to App.ListNvidiaGPUStatuses and returns
 // an "app not configured" error for a nil app. It is a package-level variable
 // holding a function value, so it can be reassigned.
 var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, error) {
 	if a != nil {
 		return a.ListNvidiaGPUStatuses()
 	}
 	return nil, fmt.Errorf("app not configured")
 }
 // ── Job ID counter ────────────────────────────────────────────────────────────
@@ -470,6 +476,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 		GPUIndices        []int  `json:"gpu_indices"`
 		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
 		RunNCCL           *bool  `json:"run_nccl"`
 		ParallelGPUs      *bool  `json:"parallel_gpus"`
 		DisplayName       string `json:"display_name"`
 	}
 	if r.Body != nil {
@@ -483,6 +490,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 	if body.RunNCCL != nil {
 		runNCCL = *body.RunNCCL
 	}
 	parallelGPUs := false
 	if body.ParallelGPUs != nil {
 		parallelGPUs = *body.ParallelGPUs
 	}
 	name := taskDisplayName("nvidia-benchmark", "", "")
 	if strings.TrimSpace(body.DisplayName) != "" {
 		name = body.DisplayName
@@ -493,6 +504,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 		SizeMB:            body.SizeMB,
 		BenchmarkProfile:  body.Profile,
 		RunNCCL:           runNCCL,
 		ParallelGPUs:      parallelGPUs,
 		DisplayName:       body.DisplayName,
 	}, name, h.opts.App, "benchmark-nvidia")
 	if err != nil {
@@ -782,6 +794,42 @@ func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
 	writeJSON(w, gpus)
 }
 // handleAPIGNVIDIAGPUStatuses writes the per-GPU NVIDIA status list as JSON.
 // It answers 503 when no app is configured and 500 when the lookup fails. A
 // nil result is replaced by an empty slice before encoding.
 func (h *handler) handleAPIGNVIDIAGPUStatuses(w http.ResponseWriter, _ *http.Request) {
 	application := h.opts.App
 	if application == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}
 	statuses, err := apiListNvidiaGPUStatuses(application)
 	switch {
 	case err != nil:
 		writeError(w, http.StatusInternalServerError, err.Error())
 	case statuses == nil:
 		writeJSON(w, []platform.NvidiaGPUStatus{})
 	default:
 		writeJSON(w, statuses)
 	}
 }
 // handleAPIGNVIDIAReset resets a single NVIDIA GPU.
 //
 // The request body is a JSON object {"index": N}. The index is mandatory and
 // must not be negative: a reset is disruptive, so a missing field must not
 // silently fall back to GPU 0. Malformed or incomplete requests get 400, a
 // missing app gets 503.
 //
 // The response is {"status": "ok"|"error", "output": "..."}. When the reset
 // fails without producing any output, the error text is returned as output so
 // the caller can see why it failed.
 func (h *handler) handleAPIGNVIDIAReset(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}
 	var req struct {
 		// Pointer so an absent or null "index" can be told apart from 0.
 		Index *int `json:"index"`
 	}
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
 		writeError(w, http.StatusBadRequest, "invalid request body")
 		return
 	}
 	if req.Index == nil || *req.Index < 0 {
 		writeError(w, http.StatusBadRequest, "index must be a non-negative GPU index")
 		return
 	}
 	result, err := h.opts.App.ResetNvidiaGPU(*req.Index)
 	status, output := "ok", result.Body
 	if err != nil {
 		status = "error"
 		if output == "" {
 			output = err.Error()
 		}
 	}
 	writeJSON(w, map[string]string{"status": status, "output": output})
 }
 func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1070,14 +1070,24 @@ func renderValidate(opts HandlerOptions) string {
 		)) +
 		`</div>
 <div style="height:1px;background:var(--border);margin:16px 0"></div>
 <div class="card" style="margin-bottom:16px">
  <div class="card-head">NVIDIA GPU Selection</div>
  <div class="card-body">
    <p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
    <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
    </div>
    <div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
    </div>
    <p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
  </div>
 </div>
 <div class="grid3">
-` + renderSATCard("nvidia-selection", "NVIDIA GPU Selection", "", "", renderValidateCardBody(
+` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
 		inv.NVIDIA,
 		`Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
 		`<code>nvidia-smi --query-gpu=index,name,memory.total</code>`,
 		`<div id="sat-gpu-list"><p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs…</p></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:8px"><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectAllGPUs()">Select all</button><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectNoGPUs()">Clear</button></div><div id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin-top:8px"></div>`,
 	)) +
 		renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
 			inv.NVIDIA,
 			`Runs NVIDIA diagnostics and board inventory checks.`,
 			`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
@@ -1615,6 +1625,10 @@ func renderBenchmark(opts HandlerOptions) string {
          <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
        </div>
      </div>
      <label class="benchmark-cb-row">
        <input type="checkbox" id="benchmark-parallel-gpus">
        <span>Run all selected GPUs simultaneously (parallel mode)</span>
      </label>
      <label class="benchmark-cb-row">
        <input type="checkbox" id="benchmark-run-nccl" checked>
        <span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
@@ -1740,10 +1754,12 @@ function runNvidiaBenchmark() {
    return;
  }
  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
  const parallelGPUs = !!document.getElementById('benchmark-parallel-gpus').checked;
  const body = {
    profile: document.getElementById('benchmark-profile').value || 'standard',
    gpu_indices: selected,
    run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
    parallel_gpus: parallelGPUs,
    display_name: 'NVIDIA Benchmark'
  };
  document.getElementById('benchmark-output').style.display = 'block';
@@ -1877,19 +1893,31 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 			displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
 			cells:       make(map[string]benchmarkHistoryCell),
 		}
 		// Count how many GPUs of each model appear in this run (for the label).
 		gpuModelCount := make(map[string]int)
 		for _, gpu := range result.GPUs {
-			key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
+			gpuModelCount[strings.TrimSpace(gpu.Name)]++
 		}
 		// Track best composite score per column key within this run.
 		runBest := make(map[string]float64)
 		for _, gpu := range result.GPUs {
 			key := benchmarkHistoryColumnKey(result.ServerModel, gpu.Name)
 			count := gpuModelCount[strings.TrimSpace(gpu.Name)]
 			columnByKey[key] = benchmarkHistoryColumn{
 				key:   key,
-				label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
+				label: benchmarkHistoryColumnLabel(result.ServerModel, gpu.Name, count),
 				name:  strings.TrimSpace(gpu.Name),
 				index: gpu.Index,
 			}
-			run.cells[key] = benchmarkHistoryCell{
+			if gpu.Scores.CompositeScore > runBest[key] {
-				score:   gpu.Scores.CompositeScore,
+				runBest[key] = gpu.Scores.CompositeScore
 				present: true,
 			}
 		}
 		for key, score := range runBest {
 			run.cells[key] = benchmarkHistoryCell{score: score, present: true}
 		}
 		runs = append(runs, run)
 	}
@@ -1898,13 +1926,10 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 		columns = append(columns, col)
 	}
 	sort.Slice(columns, func(i, j int) bool {
-		leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
+		li := strings.ToLower(columns[i].label)
-		rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
+		lj := strings.ToLower(columns[j].label)
-		if leftName != rightName {
+		if li != lj {
-			return leftName < rightName
+			return li < lj
 		}
 		if columns[i].index != columns[j].index {
 			return columns[i].index < columns[j].index
 		}
 		return columns[i].key < columns[j].key
 	})
@@ -1914,16 +1939,25 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 	return columns, runs
 }
-func benchmarkHistoryColumnKey(name string, index int) string {
+// benchmarkHistoryColumnKey groups results by server model + GPU model so that
-	return strings.TrimSpace(name) + "|" + strconv.Itoa(index)
+// runs on the same hardware produce one column regardless of individual GPU index.
 func benchmarkHistoryColumnKey(serverModel, gpuName string) string {
 	// Both parts are trimmed so padding differences do not split a column.
 	server := strings.TrimSpace(serverModel)
 	gpu := strings.TrimSpace(gpuName)
 	return strings.Join([]string{server, gpu}, "|")
 }
-func benchmarkHistoryColumnLabel(name string, index int) string {
+// benchmarkHistoryColumnLabel formats the column header as
-	name = strings.TrimSpace(name)
+// "Server Model (N× GPU Model)" or "GPU Model" when server info is missing.
-	if name == "" {
+func benchmarkHistoryColumnLabel(serverModel, gpuName string, count int) string {
-		return fmt.Sprintf("GPU %d", index)
+	serverModel = strings.TrimSpace(serverModel)
 	gpuName = strings.TrimSpace(gpuName)
 	if gpuName == "" {
 		gpuName = "Unknown GPU"
 	}
-	return fmt.Sprintf("%s / GPU %d", name, index)
+	gpuPart := fmt.Sprintf("%d× %s", count, gpuName)
 	if serverModel == "" {
 		return gpuPart
 	}
 	return fmt.Sprintf("%s (%s)", serverModel, gpuPart)
 }
 // ── Burn ──────────────────────────────────────────────────────────────────────
@@ -2442,7 +2476,7 @@ func renderNetwork() string {
 func renderServicesInline() string {
 	return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
-<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
+<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
 <div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
 <div id="svc-out" style="display:none;margin-top:12px">
  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
@@ -2513,11 +2547,6 @@ function svcAction(btn, name, action) {
      btn.disabled = false;
    });
 }
 function restartGPUDrivers() {
  var btn = document.querySelector('[onclick*="restartGPUDrivers"]');
  if (!btn) { svcAction({textContent:'',disabled:false}, 'bee-nvidia', 'restart'); return; }
  svcAction(btn, 'bee-nvidia', 'restart');
 }
 loadServices();
 </script>`
 }
@@ -2777,6 +2806,124 @@ loadDisplays();
 </script>`
 }
 // renderNvidiaSelfHealInline returns the HTML and inline JavaScript for the
 // "NVIDIA Self Heal" card body: a driver restart button, a refresh button, a
 // per-GPU status table with reset buttons, and an output terminal.
 //
 // The script reads GET /api/gpu/nvidia-status, posts to /api/gpu/nvidia-reset
 // and /api/services/action, and calls loadServices(), which this function does
 // not define. Every string taken from the status response is HTML-escaped
 // before it is placed into the table markup.
 func renderNvidiaSelfHealInline() string {
 	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
 <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
   <button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
   <button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">&#8635; Refresh</button>
 </div>
 <div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
 <div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
 <div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
   <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
     <span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
     <span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
   </div>
   <div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
 </div>
 <script>
 // Escapes a value for safe use inside HTML text and attribute values.
 function nvidiaSelfHealEscape(value) {
   return String(value == null ? '' : value)
     .replace(/&/g, '&amp;')
     .replace(/</g, '&lt;')
     .replace(/>/g, '&gt;')
     .replace(/"/g, '&quot;')
     .replace(/'/g, '&#39;');
 }
 function nvidiaSelfHealShowResult(label, status, output) {
   var out = document.getElementById('nvidia-self-heal-out');
   var term = document.getElementById('nvidia-self-heal-terminal');
   var statusEl = document.getElementById('nvidia-self-heal-out-status');
   var labelEl = document.getElementById('nvidia-self-heal-out-label');
   out.style.display = 'block';
   labelEl.textContent = label;
   term.textContent = output || '(no output)';
   term.scrollTop = term.scrollHeight;
   if (status === 'ok') {
     statusEl.textContent = '✓ done';
     statusEl.style.color = 'var(--ok-fg, #2c662d)';
   } else {
     statusEl.textContent = '✗ failed';
     statusEl.style.color = 'var(--crit-fg, #9f3a38)';
   }
 }
 function nvidiaRestartDrivers() {
   var btn = document.getElementById('nvidia-restart-btn');
   var original = btn.textContent;
   btn.disabled = true;
   btn.textContent = 'Restarting...';
   nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
   fetch('/api/services/action', {
     method:'POST',
     headers:{'Content-Type':'application/json'},
     body:JSON.stringify({name:'bee-nvidia', action:'restart'})
   }).then(r=>r.json()).then(d => {
     nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
     setTimeout(function() {
       loadServices();
       loadNvidiaSelfHeal();
     }, 800);
   }).catch(e => {
     nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
   }).finally(() => {
     btn.disabled = false;
     btn.textContent = original;
   });
 }
 function nvidiaResetGPU(index, btn) {
   var original = btn.textContent;
   btn.disabled = true;
   btn.textContent = 'Resetting...';
   nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
   fetch('/api/gpu/nvidia-reset', {
     method:'POST',
     headers:{'Content-Type':'application/json'},
     body:JSON.stringify({index:index})
   }).then(r=>r.json()).then(d => {
     nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || '(no output)');
     setTimeout(loadNvidiaSelfHeal, 1000);
   }).catch(e => {
     nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
   }).finally(() => {
     btn.disabled = false;
     btn.textContent = original;
   });
 }
 function loadNvidiaSelfHeal() {
   var status = document.getElementById('nvidia-self-heal-status');
   var table = document.getElementById('nvidia-self-heal-table');
   status.textContent = 'Loading NVIDIA GPU status...';
   status.style.color = 'var(--muted)';
   table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
   fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
     if (!Array.isArray(gpus) || gpus.length === 0) {
       status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
       table.innerHTML = '';
       return;
     }
     status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
     const rows = gpus.map(g => {
       // Coerce to a number so nothing but digits can reach the onclick handler.
       const index = Number(g.index);
       const serial = g.serial || '';
       const bdf = g.bdf || '';
       const id = serial || bdf || ('gpu-' + index);
       const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
       const details = [];
       if (serial) details.push('serial ' + serial);
       if (bdf) details.push('bdf ' + bdf);
       if (g.parse_failure && g.raw_line) details.push(g.raw_line);
       return '<tr>'
         + '<td style="white-space:nowrap">' + index + '</td>'
         + '<td>' + nvidiaSelfHealEscape(g.name || 'unknown') + '</td>'
         + '<td style="font-family:monospace">' + nvidiaSelfHealEscape(id) + '</td>'
         + '<td><span class="badge ' + badge + '">' + nvidiaSelfHealEscape(g.status || 'UNKNOWN') + '</span>'
         + (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + nvidiaSelfHealEscape(details.join(' | ')) + '</div>' : '')
         + '</td>'
         + '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + index + ', this)">Reset GPU</button></td>'
         + '</tr>';
     }).join('');
     table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
   }).catch(e => {
     status.textContent = 'Error loading NVIDIA GPU status: ' + e;
     status.style.color = 'var(--crit-fg, #9f3a38)';
     table.innerHTML = '';
   });
 }
 loadNvidiaSelfHeal();
 </script>`
 }
 // ── Tools ─────────────────────────────────────────────────────────────────────
 func renderTools() string {
@@ -2837,6 +2984,9 @@ function installToRAM() {
 <div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">&#8635; Check</button></div>
 <div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
 <div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
 		renderNvidiaSelfHealInline() + `</div></div>
 <div class="card"><div class="card-head">Network</div><div class="card-body">` +
 		renderNetworkInline() + `</div></div>
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -302,6 +302,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
 	mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
 	mux.HandleFunc("GET /api/gpu/nvidia-status", h.handleAPIGNVIDIAGPUStatuses)
 	mux.HandleFunc("POST /api/gpu/nvidia-reset", h.handleAPIGNVIDIAReset)
 	mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
 	// System
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -591,7 +591,7 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
 	}
 }
-func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
+func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
@@ -599,11 +599,20 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	if !strings.Contains(body, `NVIDIA Self Heal`) {
 		t.Fatalf("tools page missing nvidia self heal section: %s", body)
 	}
 	if !strings.Contains(body, `Restart GPU Drivers`) {
 		t.Fatalf("tools page missing restart gpu drivers button: %s", body)
 	}
-	if !strings.Contains(body, `restartGPUDrivers()`) {
+	if !strings.Contains(body, `nvidiaRestartDrivers()`) {
-		t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
+		t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
 	}
 	if !strings.Contains(body, `/api/gpu/nvidia-status`) {
 		t.Fatalf("tools page missing nvidia status api usage: %s", body)
 	}
 	if !strings.Contains(body, `nvidiaResetGPU(`) {
 		t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
 	}
 	if !strings.Contains(body, `id="boot-source-text"`) {
 		t.Fatalf("tools page missing boot source field: %s", body)
@@ -711,6 +720,8 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 		`controlled NVIDIA DCGM load`,
 		`<code>dcgmi diag targeted_stress</code>`,
 		`NVIDIA GPU Selection`,
 		`All NVIDIA validate tasks use only the GPUs selected here.`,
 		`Select All`,
 		`id="sat-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -123,6 +123,7 @@ type taskParams struct {
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
 	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
@@ -585,6 +586,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			GPUIndices:        t.params.GPUIndices,
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
 			RunNCCL:           t.params.RunNCCL,
 			ParallelGPUs:      t.params.ParallelGPUs,
 		}, j.append)
 	case "nvidia-compute":
 		if a == nil {
--- a/2
+++ b/2
--- a/bible-local/docs/benchmark-clock-calibration.md
+++ b/bible-local/docs/benchmark-clock-calibration.md
@@ -0,0 +1,248 @@
 # Benchmark clock calibration research
 ## Status
 In progress. Baseline data from production servers pending.
 ## Background
 The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
 before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
 `avg_steady_clock < locked_target * 0.90`.
 Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
 even a healthy GPU in a non-ideal server will sustain clocks well below boost.
 The 90% threshold has no empirical basis.
 ## Key observations (2026-04-06)
 ### H100 PCIe — new card, server not designed for it
 - avg clock 1384 MHz, P95 1560 MHz (unstable; boost is probably 1755 MHz)
 - Thermal sustain: 0.0 (sw_thermal covers entire steady window)
 - Stability: 70.0 — clocks erratic, no equilibrium found
 - Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
 ### H200 NVL — new card, server not designed for it
 - avg clock = P95 = 1635 MHz (perfectly stable)
 - Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
 - Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
 - Degradation: power_capped, thermal_limited
 - Compute: 989 TOPS — card is computing correctly for its frequency
 ### Key insight
 The meaningful distinction is not *whether* the card throttles but *how stably*
 it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
 H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
 instability may reflect a more severe thermal mismatch or a card issue.
 `sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
 `hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
 ## Hypothesis for baseline
 After testing on servers designed for their GPUs (proper cooling):
 - Healthy GPU under sustained load will run at a stable fraction of boost
 - Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
 - Base clock (`clocks.base.gr`) may be a better reference than boost:
  a healthy card under real workload should comfortably exceed base clock
 ## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
 Source: external stress test tool, ~90s runs, designed server, adequate power.
 ### Healthy fingerprint
 - **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
 - **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
  - Avg steady (visual): **~1580–1620 MHz**
  - vs boost 1755 MHz: **~91–92%**
  - Oscillation is NORMAL — this is the boost algorithm balancing under power cap
  - Stable power + oscillating clocks = healthy power-cap behavior
 - **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
 - **Consistency**: all 10 samples within ±20 MHz — very repeatable
 ### Characteristic pattern
 Flat power line + oscillating/declining clock line = GPU correctly managed by
 power cap algorithm. Do NOT flag this as instability.
 ### Clock CV implication
 The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
 The current `variance_too_high` threshold (StabilityScore < 85) may fire on
 healthy HBM2e PCIe cards. Needs recalibration.
 ---
 ## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
 Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
 Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
 ### GPU clock reference (from nvidia-smi, idle):
 - base_clock_mhz: **1095**
 - boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
 - achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
 - Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
 ### Observed under 700W sustained load (both samples nearly identical):
 - Power: ~700W flat — SXM slot, adequate power confirmed
 - Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
 - vs 1980 MHz (lock target): **72–74%** — severely below
 - vs 1755 MHz (nvidia-smi boost): **81–83%**
 - vs 1095 MHz (base): 130% — above base but far below expected for SXM
 - Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
 - Temperature: 38°C → 79–80°C (same rate as HBM2e)
 - Oscillation: present, similar character to HBM2e but at much lower frequency
 ### Diagnosis
 These restored cards are degraded. A healthy H100 SXM in a designed server
 (DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
 The 72–74% result is a clear signal of silicon or VRM degradation from the
 refurbishment process.
 ### Clock pattern note
 Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
 to images 19/20. Both sample sets show same degraded pattern — same batch.
 ---
 ## Baseline matrix (filled where data available)
 | GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
 |---|---|---|---|---|---|
 | H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
 | H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
 | H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
 | H200 NVL | designed | TBD | TBD | TBD | need baseline |
 ---
 ## H100 official spec (from NVIDIA datasheet)
 Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
 All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
 | Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
 |---|---|---|---|---|---|
 | H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
 | H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
 | H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
 | H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
 Notes:
 - SXM boards do NOT list FP8 peak in this table (field empty)
 - fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
 - Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
 ## Observed efficiency (H100 80GB PCIe, throttled server)
 From the report in this session (power+thermal throttle throughout steady):
 | Precision | Measured | Spec (dense) | % of spec |
 |---|---|---|---|
 | fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
 | fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
 | fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
 33–44% of spec is expected given sustained power+thermal throttle (avg clock
 1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
 actual frequency — the low TOPS comes from throttle, not silicon defect.
 ## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
 Format: without sparsity / with sparsity.
 | Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
 |---|---|---|---|---|---|
 | H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
 | H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
 ## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
 Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
 | Precision | Measured | Spec (dense) | % of spec |
 |---|---|---|---|
 | fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
 | fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
 | fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
 Comparable to H100 PCIe efficiency (33–44%) despite different architecture —
 both are throttle-limited. Confirms that % of spec is not a quality signal,
 it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
 ## Real-world GEMM efficiency reference (2026-04-06, web research)
 Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
 worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
 ### What healthy systems actually achieve:
 - H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
 - cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
 - H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
 ### Our results vs expectation:
 | GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
 |---|---|---|---|---|
 | H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
 | H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
 Our results are roughly **half** of what a healthy system achieves even under throttle.
 This is NOT normal — 30-44% is not the industry baseline.
 ### Likely causes of the gap (in order of probability):
 1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
 2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
   Previous user may have set a lower limit via nvidia-smi -pl and it was not
   reset. Our normalization sets clock locks but does NOT reset power limit.
   Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
 3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
   8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
 ### Power limit gap analysis (H100 PCIe):
 - Avg clock 1384 MHz = 79% of boost 1755 MHz
 - Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
 - Actually measured: 329 TOPS = 55% of that estimate
 - Remaining gap after accounting for clock throttle: ~45%
 - Most likely explanation: enforced power limit < 350W TDP, further reducing
  sustainable clock beyond what sw_thermal alone would cause.
 ### Action item:
 Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
 so result.json shows if the card was pre-configured with a non-default limit.
 If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
 ### CPU/RAM impact on GPU FLOPS:
 None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
 CPU core count and host RAM are irrelevant.
 ## Compute efficiency metric (proposed, no hardcode)
 Instead of comparing TOPS to a hardcoded spec, compute:
  tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
 This is model-agnostic. A GPU computing correctly at its actual frequency
 will show a consistent tops_per_sm_per_ghz regardless of throttle level.
 A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
 normal clocks.
 SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
 (needs to be added to queryBenchmarkGPUInfo).
 Reference values to establish after baseline runs:
 - H100 PCIe fp16_tensor: TBD tops/SM/GHz
 - H100 SXM fp16_tensor: TBD tops/SM/GHz
 ## Proposed threshold changes (pending more data)
 1. **`low_sm_clock_vs_target`**: lower threshold from 90% to 85% based on observed
   91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
   capture the root cause.
 2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
   under power cap. Consider suppressing this flag when power is flat and usage
   is 100% (oscillation is expected). Or lower threshold to 70.
 3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
   ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
   would have been caught by this).
 Decision deferred until baseline on SXM designed servers collected.
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -606,6 +606,20 @@ struct prepared_profile {
 };
 static const struct profile_desc k_profiles[] = {
    {
        "fp64",
        "fp64",
        80,
        1,
        0,
        0,
        8,
        CUDA_R_64F,
        CUDA_R_64F,
        CUDA_R_64F,
        CUDA_R_64F,
        CUBLAS_COMPUTE_64F,
    },
    {
        "fp32_tf32",
        "fp32",
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -5,69 +5,110 @@ echo "=== generating bee wallpaper ==="
 mkdir -p /usr/share/bee
 python3 - <<'PYEOF'
-from PIL import Image, ImageDraw, ImageFont
+from PIL import Image, ImageDraw, ImageFont, ImageFilter
 import os
 W, H = 1920, 1080
-LOGO = """\
+GLYPHS = {
-  \u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2557   \u2588\u2588\u2557      \u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557
+    'E': ["11111", "10000", "11110", "10000", "10000", "10000", "11111"],
-  \u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u255a\u2588\u2588\u2557 \u2588\u2588\u2554\u255d      \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d
+    'A': ["01110", "10001", "10001", "11111", "10001", "10001", "10001"],
-  \u2588\u2588\u2588\u2588\u2588\u2557  \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u255a\u2588\u2588\u2588\u2588\u2554\u255d \u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2557  \u2588\u2588\u2588\u2588\u2588\u2557
+    'S': ["01111", "10000", "10000", "01110", "00001", "00001", "11110"],
-  \u2588\u2588\u2554\u2550\u2550\u255d  \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2551\u255a\u2550\u2550\u2550\u2550\u2588\u2588\u2551  \u255a\u2588\u2588\u2554\u255d  \u255a\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u255d  \u2588\u2588\u2554\u2550\u2550\u255d
+    'Y': ["10001", "10001", "01010", "00100", "00100", "00100", "00100"],
-  \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2551  \u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551   \u2588\u2588\u2551         \u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557
+    'B': ["11110", "10001", "10001", "11110", "10001", "10001", "11110"],
-  \u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u255d  \u255a\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d   \u255a\u2550\u255d         \u255a\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d
+    '-': ["00000", "00000", "11111", "00000", "00000", "00000", "00000"],
-  Hardware Audit LiveCD"""
+}
-# Find a monospace font that supports box-drawing characters
+TITLE = "EASY-BEE"
-FONT_CANDIDATES = [
+SUBTITLE = "Hardware Audit LiveCD"
-    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf',
+CELL = 30
-    '/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf',
+GLYPH_GAP = 18
-    '/usr/share/fonts/truetype/freefont/FreeMono.ttf',
+ROW_GAP = 6
-    '/usr/share/fonts/truetype/noto/NotoMono-Regular.ttf',
+
 FG = (0xF6, 0xD0, 0x47)
 FG_DIM = (0xD4, 0xA9, 0x1C)
 SHADOW = (0x5E, 0x47, 0x05)
 SUB = (0x96, 0x7A, 0x17)
 BG = (0x05, 0x05, 0x05)
 SUB_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
 ]
 font_path = None
 for p in FONT_CANDIDATES:
    if os.path.exists(p):
        font_path = p
        break
-SIZE = 22
+def load_font(size):
-if font_path:
+    for path in SUB_FONT_CANDIDATES:
-    font_logo = ImageFont.truetype(font_path, SIZE)
+        if os.path.exists(path):
-    font_sub  = ImageFont.truetype(font_path, SIZE)
+            return ImageFont.truetype(path, size)
-else:
+    return ImageFont.load_default()
    font_logo = ImageFont.load_default()
    font_sub  = font_logo
-img  = Image.new('RGB', (W, H), (0, 0, 0))
+
 def glyph_width(ch):
    """Return the number of columns in the bitmap for character `ch`.

    The width is taken from the first row string of GLYPHS[ch]; a character
    that has no entry in GLYPHS raises KeyError.
    """
    first_row = GLYPHS[ch][0]
    return len(first_row)
 def render_logo_mask():
    """Rasterize TITLE into a single-channel ('L') mask built from GLYPHS.

    Every '1' in a glyph row becomes a filled rounded square (value 255) on a
    zero background. Glyphs are laid out left to right; neighbouring glyphs
    are separated by one blank cell plus GLYPH_GAP pixels, and rows are
    separated by ROW_GAP pixels.

    Returns the mask image; its size is the full extent of the rendered title.
    """
    count = len(TITLE)
    final_idx = count - 1

    # Width in cells: all glyph columns plus one spacer cell between neighbours.
    spacer_cells = max(final_idx, 0)
    cells_across = sum(glyph_width(ch) for ch in TITLE) + spacer_cells

    size = (cells_across * CELL + final_idx * GLYPH_GAP, 7 * CELL + 6 * ROW_GAP)
    mask = Image.new('L', size, 0)
    pen = ImageDraw.Draw(mask)

    row_pitch = CELL + ROW_GAP
    # Squares are drawn 4 px smaller than the cell so adjacent cells stay visually separate.
    square = CELL - 4
    left = 0
    for pos, ch in enumerate(TITLE):
        for r, bits in enumerate(GLYPHS[ch]):
            top = r * row_pitch
            for c, bit in enumerate(bits):
                if bit == '1':
                    x = left + c * CELL
                    pen.rounded_rectangle((x, top, x + square, top + square), radius=4, fill=255)
        # Advance past this glyph, then past the inter-glyph spacing (not after the last one).
        left += glyph_width(ch) * CELL
        if pos != final_idx:
            left += CELL + GLYPH_GAP
    return mask
 img = Image.new('RGB', (W, H), BG)
 draw = ImageDraw.Draw(img)
-# Measure logo block line by line to avoid font ascender offset
+# Soft amber glow under the logo without depending on font rendering.
-lines = LOGO.split('\n')
+glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
-logo_lines = lines[:6]
+glow_draw = ImageDraw.Draw(glow)
-sub_line   = lines[6] if len(lines) > 6 else ''
+glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
 glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
 glow = glow.filter(ImageFilter.GaussianBlur(60))
 img = Image.alpha_composite(img.convert('RGBA'), glow)
-line_h = SIZE + 2
+logo_mask = render_logo_mask()
-block_h = len(logo_lines) * line_h + 8 + (SIZE if sub_line else 0)
+logo_w, logo_h = logo_mask.size
 logo_x = (W - logo_w) // 2
 logo_y = 290
-# Width: measure the widest logo line
+shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
-max_w = 0
+img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
-for line in logo_lines:
+img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
-    bb = draw.textbbox((0, 0), line, font=font_logo)
+img.paste(FG, (logo_x, logo_y), logo_mask)
    max_w = max(max_w, bb[2] - bb[0])
-x = (W - max_w) // 2
+font_sub = load_font(30)
-y = (H - block_h) // 2
+sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
 sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
 sub_y = logo_y + logo_h + 54
 draw = ImageDraw.Draw(img)
 draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
 draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
-cy = y
+img = img.convert('RGB')
 for line in logo_lines:
    draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
    cy += line_h
 cy += 8
 if sub_line:
    draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))
 img.save('/usr/share/bee/wallpaper.png', optimize=True)
 print('wallpaper written: /usr/share/bee/wallpaper.png')
--- a/iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
+++ b/iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
@@ -0,0 +1,41 @@
 #!/bin/sh
 # 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
 #
 # live-boot tries "losetup --replace --direct-io=on" when re-associating the
 # loop device to the RAM copy in /dev/shm.  tmpfs does not support O_DIRECT,
 # so the ioctl returns EINVAL and the verification step fails.
 #
 # This hook edits the live-boot script in place with sed:
 #   - the --direct-io flag is removed from every "losetup --replace" call
 #     (no retry/fallback logic is generated; the flag is simply dropped);
 #   - the "Task finished with error." message is rewritten into a warning.
 # If the live-boot script is not present the hook exits successfully without
 # changing anything.
 set -e
 TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"
 # Nothing to patch when live-boot's toram script is absent from the image.
 if [ ! -f "${TORAM_SCRIPT}" ]; then
    echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
    exit 0
 fi
 echo "9010-fix-toram: patching ${TORAM_SCRIPT}"
 # 1. Strip --direct-io=on from the losetup --replace call so it works on tmpfs.
 #    The second expression catches a bare "--direct-io" spelling.
 #    NOTE(review): if upstream ever uses another value (e.g. --direct-io=off),
 #    the second expression would leave "=off" attached to "--replace" —
 #    confirm live-boot only emits "=on" or the bare flag.
 sed -i 's/losetup --replace --direct-io=on/losetup --replace/g' "${TORAM_SCRIPT}"
 sed -i 's/losetup --replace --direct-io/losetup --replace/g' "${TORAM_SCRIPT}"
 # 2. Turn the hard error into a warning so boot continues.
 #    live-boot prints this exact string when verification fails.
 #    Only the echoed text is rewritten here; any exit/panic statement that
 #    follows it in live-boot is left untouched.
 #    NOTE(review): confirm that changing the message alone lets boot proceed.
 sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"
 echo "9010-fix-toram: patch applied"
 # Print the remaining losetup lines to the build log for inspection;
 # "|| true" keeps "set -e" from failing the hook when grep finds nothing.
 grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true
Author	SHA1	Message	Date
Mikhail Chusavitin	0a4bb596f6	Improve install-to-RAM verification for ISO boots	2026-04-07 20:21:06 +03:00
Mikhail Chusavitin	531d1ca366	Add NVIDIA self-heal tools and per-GPU SAT status	2026-04-07 20:20:05 +03:00
Mikhail Chusavitin	93cfa78e8c	Benchmark: parallel GPU mode, resilient inventory query, server model in results - Add parallel GPU mode (checkbox, off by default): runs all selected GPUs simultaneously via a single bee-gpu-burn invocation instead of sequentially; per-GPU telemetry, throttle counters, TOPS, and scoring are preserved - Make queryBenchmarkGPUInfo resilient: falls back to a base field set when extended fields (attribute.multiprocessor_count, power.default_limit) cause exit status 2, preventing lgc normalization from being silently skipped - Log explicit "graphics clock lock skipped" note when inventory is unavailable - Collect server model from DMI (/sys/class/dmi/id/product_name) and store in result JSON; benchmark history columns now show "Server Model (N× GPU Model)" grouped by server+GPU type rather than individual GPU index Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-07 18:32:15 +03:00
Mikhail Chusavitin	1358485f2b	fix logo wallpaper	2026-04-07 10:15:38 +03:00
Michael Chus	8fe20ba678	Fix benchmark scoring: PowerSustain uses default power limit PowerSustainScore now uses DefaultPowerLimitW as reference so a manually reduced power limit does not inflate the score. Falls back to enforced limit if default is unavailable. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 22:30:59 +03:00
Michael Chus	d973231f37	Enhance benchmark: server power via IPMI, efficiency metrics, FP64, power limit check - Sample server power (IPMI dcmi) during baseline+steady phases in parallel; compute delta vs GPU-reported sum; flag ratio < 0.75 as unreliable reporting - Collect base_graphics_clock_mhz, multiprocessor_count, default_power_limit_w from nvidia-smi alongside existing GPU info - Add tops_per_sm_per_ghz efficiency metric (model-agnostic silicon quality signal) - Flag when enforced power limit is below default TDP by >5% - Add fp64 profile to bee-gpu-burn worker (CUDA_R_64F, CUBLAS_COMPUTE_64F, min cc 8.0) - Improve Executive Summary: overall pass count, FAILED GPU finding - Throttle counters now shown as % of steady window instead of raw microseconds - bible-local: clock calibration research, H100/H200 spec, real-world GEMM baselines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 22:26:52 +03:00
Michael Chus	f5d175f488	Fix toram: patch live-boot to not use O_DIRECT when replacing loop to tmpfs losetup --replace --direct-io=on fails with EINVAL when the target file is on tmpfs (/dev/shm), because tmpfs does not support O_DIRECT. Strip the --direct-io flag from the replace call and downgrade the verification failure to a warning so boot continues. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 21:06:21 +03:00
Michael Chus	fa00667750	Refactor NVIDIA GPU Selection into standalone card on validate page Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 21:06:16 +03:00