Fix USB/RAM status checks; add server model+S/N to dashboard; remove cycles

USB Export Drive: lsblk reports TRAN only for whole disks, not partitions (/dev/sdc1). Strip trailing partition digits to get parent disk before transport check. LiveCD in RAM: When RunInstallToRAM copies squashfs to /dev/shm/bee-live/ but bind-mount of /run/live/medium fails (CD-ROM boots), /run/live/medium still shows the CD-ROM fstype. Add fallback: if /dev/shm/bee-live/*.squashfs exists, the data is in RAM — report status OK. Dashboard Hardware Summary: Show server Manufacturer + ProductName as heading and S/N as subline above the component table, sourced from hw.Board (dmidecode system-type data). Validate: Remove Cycles input — always run once. cycles=1 hardcoded in runAllSAT(). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Include profile and mode in benchmark task names for task list clarity
2026-04-12 22:46:42 +03:00 · 2026-04-12 22:36:51 +03:00 · 2026-04-12 22:33:17 +03:00 · 2026-04-12 22:30:47 +03:00 · 2026-04-12 22:17:56 +03:00 · 2026-04-12 22:06:46 +03:00
50 changed files with 2292 additions and 1146 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -117,7 +117,7 @@ type satRunner interface {
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
-	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
@@ -139,7 +139,6 @@ type satRunner interface {
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
 	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error)
 }
 type runtimeChecker interface {
@@ -191,6 +190,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 	}
 	result := collector.Run(runtimeMode)
 	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
 	writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
@@ -567,11 +567,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }
-func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
 }
 func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -738,13 +738,6 @@ func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
 	return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
 }
 // RunHPL forwards an HPL run to the configured SAT runner and returns the
 // runner's results unchanged. Unlike some sibling wrappers, baseDir is passed
 // through as-is (no default is substituted here). A nil receiver yields an
 // "app not configured" error.
 func (a *App) RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error) {
 	if a == nil {
 		return "", nil, fmt.Errorf("app not configured")
 	}
 	return a.sat.RunHPL(ctx, baseDir, opts, logFunc)
 }
 func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
 	path, err := a.RunFanStressTest(ctx, "", opts)
 	body := formatFanStressResult(path)
@@ -934,6 +927,41 @@ func bodyOr(body, fallback string) string {
 	return body
 }
 // writePSUStatusesToDB stores the per-PSU statuses gathered by the audit in
 // the component-status DB (source "audit:ipmi"), keyed "psu:<slot>", and then
 // writes a "psu:all" roll-up entry.
 //
 // PSUs without a Status are skipped; a PSU without a Slot is keyed "psu:?".
 // The roll-up is "Critical" if any recorded PSU is Critical, otherwise
 // "Warning" if any is Warning, otherwise "OK". Nothing is written when db is
 // nil or psus is empty.
 func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
 	if db == nil {
 		return
 	}
 	if len(psus) == 0 {
 		return
 	}
 	const origin = "audit:ipmi"
 	aggregate := "OK"
 	for i := range psus {
 		unit := &psus[i]
 		if unit.Status == nil {
 			continue
 		}
 		st := *unit.Status
 		slotLabel := "?"
 		if unit.Slot != nil {
 			slotLabel = *unit.Slot
 		}
 		description := ""
 		if unit.ErrorDescription != nil {
 			description = *unit.ErrorDescription
 		}
 		db.Record("psu:"+slotLabel, origin, st, description)
 		// Critical always wins; Warning only upgrades an OK roll-up.
 		if st == "Critical" {
 			aggregate = "Critical"
 		} else if st == "Warning" && aggregate != "Critical" {
 			aggregate = "Warning"
 		}
 	}
 	db.Record("psu:all", origin, aggregate, "")
 }
 func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
 	raw, err := os.ReadFile(path)
 	if err != nil {
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
 	return f.runNvidiaFn(baseDir)
 }
-func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
 	if f.runNvidiaComputeFn != nil {
 		return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
 	}
@@ -282,9 +282,6 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 // RunNCCLTests is a stub for the fake SAT runner: it ignores every argument
 // and returns an empty path with a nil error.
 func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
 	return "", nil
 }
 // RunHPL is a stub for the fake SAT runner: it ignores every argument and
 // returns an empty path, a nil result, and a nil error.
 func (f fakeSAT) RunHPL(_ context.Context, _ string, _ platform.HPLOptions, _ func(string)) (string, *platform.HPLResult, error) {
 	return "", nil, nil
 }
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()
@@ -545,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
 }
 func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
 	t.Parallel()
 	tmp := t.TempDir()
 	oldExportDir := DefaultExportDir
 	DefaultExportDir = tmp
@@ -583,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
 }
 func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
 	t.Parallel()
 	tmp := t.TempDir()
 	oldExportDir := DefaultExportDir
 	DefaultExportDir = tmp
@@ -646,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
 }
 func TestRunSATDefaultsToExportDir(t *testing.T) {
 	t.Parallel()
 	oldSATBaseDir := DefaultSATBaseDir
 	DefaultSATBaseDir = "/tmp/export/bee-sat"
 	t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -54,7 +54,7 @@ if ! command -v lspci >/dev/null 2>&1; then
  exit 0
 fi
 found=0
-for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
+	for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
  found=1
  echo "=== GPU $gpu ==="
  lspci -s "$gpu" -vv 2>&1 || true
@@ -74,6 +74,11 @@ fi
 for d in /sys/bus/pci/devices/*/; do
  vendor=$(cat "$d/vendor" 2>/dev/null)
 	  [ "$vendor" = "0x10de" ] || continue
 	  class=$(cat "$d/class" 2>/dev/null)
 	  case "$class" in
 	    0x030000|0x030200) ;;
 	    *) continue ;;
 	  esac
 	  dev=$(basename "$d")
  echo "=== $dev ==="
  for f in current_link_speed current_link_width max_link_speed max_link_width; do
@@ -192,7 +197,7 @@ var supportBundleOptionalFiles = []struct {
 	{name: "system/syslog.txt", src: "/var/log/syslog"},
 }
-const supportBundleGlob = "bee-support-*.tar.gz"
+const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
 func BuildSupportBundle(exportDir string) (string, error) {
 	exportDir = strings.TrimSpace(exportDir)
@@ -206,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}
-	host := sanitizeFilename(hostnameOr("unknown"))
+	now := time.Now().UTC()
-	ts := time.Now().UTC().Format("20060102-150405")
+	date := now.Format("2006-01-02")
-	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
+	tod := now.Format("150405")
 	ver := bundleVersion()
 	model := serverModelForBundle()
 	sn := serverSerialForBundle()
 	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
 	if err := os.MkdirAll(stageRoot, 0755); err != nil {
 		return "", err
 	}
@@ -240,7 +250,8 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}
-	archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
+	archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
 	archivePath := filepath.Join(os.TempDir(), archiveName)
 	if err := createSupportTarGz(archivePath, stageRoot); err != nil {
 		return "", err
 	}
@@ -397,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
 	return os.WriteFile(dst, []byte(body.String()), 0644)
 }
 // bundleVersion returns the build version for use in the support-bundle
 // archive name. One leading "v" and then one leading "V" are stripped from
 // buildVersion(); an empty or "unknown" result is reported as "0.0".
 func bundleVersion() string {
 	trimmed := strings.TrimPrefix(strings.TrimPrefix(buildVersion(), "v"), "V")
 	switch trimmed {
 	case "", "unknown":
 		return "0.0"
 	}
 	return trimmed
 }
 // serverModelForBundle returns the DMI system "Product Name" in a form that
 // can be embedded in the support-bundle archive file name.
 //
 // It runs `dmidecode -t 1` and scans the output for the "Product Name" key.
 // Spaces and "/" in the value are replaced with underscores: a raw "/" would
 // be treated as a directory boundary once the archive name is joined onto the
 // temp directory, and archive creation would then fail. "unknown" is returned
 // when dmidecode fails, the key is absent, or its value is empty.
 func serverModelForBundle() string {
 	raw, err := exec.Command("dmidecode", "-t", "1").Output()
 	if err != nil {
 		return "unknown"
 	}
 	for _, line := range strings.Split(string(raw), "\n") {
 		key, val, ok := strings.Cut(strings.TrimSpace(line), ": ")
 		if !ok || strings.TrimSpace(key) != "Product Name" {
 			continue
 		}
 		val = strings.TrimSpace(val)
 		if val == "" {
 			return "unknown"
 		}
 		return strings.NewReplacer(" ", "_", "/", "_").Replace(val)
 	}
 	return "unknown"
 }
 // serverSerialForBundle returns the DMI system "Serial Number" for use in the
 // support-bundle archive file name.
 //
 // It runs `dmidecode -t 1` and scans the output for the "Serial Number" key.
 // Any "/" in the value is replaced with an underscore: a raw "/" would be
 // treated as a directory boundary once the archive name is joined onto the
 // temp directory, and archive creation would then fail. Other characters
 // (including spaces) are kept as reported. "unknown" is returned when
 // dmidecode fails, the key is absent, or its value is empty.
 func serverSerialForBundle() string {
 	raw, err := exec.Command("dmidecode", "-t", "1").Output()
 	if err != nil {
 		return "unknown"
 	}
 	for _, line := range strings.Split(string(raw), "\n") {
 		key, val, ok := strings.Cut(strings.TrimSpace(line), ": ")
 		if !ok || strings.TrimSpace(key) != "Serial Number" {
 			continue
 		}
 		val = strings.TrimSpace(val)
 		if val == "" {
 			return "unknown"
 		}
 		return strings.ReplaceAll(val, "/", "_")
 	}
 	return "unknown"
 }
 func buildVersion() string {
 	raw, err := exec.Command("bee", "version").CombinedOutput()
 	if err != nil {
--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
 	defer cancel()
 	return exec.CommandContext(ctx, name, args...).Output()
 }
 // interfaceHasCarrier reports whether the carrier value read for iface via
 // readNetCarrierFile is "1" after trimming whitespace. A read error is
 // treated as "no carrier".
 func interfaceHasCarrier(iface string) bool {
 	if content, readErr := readNetCarrierFile(iface); readErr == nil {
 		return strings.TrimSpace(content) == "1"
 	}
 	return false
 }
--- a/audit/internal/collector/nic_telemetry.go
+++ b/audit/internal/collector/nic_telemetry.go
@@ -58,14 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
 			}
 		}
 		if interfaceHasCarrier(iface) {
 		if out, err := ethtoolModuleQuery(iface); err == nil {
 			if injectSFPDOMTelemetry(&devs[i], out) {
 				enriched++
 				continue
 			}
 		}
 		}
 		if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
 			enriched++
 		}
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
 		}
 		key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
 		val := strings.TrimSpace(trimmed[idx+1:])
 		if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
 			continue
 		}
 		switch {
 		case key == "identifier":
 			s := parseSFPIdentifier(val)
 			dev.SFPIdentifier = &s
 			t := true
 			dev.SFPPresent = &t
 			changed = true
 		case key == "connector":
 			s := parseSFPConnector(val)
 			dev.SFPConnector = &s
 			changed = true
 		case key == "vendor name":
 			s := strings.TrimSpace(val)
 			dev.SFPVendor = &s
 			changed = true
 		case key == "vendor pn":
 			s := strings.TrimSpace(val)
 			dev.SFPPartNumber = &s
 			changed = true
 		case key == "vendor sn":
 			s := strings.TrimSpace(val)
 			dev.SFPSerialNumber = &s
 			changed = true
 		case strings.Contains(key, "laser wavelength"):
 			if f, ok := firstFloat(val); ok {
 				dev.SFPWavelengthNM = &f
 				changed = true
 			}
 		case strings.Contains(key, "module temperature"):
 			if f, ok := firstFloat(val); ok {
 				dev.SFPTemperatureC = &f
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
 	return changed
 }
 // parseSFPIdentifier returns the human-readable transceiver type from a raw
 // ethtool identifier value, e.g. "0x03 (SFP)" → "SFP". When the value has no
 // non-empty parenthesised part it is returned unchanged.
 func parseSFPIdentifier(val string) string {
 	label := extractParens(val)
 	if label == "" {
 		return val
 	}
 	return label
 }
 // parseSFPConnector returns the connector type from a raw ethtool connector
 // value, e.g. "0x07 (LC)" → "LC". When the value has no non-empty
 // parenthesised part it is returned unchanged.
 func parseSFPConnector(val string) string {
 	connector := extractParens(val)
 	if connector == "" {
 		return val
 	}
 	return connector
 }
 // parenRe matches the first "(...)" group whose contents are non-empty and
 // contain no closing parenthesis; the contents are capture group 1.
 var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
 // extractParens returns the whitespace-trimmed contents of the first
 // parenthesised group in s, or "" when s has no such group.
 func extractParens(s string) string {
 	if groups := parenRe.FindStringSubmatch(s); len(groups) >= 2 {
 		return strings.TrimSpace(groups[1])
 	}
 	return ""
 }
 func parseSFPDOM(raw string) map[string]any {
 	dev := schema.HardwarePCIeDevice{}
 	if !injectSFPDOMTelemetry(&dev, raw) {
 		return map[string]any{}
 	}
 	out := map[string]any{}
 	if dev.SFPPresent != nil {
 		out["sfp_present"] = *dev.SFPPresent
 	}
 	if dev.SFPIdentifier != nil {
 		out["sfp_identifier"] = *dev.SFPIdentifier
 	}
 	if dev.SFPConnector != nil {
 		out["sfp_connector"] = *dev.SFPConnector
 	}
 	if dev.SFPVendor != nil {
 		out["sfp_vendor"] = *dev.SFPVendor
 	}
 	if dev.SFPPartNumber != nil {
 		out["sfp_part_number"] = *dev.SFPPartNumber
 	}
 	if dev.SFPSerialNumber != nil {
 		out["sfp_serial_number"] = *dev.SFPSerialNumber
 	}
 	if dev.SFPWavelengthNM != nil {
 		out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
 	}
 	if dev.SFPTemperatureC != nil {
 		out["sfp_temperature_c"] = *dev.SFPTemperatureC
 	}
--- a/audit/internal/collector/nic_telemetry_test.go
+++ b/audit/internal/collector/nic_telemetry_test.go
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
 	readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
 	readNetCarrierFile = func(string) (string, error) { return "0", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
-	ethtoolModuleQuery = func(string) (string, error) {
+	ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
 		t.Fatal("ethtool -m should not be called without carrier")
 		return "", nil
 	}
 	class := "EthernetController"
 	bdf := "0000:18:00.0"
--- a/audit/internal/collector/nvidia.go
+++ b/audit/internal/collector/nvidia.go
@@ -15,6 +15,7 @@ const nvidiaVendorID = 0x10de
 type nvidiaGPUInfo struct {
 	Index              int
 	BDF                string
 	Name               string
 	Serial             string
 	VBIOS              string
 	TemperatureC       *float64
@@ -73,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 			continue
 		}
 		if v := strings.TrimSpace(info.Name); v != "" {
 			devs[i].Model = &v
 		}
 		if v := strings.TrimSpace(info.Serial); v != "" {
 			devs[i].SerialNumber = &v
 		}
@@ -99,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
 	out, err := exec.Command(
 		"nvidia-smi",
-		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
+		"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
 		"--format=csv,noheader,nounits",
 	).Output()
 	if err != nil {
@@ -123,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		if len(rec) == 0 {
 			continue
 		}
-		if len(rec) < 13 {
+		if len(rec) < 14 {
-			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
+			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
 		}
 		bdf := normalizePCIeBDF(rec[1])
@@ -135,17 +139,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		info := nvidiaGPUInfo{
 			Index:              parseRequiredInt(rec[0]),
 			BDF:                bdf,
-			Serial:             strings.TrimSpace(rec[2]),
+			Name:               strings.TrimSpace(rec[2]),
-			VBIOS:              strings.TrimSpace(rec[3]),
+			Serial:             strings.TrimSpace(rec[3]),
-			TemperatureC:       parseMaybeFloat(rec[4]),
+			VBIOS:              strings.TrimSpace(rec[4]),
-			PowerW:             parseMaybeFloat(rec[5]),
+			TemperatureC:       parseMaybeFloat(rec[5]),
-			ECCUncorrected:     parseMaybeInt64(rec[6]),
+			PowerW:             parseMaybeFloat(rec[6]),
-			ECCCorrected:       parseMaybeInt64(rec[7]),
+			ECCUncorrected:     parseMaybeInt64(rec[7]),
-			HWSlowdown:         parseMaybeBool(rec[8]),
+			ECCCorrected:       parseMaybeInt64(rec[8]),
-			PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
+			HWSlowdown:         parseMaybeBool(rec[9]),
-			PCIeLinkGenMax:     parseMaybeInt(rec[10]),
+			PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
-			PCIeLinkWidthCur:   parseMaybeInt(rec[11]),
+			PCIeLinkGenMax:     parseMaybeInt(rec[11]),
-			PCIeLinkWidthMax:   parseMaybeInt(rec[12]),
+			PCIeLinkWidthCur:   parseMaybeInt(rec[12]),
 			PCIeLinkWidthMax:   parseMaybeInt(rec[13]),
 		}
 		result[bdf] = info
 	}
--- a/audit/internal/collector/nvidia_test.go
+++ b/audit/internal/collector/nvidia_test.go
@@ -6,7 +6,7 @@ import (
 )
 func TestParseNVIDIASMIQuery(t *testing.T) {
-	raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
+	raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
 	byBDF, err := parseNVIDIASMIQuery(raw)
 	if err != nil {
 		t.Fatalf("parse failed: %v", err)
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
 	if !ok {
 		t.Fatalf("gpu by normalized bdf not found")
 	}
 	if gpu.Name != "NVIDIA H100 80GB HBM3" {
 		t.Fatalf("name: got %q", gpu.Name)
 	}
 	if gpu.Serial != "GPU-SERIAL-1" {
 		t.Fatalf("serial: got %q", gpu.Serial)
 	}
--- a/audit/internal/collector/pcie.go
+++ b/audit/internal/collector/pcie.go
@@ -2,6 +2,7 @@ package collector
 import (
 	"bee/audit/internal/schema"
 	"fmt"
 	"log/slog"
 	"os/exec"
 	"strconv"
@@ -79,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
 		}
 	}
 	// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
 	// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
 	if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
 		bmcPatterns := []string{
 			"management system chip",
 			"management controller",
 			"ibmc",
 			"idrac",
 			"ilo vga",
 			"aspeed",
 			"matrox",
 		}
 		for _, bad := range bmcPatterns {
 			if strings.Contains(d, bad) {
 				return false
 			}
 		}
 	}
 	if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
 		internalAMDPatterns := []string{
 			"dummy function",
@@ -153,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
 	// SVendor/SDevice available but not in schema — skip
 	// Warn if PCIe link is running below its maximum negotiated speed.
 	applyPCIeLinkSpeedWarning(&dev)
 	return dev
 }
@@ -222,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
 	return value, true
 }
 // applyPCIeLinkSpeedWarning sets the device status to Warning, with an
 // explanatory ErrorDescription, when the current PCIe link speed is below the
 // device's maximum link speed.
 //
 // The device is left untouched when dev is nil, when either speed is missing,
 // or when either speed is not a recognised "GenN" string. pcieLinkSpeedRank
 // returns 0 for unrecognised values, so without this check an unparseable
 // current speed would compare as "slower than max" and raise a false
 // degradation warning.
 func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
 	if dev == nil || dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
 		return
 	}
 	current := pcieLinkSpeedRank(*dev.LinkSpeed)
 	limit := pcieLinkSpeedRank(*dev.MaxLinkSpeed)
 	// Rank 0 means "unknown generation": no reliable comparison is possible.
 	if current == 0 || limit == 0 || current >= limit {
 		return
 	}
 	warn := statusWarning
 	dev.Status = &warn
 	desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
 	dev.ErrorDescription = &desc
 }
 // pcieLinkSpeedRank maps a normalized generation string to its number:
 // "Gen1" → 1 through "Gen6" → 6. Any other input (different case, extra
 // characters, unsupported generation, empty string) yields 0.
 func pcieLinkSpeedRank(gen string) int {
 	const prefix = "Gen"
 	// Exactly the prefix followed by a single character is accepted.
 	if len(gen) != len(prefix)+1 || gen[:len(prefix)] != prefix {
 		return 0
 	}
 	if digit := gen[len(prefix)]; digit >= '1' && digit <= '6' {
 		return int(digit - '0')
 	}
 	return 0
 }
 func normalizePCILinkSpeed(raw string) string {
 	raw = strings.TrimSpace(strings.ToLower(raw))
 	switch {
--- a/audit/internal/collector/pcie_filter_test.go
+++ b/audit/internal/collector/pcie_filter_test.go
@@ -1,6 +1,7 @@
 package collector
 import (
 	"bee/audit/internal/schema"
 	"encoding/json"
 	"strings"
 	"testing"
@@ -29,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
 		{name: "raid", class: "RAID bus controller", want: true},
 		{name: "nvme", class: "Non-Volatile memory controller", want: true},
 		{name: "vga", class: "VGA compatible controller", want: true},
 		{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
 		{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
 		{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
 	}
@@ -139,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
 		}
 	}
 }
 func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
 	ptr := func(s string) *string { return &s }
 	tests := []struct {
 		name        string
 		linkSpeed   *string
 		maxSpeed    *string
 		wantWarning bool
 		wantGenIn   string // substring expected in ErrorDescription when warning
 	}{
 		{
 			name:        "degraded Gen1 vs Gen5",
 			linkSpeed:   ptr("Gen1"),
 			maxSpeed:    ptr("Gen5"),
 			wantWarning: true,
 			wantGenIn:   "Gen1",
 		},
 		{
 			name:        "at max Gen5",
 			linkSpeed:   ptr("Gen5"),
 			maxSpeed:    ptr("Gen5"),
 			wantWarning: false,
 		},
 		{
 			name:        "degraded Gen4 vs Gen5",
 			linkSpeed:   ptr("Gen4"),
 			maxSpeed:    ptr("Gen5"),
 			wantWarning: true,
 			wantGenIn:   "Gen4",
 		},
 		{
 			name:        "missing current speed — no warning",
 			linkSpeed:   nil,
 			maxSpeed:    ptr("Gen5"),
 			wantWarning: false,
 		},
 		{
 			name:        "missing max speed — no warning",
 			linkSpeed:   ptr("Gen1"),
 			maxSpeed:    nil,
 			wantWarning: false,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			dev := schema.HardwarePCIeDevice{}
 			ok := statusOK
 			dev.Status = &ok
 			dev.LinkSpeed = tt.linkSpeed
 			dev.MaxLinkSpeed = tt.maxSpeed
 			applyPCIeLinkSpeedWarning(&dev)
 			gotWarn := dev.Status != nil && *dev.Status == statusWarning
 			if gotWarn != tt.wantWarning {
 				t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
 			}
 			if tt.wantWarning {
 				if dev.ErrorDescription == nil {
 					t.Fatal("expected ErrorDescription to be set")
 				}
 				if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
 					t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
 				}
 			} else {
 				if dev.ErrorDescription != nil {
 					t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
 				}
 			}
 		})
 	}
 }
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"math"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"regexp"
 	"sort"
@@ -108,7 +109,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		ServerModel:        readServerModel(),
 		BenchmarkProfile:   spec.Name,
 		ParallelGPUs:       opts.ParallelGPUs,
 		RampStep:           opts.RampStep,
 		RampTotal:          opts.RampTotal,
 		RampRunID:          opts.RampRunID,
 		SelectedGPUIndices: append([]int(nil), selected...),
 		HostConfig:         readBenchmarkHostConfig(),
 		Normalization: BenchmarkNormalization{
 			Status: "full",
 		},
@@ -121,15 +126,22 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 	var serverIdleOK, serverLoadedOK bool
 	var serverLoadedSamples int
 	// Run nvidia-smi -q first: used both for the log file and as a fallback
 	// source of max clock values when CSV clock fields are unsupported.
 	var nvsmiQOut []byte
 	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
 		nvsmiQOut = out
 		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
 	}
 	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
 	if infoErr != nil {
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
 		result.Normalization.Status = "partial"
 	}
-
+	// Enrich with max clocks from verbose output — covers GPUs where
-	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
+	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
-		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
+	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
 	}
 	activeApps, err := queryActiveComputeApps(selected)
 	if err == nil && len(activeApps) > 0 {
@@ -145,8 +157,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}()
 	// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
 	// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
 	calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc)
 	// Start background CPU load sampler — samples every 10s during GPU phases.
 	cpuStopCh := make(chan struct{})
 	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
 	if opts.ParallelGPUs {
-		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
+		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
 	} else {
 	for _, idx := range selected {
@@ -166,6 +186,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
 		if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
 			gpuResult.CalibratedPeakPowerW = w
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
 			gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -303,6 +326,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}
 	// Stop CPU load sampler and attach results.
 	close(cpuStopCh)
 	if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
 		result.CPULoad = summarizeCPULoad(cpuSamples)
 		if result.CPULoad != nil && result.CPULoad.Status != "ok" {
 			logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
 				result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
 		}
 	}
 	// Compute server power characterization from accumulated IPMI samples.
 	var gpuReportedSumW float64
 	for _, gpu := range result.GPUs {
@@ -314,6 +347,20 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 	}
 	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
 	// Apply server-power penalty when IPMI reports the server delta is much
 	// lower than GPU-reported sum: GPU power telemetry is over-stated, making
 	// CalibratedPeakPowerW and PowerSustainScore unreliable.
 	// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
 	if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
 		factor := sp.ReportingRatio / 0.75
 		for i := range result.GPUs {
 			result.GPUs[i].Scores.CompositeScore *= factor
 			result.GPUs[i].Notes = append(result.GPUs[i].Notes,
 				fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%",
 					sp.ReportingRatio, factor*100))
 		}
 	}
 	result.Findings = buildBenchmarkFindings(result)
 	result.OverallStatus = benchmarkOverallStatus(result)
@@ -335,11 +382,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		return "", fmt.Errorf("write summary.txt: %w", err)
 	}
-	archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
+	return runDir, nil
 	if err := createTarGz(archive, runDir); err != nil {
 		return "", fmt.Errorf("pack benchmark archive: %w", err)
 	}
 	return archive, nil
 }
 func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
@@ -374,9 +417,13 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 // Fields are tried in order; the first successful query wins. Extended fields
 // (attribute.multiprocessor_count, power.default_limit) are not supported on
 // all driver versions, so we fall back to the base set if the full query fails.
 // The minimal fallback omits clock fields entirely — clocks.max.* returns
 // exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
 // then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
 var benchmarkGPUInfoQueries = []struct {
 	fields   string
 	extended bool // whether this query includes optional extended fields
 	minimal  bool // clock fields omitted; max clocks must be filled separately
 }{
 	{
 		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
@@ -386,6 +433,104 @@ var benchmarkGPUInfoQueries = []struct {
 		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
 		extended: false,
 	},
 	{
 		fields:  "index,uuid,name,pci.bus_id,vbios_version,power.limit",
 		minimal: true,
 	},
 }
 // Patterns for the verbose `nvidia-smi -q` output consumed by
 // enrichGPUInfoWithMaxClocks. They are compiled once at package initialisation
 // rather than on every call.
 var (
 	nvsmiVerboseGPUSectionRe   = regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
 	nvsmiVerboseMaxGraphicsRe  = regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
 	nvsmiVerboseMaxMemoryRe    = regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
 	nvsmiVerboseDefaultPowerRe = regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
 	nvsmiVerboseCurrentPowerRe = regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
 	nvsmiVerboseSMCountRe      = regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
 )
 // nvsmiVerboseFloat returns the first capture group of re inside section parsed
 // as a float64. It returns 0 when the pattern does not match or the captured
 // text is not a positive number, so callers can assign the result to a field
 // that is still zero without further checks.
 func nvsmiVerboseFloat(re *regexp.Regexp, section []byte) float64 {
 	m := re.FindSubmatch(section)
 	if m == nil {
 		return 0
 	}
 	v, err := strconv.ParseFloat(string(m[1]), 64)
 	if err != nil || v <= 0 {
 		return 0
 	}
 	return v
 }
 // enrichGPUInfoWithMaxClocks fills in fields of infoByIndex that are still
 // zero, using the verbose nvidia-smi -q output passed as nvsmiQ. The fields
 // considered are MaxGraphicsClockMHz and MaxMemoryClockMHz (from the
 // "Max Clocks" section), DefaultPowerLimitW, PowerLimitW and
 // MultiprocessorCount. Values that are already non-zero are never overwritten.
 //
 // This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
 // return exit status 2 but the verbose query works fine.
 //
 // Verbose sections are matched to entries by PCI bus ID, compared
 // case-insensitively. If no exact match exists, a suffix match is tried in
 // either direction to tolerate different domain-prefix widths; when several
 // entries suffix-match, the lowest GPU index wins so the result does not depend
 // on map iteration order. The map is updated in place; the function is a no-op
 // when either argument is empty.
 func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
 	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
 		return
 	}
 	// Build bus_id → index map for matching verbose sections to GPU indices.
 	busToBenchIdx := make(map[string]int, len(infoByIndex))
 	for idx, info := range infoByIndex {
 		// Normalise before the emptiness check: an empty key would
 		// suffix-match every verbose section below.
 		key := strings.ToLower(strings.TrimSpace(info.BusID))
 		if key != "" {
 			busToBenchIdx[key] = idx
 		}
 	}
 	// Split the verbose output into per-GPU sections on "^GPU " lines.
 	sectionStarts := nvsmiVerboseGPUSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
 	for i, loc := range sectionStarts {
 		busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]]))
 		benchIdx, ok := busToBenchIdx[busID]
 		if !ok {
 			// Bus IDs from verbose output may have a different domain prefix;
 			// try suffix match on the slot portion (XX:XX.X).
 			for k, v := range busToBenchIdx {
 				if !strings.HasSuffix(k, busID) && !strings.HasSuffix(busID, k) {
 					continue
 				}
 				if !ok || v < benchIdx {
 					benchIdx, ok = v, true
 				}
 			}
 		}
 		if !ok {
 			continue
 		}
 		// A section runs up to the next "GPU ..." header or the end of input.
 		end := len(nvsmiQ)
 		if i+1 < len(sectionStarts) {
 			end = sectionStarts[i+1][0]
 		}
 		section := nvsmiQ[loc[0]:end]
 		info := infoByIndex[benchIdx]
 		if info.MaxGraphicsClockMHz == 0 {
 			info.MaxGraphicsClockMHz = nvsmiVerboseFloat(nvsmiVerboseMaxGraphicsRe, section)
 		}
 		if info.MaxMemoryClockMHz == 0 {
 			info.MaxMemoryClockMHz = nvsmiVerboseFloat(nvsmiVerboseMaxMemoryRe, section)
 		}
 		if info.DefaultPowerLimitW == 0 {
 			info.DefaultPowerLimitW = nvsmiVerboseFloat(nvsmiVerboseDefaultPowerRe, section)
 		}
 		if info.PowerLimitW == 0 {
 			info.PowerLimitW = nvsmiVerboseFloat(nvsmiVerboseCurrentPowerRe, section)
 		}
 		if info.MultiprocessorCount == 0 {
 			if m := nvsmiVerboseSMCountRe.FindSubmatch(section); m != nil {
 				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
 					info.MultiprocessorCount = v
 				}
 			}
 		}
 		// Map values are copies; write the enriched struct back.
 		infoByIndex[benchIdx] = info
 	}
 }
 func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
@@ -413,9 +558,13 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
 			continue
 		}
 		minFields := 6
 		if !q.minimal {
 			minFields = 9
 		}
 		infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
 		for _, row := range rows {
-			if len(row) < 9 {
+			if len(row) < minFields {
 				continue
 			}
 			idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
@@ -429,9 +578,10 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
 				BusID:       strings.TrimSpace(row[3]),
 				VBIOS:       strings.TrimSpace(row[4]),
 				PowerLimitW: parseBenchmarkFloat(row[5]),
 				MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
 				MaxMemoryClockMHz:   parseBenchmarkFloat(row[7]),
 			}
 			if !q.minimal {
 				info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6])
 				info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7])
 				if len(row) >= 9 {
 					info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
 				}
@@ -443,6 +593,7 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
 						info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
 					}
 				}
 			}
 			infoByIndex[idx] = info
 		}
 		return infoByIndex, nil
@@ -744,14 +895,22 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 			score.ComputeScore += precision.TeraOpsPerSec
 		}
 	}
-	// Use default power limit for sustain score so a manually reduced limit
+	// PowerSustainScore: measures how close the GPU came to its rated TDP under
-	// does not inflate the score. Fall back to enforced limit if default unknown.
+	// a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP.
-	referencePowerW := gpu.DefaultPowerLimitW
+	// Penalty applied symmetrically for both under- and over-TDP deviations:
-	if referencePowerW <= 0 {
+	//   score = max(0, 100 − |measured − rated| / rated × 100)
-		referencePowerW = gpu.PowerLimitW
+	// Under-TDP → power delivery / cooling issue.
 	// Over-TDP  → power limit not properly enforced / power regulation fault.
 	// Falls back to 0 if calibration was not performed (dcgmi unavailable).
 	{
 		ref := gpu.DefaultPowerLimitW
 		if ref <= 0 {
 			ref = gpu.PowerLimitW
 		}
 		if gpu.CalibratedPeakPowerW > 0 && ref > 0 {
 			deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100
 			score.PowerSustainScore = clampScore(100 - deviationPct)
 		}
 	if referencePowerW > 0 {
 		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100)
 	}
 	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
 	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
@@ -765,7 +924,15 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 }
 func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
-	quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
+	// Weights after introducing calibrated power reference:
 	//   base        0.35 — floor so a GPU that fails all sustain checks still scores
 	//   thermal     0.25 — heaviest: throttle counters are the most reliable signal
 	//   stability   0.25 — clock/power variance matters for reproducibility
 	//   power       0.15 — GPU reaches rated TDP under targeted_power? lower weight
 	//                       because calibration may be absent (dcgmi not installed)
 	//   NCCL bonus  0.10 — interconnect health
 	//   cap         1.10
 	quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
 	if score.InterconnectScore > 0 {
 		quality += 0.10
 	}
@@ -985,16 +1152,57 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 				gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
 			))
 		}
 		// Flag significant TDP deviation (over or under) from calibration.
 		if gpu.CalibratedPeakPowerW > 0 {
 			ref := gpu.DefaultPowerLimitW
 			if ref <= 0 {
 				ref = gpu.PowerLimitW
 			}
 			if ref > 0 {
 				deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100
 				switch {
 				case deviationPct < -10:
 					findings = append(findings, fmt.Sprintf(
 						"GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.",
 						gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref,
 					))
 				case deviationPct > 5:
 					findings = append(findings, fmt.Sprintf(
 						"GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.",
 						gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct,
 					))
 				}
 			}
 		}
 	}
 	if result.Interconnect != nil && result.Interconnect.Supported {
 		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
 	}
 	if cl := result.CPULoad; cl != nil {
 		switch cl.Status {
 		case "high":
 			findings = append(findings, fmt.Sprintf(
 				"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
 				cl.AvgPct, cl.MaxPct,
 			))
 		case "unstable":
 			findings = append(findings, fmt.Sprintf(
 				"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
 				cl.AvgPct, cl.P95Pct,
 			))
 		}
 	}
 	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
 		if sp.ReportingRatio < 0.75 {
 			findings = append(findings, fmt.Sprintf(
-				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
+				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
 				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
 			))
 		} else if sp.ReportingRatio > 1.25 {
 			findings = append(findings, fmt.Sprintf(
 				"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
 				sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100,
 			))
 		}
 	}
 	return dedupeStrings(findings)
@@ -1299,6 +1507,7 @@ func runNvidiaBenchmarkParallel(
 	spec benchmarkProfileSpec,
 	logFunc func(string),
 	result *NvidiaBenchmarkResult,
 	calibPowerByIndex map[int]float64,
 	serverIdleW *float64, serverLoadedWSum *float64,
 	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
 ) {
@@ -1320,6 +1529,9 @@ func runNvidiaBenchmarkParallel(
 			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
 		if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
 			r.CalibratedPeakPowerW = w
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
 			r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -1481,3 +1693,225 @@ func runNvidiaBenchmarkParallel(
 		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
 	}
 }
 // parseBenchmarkCPUInfo extracts the CPU model, socket count, total physical
 // core count and logical thread count from /proc/cpuinfo-formatted text.
 //
 // threads counts "processor" entries; model is the first "model name" value;
 // sockets is the number of distinct "physical id" values (1 when none are
 // present but at least one processor was seen); cores sums the first valid
 // "cpu cores" value seen for each socket. A "cpu cores" line is attributed to
 // the most recent "physical id" line above it.
 func parseBenchmarkCPUInfo(cpuinfo string) (model string, sockets, cores, threads int) {
 	socketIDs := map[string]struct{}{}
 	coresPerSocket := map[string]int{}
 	curSocket := ""
 	for _, line := range strings.Split(cpuinfo, "\n") {
 		kv := strings.SplitN(line, ":", 2)
 		if len(kv) != 2 {
 			continue
 		}
 		key := strings.TrimSpace(kv[0])
 		val := strings.TrimSpace(kv[1])
 		switch key {
 		case "processor":
 			threads++
 		case "model name":
 			if model == "" {
 				model = val
 			}
 		case "physical id":
 			socketIDs[val] = struct{}{}
 			curSocket = val
 		case "cpu cores":
 			if curSocket == "" {
 				continue
 			}
 			if _, seen := coresPerSocket[curSocket]; seen {
 				continue
 			}
 			// Skip unparsable values instead of recording a zero count.
 			if n, err := strconv.Atoi(val); err == nil {
 				coresPerSocket[curSocket] = n
 			}
 		}
 	}
 	for _, n := range coresPerSocket {
 		cores += n
 	}
 	sockets = len(socketIDs)
 	if sockets == 0 && threads > 0 {
 		sockets = 1
 	}
 	return model, sockets, cores, threads
 }
 // parseBenchmarkMemTotalGiB returns the MemTotal value of /proc/meminfo-formatted
 // text converted from kB to GiB. ok is false when the first MemTotal line is
 // missing its value or the value is not an unsigned integer.
 func parseBenchmarkMemTotalGiB(meminfo string) (gib float64, ok bool) {
 	for _, line := range strings.Split(meminfo, "\n") {
 		if !strings.HasPrefix(line, "MemTotal:") {
 			continue
 		}
 		fields := strings.Fields(line)
 		if len(fields) < 2 {
 			return 0, false
 		}
 		kb, err := strconv.ParseUint(fields[1], 10, 64)
 		if err != nil {
 			return 0, false
 		}
 		return float64(kb) / (1024 * 1024), true
 	}
 	return 0, false
 }
 // readBenchmarkHostConfig reads static CPU and memory configuration from
 // /proc/cpuinfo and /proc/meminfo. Each source is optional: a file that cannot
 // be read or parsed leaves its fields at zero. Returns nil if neither source
 // yielded any data.
 func readBenchmarkHostConfig() *BenchmarkHostConfig {
 	cfg := &BenchmarkHostConfig{}
 	populated := false
 	// CPU model, sockets, cores, threads.
 	if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
 		cfg.CPUModel, cfg.CPUSockets, cfg.CPUCores, cfg.CPUThreads = parseBenchmarkCPUInfo(string(data))
 		if cfg.CPUModel != "" || cfg.CPUThreads > 0 {
 			populated = true
 		}
 	}
 	// Total physical RAM.
 	if data, err := os.ReadFile("/proc/meminfo"); err == nil {
 		if gib, ok := parseBenchmarkMemTotalGiB(string(data)); ok {
 			cfg.MemTotalGiB = gib
 			populated = true
 		}
 	}
 	if !populated {
 		return nil
 	}
 	return cfg
 }
 // startCPULoadSampler starts a goroutine that samples host CPU load every
 // intervalSec seconds until stopCh is closed, then sends the collected
 // samples on the returned channel and exits.
 //
 // A non-positive intervalSec is treated as 1 second, because time.NewTicker
 // panics on a non-positive duration. Only samples greater than zero are kept;
 // presumably sampleCPULoadPct reports a non-positive value when it cannot
 // measure — verify against its implementation. The returned channel has a
 // buffer of one, so the goroutine terminates even if the caller never reads
 // the result.
 func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
 	if intervalSec < 1 {
 		intervalSec = 1
 	}
 	ch := make(chan []float64, 1)
 	go func() {
 		var samples []float64
 		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
 		defer ticker.Stop()
 		for {
 			select {
 			case <-stopCh:
 				ch <- samples
 				return
 			case <-ticker.C:
 				if pct := sampleCPULoadPct(); pct > 0 {
 					samples = append(samples, pct)
 				}
 			}
 		}
 	}()
 	return ch
 }
 // summarizeCPULoad condenses the sampled host CPU load percentages into a
 // BenchmarkCPULoad. It returns nil when there are no samples.
 //
 // The reported average, maximum and p95 are rounded to one decimal place. The
 // p95 is the element at index floor(0.95*n) of the ascending-sorted samples.
 // Status is "high" when the average exceeds 20% or the maximum exceeds 40%,
 // "unstable" when the population standard deviation exceeds 12 percentage
 // points, and "ok" otherwise. The input slice is not modified.
 func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
 	n := len(samples)
 	if n == 0 {
 		return nil
 	}
 	// Work on a sorted copy so the caller's slice keeps its order.
 	ordered := make([]float64, n)
 	copy(ordered, samples)
 	sort.Float64s(ordered)
 	total := 0.0
 	for i := 0; i < n; i++ {
 		total += ordered[i]
 	}
 	mean := total / float64(n)
 	peak := ordered[n-1]
 	p95 := ordered[int(float64(n)*0.95)]
 	// Population standard deviation, used to detect erratic load.
 	sumSq := 0.0
 	for i := 0; i < n; i++ {
 		diff := ordered[i] - mean
 		sumSq += diff * diff
 	}
 	stdDev := math.Sqrt(sumSq / float64(n))
 	load := &BenchmarkCPULoad{Samples: n}
 	load.AvgPct = math.Round(mean*10) / 10
 	load.MaxPct = math.Round(peak*10) / 10
 	load.P95Pct = math.Round(p95*10) / 10
 	if mean > 20 || peak > 40 {
 		load.Status = "high"
 		load.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", mean, peak)
 	} else if stdDev > 12 {
 		load.Status = "unstable"
 		load.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", mean, stdDev)
 	} else {
 		load.Status = "ok"
 	}
 	return load
 }
 // runBenchmarkPowerCalibration runs a 45-second dcgmi targeted_power
 // diagnostic on the given GPUs through runBenchmarkCommandWithMetrics and
 // returns, per GPU index, the 95th percentile of the positive power samples
 // (watts) from the metric rows that call returns.
 //
 // The command output is written to power-calibration.log inside runDir on a
 // best-effort basis. GPUs with no rows, no positive power samples or a
 // non-positive p95 are omitted from the result.
 //
 // If dcgmi is not on PATH or the run fails, the reason is logged through
 // logFunc and an empty map is returned. No error is returned, so callers
 // never see a calibration failure. logFunc must be non-nil.
 func runBenchmarkPowerCalibration(
 	ctx context.Context,
 	verboseLog, runDir string,
 	gpuIndices []int,
 	logFunc func(string),
 ) map[int]float64 {
 	const calibDurationSec = 45
 	// Without dcgmi there is nothing to calibrate against.
 	if _, lookErr := exec.LookPath("dcgmi"); lookErr != nil {
 		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
 		return map[int]float64{}
 	}
 	logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
 	diagCmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
 	output, metricRows, runErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", diagCmd, nil, gpuIndices, runDir, "power-calibration", logFunc)
 	// Best effort: the log is kept even when the run failed.
 	_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), output, 0644)
 	if runErr != nil {
 		logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", runErr))
 		return map[int]float64{}
 	}
 	// Reduce each GPU's positive power samples to their p95.
 	peakByGPU := make(map[int]float64, len(gpuIndices))
 	for _, gpuIdx := range gpuIndices {
 		var watts []float64
 		for _, row := range filterRowsByGPU(metricRows, gpuIdx) {
 			if row.PowerW > 0 {
 				watts = append(watts, row.PowerW)
 			}
 		}
 		if len(watts) == 0 {
 			continue
 		}
 		peak := benchmarkPercentile(watts, 95)
 		if peak <= 0 {
 			continue
 		}
 		peakByGPU[gpuIdx] = peak
 		logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", gpuIdx, peak, len(watts)))
 	}
 	return peakByGPU
 }
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -60,9 +60,17 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "**App version:** %s  \n", result.BenchmarkVersion)
 	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
-	if result.ParallelGPUs {
+	if result.RampStep > 0 && result.RampTotal > 0 {
 		fmt.Fprintf(&b, "**Ramp-up step:** %d of %d  \n", result.RampStep, result.RampTotal)
 		if result.RampRunID != "" {
 			fmt.Fprintf(&b, "**Ramp-up run ID:** %s  \n", result.RampRunID)
 		}
 	} else if result.ParallelGPUs {
 		fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously)  \n")
 	}
 	if result.ScalabilityScore > 0 {
 		fmt.Fprintf(&b, "**Scalability score:** %.1f%%  \n", result.ScalabilityScore)
 	}
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
 	b.WriteString("\n")
@@ -90,7 +98,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 	for _, gpu := range result.GPUs {
 		name := strings.TrimSpace(gpu.Name)
 		if name == "" {
-			name = "Unknown"
+			name = "Unknown GPU"
 		}
 		interconnect := "-"
 		if gpu.Scores.InterconnectScore > 0 {
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -178,3 +178,67 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
 		t.Fatalf("report should not contain ANSI escapes\n%s", report)
 	}
 }
 // TestEnrichGPUInfoWithMaxClocks checks that both GPUs in a two-section
 // verbose nvidia-smi dump get their max graphics and memory clocks from the
 // "Max Clocks" section, not from the current "Clocks" section.
 func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
 	t.Parallel()
 	verbose := []byte(`
 GPU 00000000:4E:00.0
     Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
     Clocks
         Graphics                          : 2422 MHz
         Memory                            : 12481 MHz
     Max Clocks
         Graphics                          : 2430 MHz
         SM                                : 2430 MHz
         Memory                            : 12481 MHz
         Video                             : 2107 MHz
 GPU 00000000:4F:00.0
     Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
     Max Clocks
         Graphics                          : 2430 MHz
         Memory                            : 12481 MHz
 `)
 	gpus := map[int]benchmarkGPUInfo{
 		0: {Index: 0, BusID: "00000000:4E:00.0"},
 		1: {Index: 1, BusID: "00000000:4F:00.0"},
 	}
 	enrichGPUInfoWithMaxClocks(gpus, verbose)
 	// Both GPUs report identical max clocks in the fixture.
 	for _, idx := range []int{0, 1} {
 		got := gpus[idx]
 		if got.MaxGraphicsClockMHz != 2430 {
 			t.Errorf("GPU %d MaxGraphicsClockMHz = %v, want 2430", idx, got.MaxGraphicsClockMHz)
 		}
 		if got.MaxMemoryClockMHz != 12481 {
 			t.Errorf("GPU %d MaxMemoryClockMHz = %v, want 12481", idx, got.MaxMemoryClockMHz)
 		}
 	}
 }
 // TestEnrichGPUInfoWithMaxClocksSkipsPopulated checks that clock values
 // already present in the map are left untouched even when the verbose output
 // reports different numbers. Both the graphics and the memory clock are
 // pre-populated, so both are asserted.
 func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
 	t.Parallel()
 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
     Max Clocks
         Graphics                          : 9999 MHz
         Memory                            : 9999 MHz
 `)
 	// Already populated — must not be overwritten.
 	infoByIndex := map[int]benchmarkGPUInfo{
 		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
 	}
 	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
 	}
 	if infoByIndex[0].MaxMemoryClockMHz != 12481 {
 		t.Errorf("expected existing memory clock to be preserved, got %v", infoByIndex[0].MaxMemoryClockMHz)
 	}
 }
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -2,6 +2,29 @@ package platform
 import "time"
 // BenchmarkHostConfig holds static CPU and memory configuration captured at
 // benchmark start. Useful for correlating results across runs on different hardware.
 // All fields are omitted from JSON when zero; readBenchmarkHostConfig fills
 // them from /proc/cpuinfo and /proc/meminfo.
 type BenchmarkHostConfig struct {
 	// CPUModel is the first "model name" value found in /proc/cpuinfo.
 	CPUModel    string  `json:"cpu_model,omitempty"`
 	// CPUSockets is the number of distinct "physical id" values (1 when
 	// none are reported but processors were found).
 	CPUSockets  int     `json:"cpu_sockets,omitempty"`
 	// CPUCores is the sum of the per-socket "cpu cores" values.
 	CPUCores    int     `json:"cpu_cores,omitempty"`
 	// CPUThreads is the number of "processor" entries (logical CPUs).
 	CPUThreads  int     `json:"cpu_threads,omitempty"`
 	// MemTotalGiB is MemTotal from /proc/meminfo converted from kB to GiB.
 	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
 }
 // BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
 // steady-state phase. High or unstable CPU load during a GPU benchmark may
 // indicate a competing workload or a CPU-bound driver bottleneck.
 // Values are produced by summarizeCPULoad.
 type BenchmarkCPULoad struct {
 	// AvgPct, MaxPct and P95Pct are percentages rounded to one decimal place.
 	AvgPct  float64 `json:"avg_pct"`
 	MaxPct  float64 `json:"max_pct"`
 	P95Pct  float64 `json:"p95_pct"`
 	// Samples is the number of CPU load samples the statistics are based on.
 	Samples int     `json:"samples"`
 	// Status is "ok", "high", or "unstable".
 	// "high": average above 20% or maximum above 40%; "unstable": standard
 	// deviation above 12 percentage points; "ok" otherwise.
 	Status string `json:"status"`
 	// Note is a human-readable explanation, set only when Status is not "ok".
 	Note   string `json:"note,omitempty"`
 }
 const (
 	NvidiaBenchmarkProfileStandard  = "standard"
 	NvidiaBenchmarkProfileStability = "stability"
@@ -15,6 +38,9 @@ type NvidiaBenchmarkOptions struct {
 	ExcludeGPUIndices []int
 	RunNCCL           bool
 	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
 	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
 	RampTotal         int    // total number of ramp-up steps in this run
 	RampRunID         string // shared identifier across all steps of the same ramp-up run
 }
@@ -25,11 +51,17 @@ type NvidiaBenchmarkResult struct {
 	ServerModel        string                       `json:"server_model,omitempty"`
 	BenchmarkProfile   string                       `json:"benchmark_profile"`
 	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
 	RampStep           int                          `json:"ramp_step,omitempty"`
 	RampTotal          int                          `json:"ramp_total,omitempty"`
 	RampRunID          string                       `json:"ramp_run_id,omitempty"`
 	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
 	OverallStatus      string                       `json:"overall_status"`
 	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
 	Findings           []string                     `json:"findings,omitempty"`
 	Warnings           []string                     `json:"warnings,omitempty"`
 	Normalization      BenchmarkNormalization       `json:"normalization"`
 	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
 	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
 	GPUs               []BenchmarkGPUResult         `json:"gpus"`
 	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
 	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
@@ -63,6 +95,11 @@ type BenchmarkGPUResult struct {
 	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
 	MultiprocessorCount    int                        `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW     float64                    `json:"default_power_limit_w,omitempty"`
 	// CalibratedPeakPowerW is the p95 power measured during a short
 	// dcgmi targeted_power calibration run before the main benchmark.
 	// PowerSustainScore compares this measured peak against the rated
 	// limit (DefaultPowerLimitW, falling back to PowerLimitW). It stays
 	// zero when calibration was not performed.
 	CalibratedPeakPowerW   float64                    `json:"calibrated_peak_power_w,omitempty"`
 	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
 	BaseGraphicsClockMHz   float64                    `json:"base_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -383,10 +383,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 }
 const (
-	ansiRed    = "\033[31m"
+	ansiAmber  = "\033[38;5;214m"
 	ansiBlue   = "\033[34m"
 	ansiGreen  = "\033[32m"
 	ansiYellow = "\033[33m"
 	ansiReset  = "\033[0m"
 )
@@ -415,10 +412,10 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
 		fn      func(GPUMetricRow) float64
 	}
 	defs := []seriesDef{
-		{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
+		{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
-		{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
+		{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
-		{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
+		{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
-		{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
+		{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
 	}
 	var b strings.Builder
--- a/audit/internal/platform/hpl.go
+++ b/audit/internal/platform/hpl.go
@@ -1,142 +0,0 @@
 package platform
 import (
 	"context"
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
 )
 // HPLOptions configures the HPL (LINPACK) benchmark run.
 // Out-of-range values are replaced with defaults by applyHPLDefaults:
 // MemFraction outside (0, 1] becomes 0.80 and a non-positive NB becomes 256.
 type HPLOptions struct {
 	MemFraction float64 // fraction of RAM to use (default 0.80)
 	NB          int     // block size (default 256)
 }
 // HPLResult holds the parsed result of an HPL run.
 // N, NB, P, Q, TimeSec and GFlops come from the "WR..." result line of the
 // xhpl output; fields stay at their zero value when that line is absent.
 // Status starts as "FAILED" and becomes "PASSED" once any output line
 // contains "PASSED".
 type HPLResult struct {
 	N          int     // matrix dimension
 	NB         int     // block size
 	P          int     // process grid rows
 	Q          int     // process grid cols
 	TimeSec    float64 // wall time in seconds
 	GFlops     float64 // achieved performance
 	Residual   float64 // backward error residual (from HPL verification line)
 	Status     string  // "PASSED" or "FAILED"
 	RawOutput  string  // full xhpl output
 }
 // applyHPLDefaults normalises opts in place: a MemFraction that is not within
 // (0, 1] is reset to 0.80 and a non-positive NB is reset to 256. Values
 // already in range are left unchanged.
 func applyHPLDefaults(opts *HPLOptions) {
 	const (
 		defaultMemFraction = 0.80
 		defaultBlockSize   = 256
 	)
 	if frac := opts.MemFraction; frac > 1 || frac <= 0 {
 		opts.MemFraction = defaultMemFraction
 	}
 	if opts.NB < 1 {
 		opts.NB = defaultBlockSize
 	}
 }
 // RunHPL runs bee-hpl and returns parsed results plus a tar.gz artifact path.
 //
 // opts is normalised with applyHPLDefaults and an empty baseDir defaults to
 // /var/log/bee-sat. A run directory "hpl-<UTC timestamp>" is created under
 // baseDir; the raw command output is written to hpl.log and a one-line
 // summary to summary.txt inside it (both best-effort, write errors ignored).
 // logFunc may be nil.
 //
 // Return values:
 //   - on a command error other than context.Canceled: "", the parsed result
 //     and a wrapped error;
 //   - when the command succeeds but no positive Gflops value was parsed: "",
 //     the parsed result and an error;
 //   - otherwise the archive path (or the run directory when archiving fails),
 //     the parsed result and the command error, which is nil or
 //     context.Canceled at that point.
 //
 // It returns a nil result only when the run directory cannot be created.
 func (s *System) RunHPL(ctx context.Context, baseDir string, opts HPLOptions, logFunc func(string)) (string, *HPLResult, error) {
 	applyHPLDefaults(&opts)
 	if baseDir == "" {
 		baseDir = "/var/log/bee-sat"
 	}
 	ts := time.Now().UTC().Format("20060102-150405")
 	runDir := filepath.Join(baseDir, "hpl-"+ts)
 	if err := os.MkdirAll(runDir, 0755); err != nil {
 		return "", nil, fmt.Errorf("mkdir %s: %w", runDir, err)
 	}
 	logPath := filepath.Join(runDir, "hpl.log")
 	cmd := []string{
 		"bee-hpl",
 		"--mem-fraction", strconv.FormatFloat(opts.MemFraction, 'f', 2, 64),
 		"--nb", strconv.Itoa(opts.NB),
 	}
 	if logFunc != nil {
 		logFunc(fmt.Sprintf("HPL: N will be auto-sized to %.0f%% of RAM, NB=%d", opts.MemFraction*100, opts.NB))
 	}
 	out, err := runSATCommandCtx(ctx, "", "hpl", cmd, nil, logFunc)
 	// Persist whatever output was produced, even if the command failed.
 	_ = os.WriteFile(logPath, out, 0644)
 	result := parseHPLOutput(string(out))
 	result.RawOutput = string(out)
 	// A cancelled run is not treated as a failure here; its partial result
 	// is still summarised and archived below, and err is returned with it.
 	if err != nil && err != context.Canceled {
 		return "", result, fmt.Errorf("bee-hpl failed: %w", err)
 	}
 	if err == nil && result.GFlops <= 0 {
 		return "", result, fmt.Errorf("HPL completed but no Gflops result found in output")
 	}
 	// Write a one-line summary next to the log (best-effort).
 	summary := fmt.Sprintf("N=%d NB=%d time=%.2fs gflops=%.3f status=%s\n",
 		result.N, result.NB, result.TimeSec, result.GFlops, result.Status)
 	_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
 	if logFunc != nil {
 		logFunc(fmt.Sprintf("HPL result: N=%d NB=%d %.2fs %.3f Gflops %s",
 			result.N, result.NB, result.TimeSec, result.GFlops, result.Status))
 	}
 	// The archive name uses a fresh timestamp taken after the run, so it can
 	// differ from the run directory's timestamp.
 	ts2 := time.Now().UTC().Format("20060102-150405")
 	archive := filepath.Join(baseDir, "hpl-"+ts2+".tar.gz")
 	if archErr := createTarGz(archive, runDir); archErr != nil {
 		// Archiving failed: fall back to the unpacked run directory. The
 		// archive error itself is dropped; the command error is returned.
 		return runDir, result, err
 	}
 	return archive, result, err
 }
 // parseHPLOutput extracts N, NB, P, Q, time, Gflops, the verification
 // residual and the pass/fail status from standard HPL output. It never
 // returns nil; fields that cannot be found stay at their zero value and
 // Status defaults to "FAILED".
 //
 // HPL prints a result line of the form:
 //
 //	WR00L2L2       45312   256     1     1        1234.56             5.678e+01
 //	T/V               N    NB     P     Q           Time                 Gflops
 //
 // and a verification line of the form:
 //
 //	||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=   2.34317900e-03 ...... PASSED
 //
 // If several result or verification lines are present, the last one wins.
 func parseHPLOutput(output string) *HPLResult {
 	result := &HPLResult{Status: "FAILED"}
 	for _, line := range strings.Split(output, "\n") {
 		line = strings.TrimSpace(line)
 		// Result line starts with WR
 		if strings.HasPrefix(line, "WR") {
 			fields := strings.Fields(line)
 			// WR00L2L2  N  NB  P  Q  Time  Gflops
 			if len(fields) >= 7 {
 				// Parse errors leave the corresponding field at zero.
 				result.N, _ = strconv.Atoi(fields[1])
 				result.NB, _ = strconv.Atoi(fields[2])
 				result.P, _ = strconv.Atoi(fields[3])
 				result.Q, _ = strconv.Atoi(fields[4])
 				result.TimeSec, _ = strconv.ParseFloat(fields[5], 64)
 				result.GFlops, _ = strconv.ParseFloat(fields[6], 64)
 			}
 		}
 		// Verification line: "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ... PASSED"
 		if strings.Contains(line, "PASSED") {
 			result.Status = "PASSED"
 			fields := strings.Fields(line)
 			for i, f := range fields {
 				if f != "PASSED" {
 					continue
 				}
 				// HPL separates the residual from the verdict with a
 				// "......" filler, so the token right before PASSED is
 				// usually not numeric. Walk backwards to the nearest token
 				// that parses as a number; text glued to the value by a
 				// preceding "=" is stripped first.
 				for j := i - 1; j >= 0; j-- {
 					tok := fields[j]
 					if eq := strings.LastIndex(tok, "="); eq >= 0 {
 						tok = tok[eq+1:]
 					}
 					if v, err := strconv.ParseFloat(tok, 64); err == nil {
 						result.Residual = v
 						break
 					}
 				}
 			}
 		}
 	}
 	return result
 }
 // hplAvailable reports whether the HPL tooling is installed: bee-hpl must be
 // resolvable through PATH and /usr/local/lib/bee/xhpl must exist.
 func hplAvailable() bool {
 	_, lookErr := exec.LookPath("bee-hpl")
 	if lookErr != nil {
 		return false
 	}
 	if _, statErr := os.Stat("/usr/local/lib/bee/xhpl"); statErr != nil {
 		return false
 	}
 	return true
 }
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -14,9 +14,17 @@ import (
 // IsLiveMediaInRAM reports whether the live medium is served from RAM.
 // The checks run in this order: with no filesystem type for
 // /run/live/medium it defers to toramActive(); a tmpfs medium counts as RAM;
 // otherwise it returns true only if at least one *.squashfs file exists under
 // /dev/shm/bee-live.
 func (s *System) IsLiveMediaInRAM() bool {
 	fsType := mountFSType("/run/live/medium")
 	if fsType == "" {
 		// No medium mount at all — fall back to toram kernel parameter.
 		return toramActive()
 	}
-	return strings.EqualFold(fsType, "tmpfs")
+	if strings.EqualFold(fsType, "tmpfs") {
 		return true
 	}
 	// When RunInstallToRAM copies squashfs to /dev/shm/bee-live but the bind
 	// mount of /run/live/medium fails (common for CD-ROM boots), the medium
 	// fstype still shows the CD-ROM type. Check whether the RAM copy exists.
 	// filepath.Glob only fails on a malformed pattern; this one is a constant,
 	// so the error is safe to ignore.
 	files, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
 	return len(files) > 0
 }
 func (s *System) LiveBootSource() LiveBootSource {
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 			"--seconds", strconv.Itoa(opts.DurationSec),
 			"--size-mb", strconv.Itoa(opts.SizeMB),
 		}
 		if opts.StaggerSeconds > 0 && len(selected) > 1 {
 			cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
 		}
 		if len(selected) > 0 {
 			cmd = append(cmd, "--devices", joinIndexList(selected))
 		}
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 			"bee-john-gpu-stress",
 			"--seconds", strconv.Itoa(opts.DurationSec),
 		}
 		if opts.StaggerSeconds > 0 && len(selected) > 1 {
 			cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
 		}
 		if len(selected) > 0 {
 			cmd = append(cmd, "--devices", joinIndexList(selected))
 		}
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
 	}
 	_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
-	// Pack tar.gz
+	return runDir, nil
 	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
 	if err := packPlatformDir(runDir, archivePath); err != nil {
 		return "", fmt.Errorf("pack archive: %w", err)
 	}
 	_ = os.RemoveAll(runDir)
 	return archivePath, nil
 }
 // collectPhase samples live metrics every second until ctx is done.
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -1,6 +1,7 @@
 package platform
 import (
 	"bufio"
 	"os"
 	"os/exec"
 	"strings"
@@ -114,6 +115,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
 	}
 	s.collectGPURuntimeHealth(vendor, &health)
 	s.collectToRAMHealth(&health)
 	s.collectUSBExportHealth(&health)
 	if health.Status != "FAILED" && len(health.Issues) > 0 {
 		health.Status = "PARTIAL"
@@ -168,6 +171,96 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
 	return ToolStatus{Name: display}
 }
 // collectToRAMHealth records in health.ToRAMStatus whether the live medium is
 // running from RAM:
 //   - "ok": IsLiveMediaInRAM reports true.
 //   - "failed": the medium is not in RAM although toramActive reports true; a
 //     "toram_copy_failed" issue (severity "warning") is appended to health.Issues.
 //   - "warning": the medium is not in RAM and toramActive reports false.
 func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
 	mediumInRAM := s.IsLiveMediaInRAM()
 	toramRequested := toramActive()
 	if mediumInRAM {
 		health.ToRAMStatus = "ok"
 		return
 	}
 	if !toramRequested {
 		health.ToRAMStatus = "warning"
 		return
 	}
 	// toram was requested but the medium is not yet / no longer in RAM.
 	issue := schema.RuntimeIssue{
 		Code:        "toram_copy_failed",
 		Severity:    "warning",
 		Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
 	}
 	health.ToRAMStatus = "failed"
 	health.Issues = append(health.Issues, issue)
 }
 // collectUSBExportHealth stores the mount point reported by findUSBExportMount
 // in health.USBExportPath. The field ends up empty when no suitable mount was
 // found.
 func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
 	mountPoint := findUSBExportMount()
 	health.USBExportPath = mountPoint
 }
 // findUSBExportMount returns the mount point of the first writable filesystem
 // listed in /proc/mounts whose type is one expected on export drives (vfat,
 // exfat, ext2/3/4, ntfs, ntfs3, fuseblk) and whose backing whole disk reports
 // "usb" from blockDeviceTransport. Octal escapes that /proc/mounts uses for
 // spaces and similar characters are decoded in the returned path. It returns
 // "" when /proc/mounts cannot be opened or no entry qualifies.
 func findUSBExportMount() string {
 	f, err := os.Open("/proc/mounts")
 	if err != nil {
 		return ""
 	}
 	defer f.Close()
 	scanner := bufio.NewScanner(f)
 	for scanner.Scan() {
 		// fields: device mountpoint fstype options dump pass
 		fields := strings.Fields(scanner.Text())
 		if len(fields) < 4 {
 			continue
 		}
 		device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
 		if !usbExportFSType(fsType) || usbExportReadOnly(options) {
 			continue
 		}
 		// Only real block devices can have a transport.
 		if !strings.HasPrefix(device, "/dev/") {
 			continue
 		}
 		// lsblk only reports TRAN for the whole disk, not for partitions
 		// (e.g. /dev/sdc1), so query the parent disk.
 		if blockDeviceTransport(usbExportParentDisk(device)) == "usb" {
 			return usbExportUnescape(mountPoint)
 		}
 	}
 	// A read error mid-scan is treated like "nothing found": this is a
 	// best-effort probe and callers only distinguish path vs. empty.
 	return ""
 }
 // usbExportFSType reports whether fsType (case-insensitive) is a filesystem
 // type expected on a USB export drive.
 func usbExportFSType(fsType string) bool {
 	switch strings.ToLower(fsType) {
 	case "vfat", "exfat", "ext2", "ext3", "ext4", "ntfs", "ntfs3", "fuseblk":
 		return true
 	default:
 		return false
 	}
 }
 // usbExportReadOnly reports whether the comma-separated mount options contain
 // the "ro" flag.
 func usbExportReadOnly(options string) bool {
 	for _, o := range strings.Split(options, ",") {
 		if strings.TrimSpace(o) == "ro" {
 			return true
 		}
 	}
 	return false
 }
 // usbExportParentDisk maps a partition node such as /dev/sdc1 or
 // /dev/nvme0n1p2 to its whole-disk node; whole disks and paths below /dev
 // subdirectories are returned unchanged. sysfs is consulted first because disk
 // names may themselves end in digits (nvme0n1, mmcblk0); the name-based
 // fallback is only used when sysfs has no entry for the device.
 func usbExportParentDisk(device string) string {
 	name := strings.TrimPrefix(device, "/dev/")
 	if name == "" || strings.Contains(name, "/") {
 		return device
 	}
 	sysPath := "/sys/class/block/" + name
 	if _, err := os.Stat(sysPath); err == nil {
 		if _, err := os.Stat(sysPath + "/partition"); err != nil {
 			// Known to sysfs and not a partition: already a whole disk.
 			return device
 		}
 		// A partition's sysfs entry links to ".../block/<disk>/<partition>".
 		if target, err := os.Readlink(sysPath); err == nil {
 			elems := strings.Split(target, "/")
 			if n := len(elems); n >= 2 && elems[n-2] != "" && elems[n-2] != ".." {
 				return "/dev/" + elems[n-2]
 			}
 		}
 	}
 	// Best-effort naming convention: "<disk>pN" when the disk name ends in a
 	// digit, otherwise "<disk>N".
 	trimmed := strings.TrimRight(name, "0123456789")
 	if trimmed == name || trimmed == "" {
 		return device
 	}
 	if n := len(trimmed); n >= 2 && trimmed[n-1] == 'p' && trimmed[n-2] >= '0' && trimmed[n-2] <= '9' {
 		return "/dev/" + trimmed[:n-1]
 	}
 	return "/dev/" + trimmed
 }
 // usbExportUnescape decodes the three-digit octal escapes (\040 for space,
 // \011 tab, \012 newline, \134 backslash) found in /proc/mounts fields.
 // Backslashes not followed by a valid escape are kept verbatim.
 func usbExportUnescape(field string) string {
 	if !strings.Contains(field, `\`) {
 		return field
 	}
 	var b strings.Builder
 	b.Grow(len(field))
 	for i := 0; i < len(field); i++ {
 		if field[i] == '\\' && i+3 < len(field) {
 			d1, d2, d3 := field[i+1], field[i+2], field[i+3]
 			if d1 >= '0' && d1 <= '3' && d2 >= '0' && d2 <= '7' && d3 >= '0' && d3 <= '7' {
 				b.WriteByte((d1-'0')<<6 | (d2-'0')<<3 | (d3 - '0'))
 				i += 3
 				continue
 			}
 		}
 		b.WriteByte(field[i])
 	}
 	return b.String()
 }
 func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
 	lsmodText := commandText("lsmod")
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -384,22 +384,36 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 	), logFunc)
 }
-func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	selected, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
-	profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
+	var (
 		profCmd []string
 		profEnv []string
 	)
 	if staggerSec > 0 && len(selected) > 1 {
 		profCmd = []string{
 			"bee-dcgmproftester-staggered",
 			"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
 			"--stagger-seconds", strconv.Itoa(staggerSec),
 			"--devices", joinIndexList(selected),
 		}
 	} else {
 		profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
 		if err != nil {
 			return "", err
 		}
 		profEnv = nvidiaVisibleDevicesEnv(selected)
 	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
 			satJob{
 				name:       "03-dcgmproftester.log",
 				cmd:        profCmd,
-			env:        nvidiaVisibleDevicesEnv(selected),
+				env:        profEnv,
 				collectGPU: true,
 				gpuIndices: selected,
 			},
@@ -648,11 +662,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
 	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
 		return "", err
 	}
-	archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
+	return runDir, nil
 	if err := createTarGz(archive, runDir); err != nil {
 		return "", err
 	}
 	return archive, nil
 }
 type satJob struct {
@@ -838,11 +848,7 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		}
 	}
-	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
+	return runDir, nil
 	if err := createTarGz(archive, runDir); err != nil {
 		return "", err
 	}
 	return archive, nil
 }
 func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
@@ -905,7 +911,7 @@ func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPU
 			entry.Health = "UNKNOWN"
 		}
 		if entry.Name == "" {
-			entry.Name = "unknown"
+			entry.Name = "Unknown GPU"
 		}
 		var body strings.Builder
 		fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -223,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
 		return "", err
 	}
-	archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
+	return runDir, nil
 	if err := createTarGz(archive, runDir); err != nil {
 		return "", err
 	}
 	return archive, nil
 }
 func applyFanStressDefaults(opts *FanStressOptions) {
--- a/audit/internal/platform/techdump.go
+++ b/audit/internal/platform/techdump.go
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
 	{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
 	{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
 	{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
 	{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
 	{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
 	{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
 	{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
 	Loader            string
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	StaggerSeconds    int
 }
 func New() *System {
--- a/audit/internal/schema/hardware.go
+++ b/audit/internal/schema/hardware.go
@@ -22,6 +22,10 @@ type RuntimeHealth struct {
 	CUDAReady     bool                   `json:"cuda_ready,omitempty"`
 	NvidiaGSPMode string                 `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
 	NetworkStatus string                 `json:"network_status,omitempty"`
 	// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
 	ToRAMStatus   string `json:"toram_status,omitempty"`
 	// USBExportPath: mount point of the first writable USB drive found, empty if none.
 	USBExportPath string `json:"usb_export_path,omitempty"`
 	Issues        []RuntimeIssue         `json:"issues,omitempty"`
 	Tools         []RuntimeToolStatus    `json:"tools,omitempty"`
 	Services      []RuntimeServiceStatus `json:"services,omitempty"`
@@ -183,6 +187,13 @@ type HardwarePCIeDevice struct {
 	BatteryTemperatureC    *float64       `json:"battery_temperature_c,omitempty"`
 	BatteryVoltageV        *float64       `json:"battery_voltage_v,omitempty"`
 	BatteryReplaceRequired *bool          `json:"battery_replace_required,omitempty"`
 	SFPPresent             *bool          `json:"sfp_present,omitempty"`
 	SFPIdentifier          *string        `json:"sfp_identifier,omitempty"`
 	SFPConnector           *string        `json:"sfp_connector,omitempty"`
 	SFPVendor              *string        `json:"sfp_vendor,omitempty"`
 	SFPPartNumber          *string        `json:"sfp_part_number,omitempty"`
 	SFPSerialNumber        *string        `json:"sfp_serial_number,omitempty"`
 	SFPWavelengthNM        *float64       `json:"sfp_wavelength_nm,omitempty"`
 	SFPTemperatureC        *float64       `json:"sfp_temperature_c,omitempty"`
 	SFPTXPowerDBM          *float64       `json:"sfp_tx_power_dbm,omitempty"`
 	SFPRXPowerDBM          *float64       `json:"sfp_rx_power_dbm,omitempty"`
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -12,6 +12,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"sort"
 	"strconv"
 	"strings"
 	"sync/atomic"
 	"syscall"
@@ -209,6 +210,14 @@ func joinTaskIndices(indices []int) string {
 	return strings.Join(parts, ",")
 }
 // formatGPUIndexList renders GPU indices as a comma-separated decimal list,
 // e.g. [0 1 3] -> "0,1,3". An empty or nil slice yields "".
 func formatGPUIndexList(indices []int) string {
 	var out strings.Builder
 	for pos, gpu := range indices {
 		if pos > 0 {
 			out.WriteByte(',')
 		}
 		out.WriteString(strconv.Itoa(gpu))
 	}
 	return out.String()
 }
 func formatSplitTaskName(baseName, selectionLabel string) string {
 	baseName = strings.TrimSpace(baseName)
 	selectionLabel = strings.TrimSpace(selectionLabel)
@@ -487,6 +496,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 				StressMode         bool     `json:"stress_mode"`
 				GPUIndices         []int    `json:"gpu_indices"`
 				ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
 				StaggerGPUStart    bool     `json:"stagger_gpu_start"`
 				Loader             string   `json:"loader"`
 			Profile            string   `json:"profile"`
 			DisplayName        string   `json:"display_name"`
@@ -508,6 +518,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 				StressMode:         body.StressMode,
 				GPUIndices:         body.GPUIndices,
 				ExcludeGPUIndices:  body.ExcludeGPUIndices,
 				StaggerGPUStart:    body.StaggerGPUStart,
 				Loader:             body.Loader,
 			BurnProfile:        body.Profile,
 			DisplayName:        body.DisplayName,
@@ -538,6 +549,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
 		RunNCCL           *bool  `json:"run_nccl"`
 		ParallelGPUs      *bool  `json:"parallel_gpus"`
 		RampUp            *bool  `json:"ramp_up"`
 		DisplayName       string `json:"display_name"`
 	}
 	if r.Body != nil {
@@ -555,10 +567,82 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 	if body.ParallelGPUs != nil {
 		parallelGPUs = *body.ParallelGPUs
 	}
 	rampUp := false
 	if body.RampUp != nil {
 		rampUp = *body.RampUp
 	}
 	// Build a descriptive base name that includes profile and mode so the task
 	// list is self-explanatory without opening individual task detail pages.
 	profile := strings.TrimSpace(body.Profile)
 	if profile == "" {
 		profile = "standard"
 	}
 	name := taskDisplayName("nvidia-benchmark", "", "")
 	if strings.TrimSpace(body.DisplayName) != "" {
 		name = body.DisplayName
 	}
 	// Append profile tag.
 	name = fmt.Sprintf("%s · %s", name, profile)
 	if rampUp && len(body.GPUIndices) > 1 {
 		// Ramp-up mode: resolve GPU list, then create one task per prefix
 		// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
 		gpus, err := apiListNvidiaGPUs(h.opts.App)
 		if err != nil {
 			writeError(w, http.StatusBadRequest, err.Error())
 			return
 		}
 		resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
 		if err != nil {
 			writeError(w, http.StatusBadRequest, err.Error())
 			return
 		}
 		if len(resolved) < 2 {
 			// Fall through to normal single-task path.
 			rampUp = false
 		} else {
 			now := time.Now()
 			rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
 			var allTasks []*Task
 			for step := 1; step <= len(resolved); step++ {
 				subset := resolved[:step]
 				stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
 				t := &Task{
 					ID:        newJobID("benchmark-nvidia"),
 					Name:      stepName,
 					Target:    "nvidia-benchmark",
 					Priority:  15,
 					Status:    TaskPending,
 					CreatedAt: now,
 					params: taskParams{
 						GPUIndices:       append([]int(nil), subset...),
 						SizeMB:           body.SizeMB,
 						BenchmarkProfile: body.Profile,
 						RunNCCL:          runNCCL && step == len(resolved),
 						ParallelGPUs:     true,
 						RampStep:         step,
 						RampTotal:        len(resolved),
 						RampRunID:        rampRunID,
 						DisplayName:      stepName,
 					},
 				}
 				allTasks = append(allTasks, t)
 			}
 			for _, t := range allTasks {
 				globalQueue.enqueue(t)
 			}
 			writeTaskRunResponse(w, allTasks)
 			return
 		}
 	}
 	// For non-ramp tasks append mode tag.
 	if parallelGPUs {
 		name = fmt.Sprintf("%s · parallel", name)
 	} else {
 		name = fmt.Sprintf("%s · sequential", name)
 	}
 	tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
 		GPUIndices:        body.GPUIndices,
 		ExcludeGPUIndices: body.ExcludeGPUIndices,
@@ -1376,107 +1460,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
 	return nil
 }
 // ── Display / Screen Resolution ───────────────────────────────────────────────
 // displayMode is one resolution entry of a connected output as parsed from
 // xrandr output by parseXrandrOutput.
 type displayMode struct {
 	// Output is the name of the connected output the mode belongs to.
 	Output  string `json:"output"`
 	// Mode is the resolution token captured by xrandrModeRE (digits "x" digits).
 	Mode    string `json:"mode"`
 	// Current is true when the mode line contained a '*' marker.
 	Current bool   `json:"current"`
 }
 // displayInfo groups the modes parsed from xrandr output for one connected
 // output.
 type displayInfo struct {
 	// Output is the output name captured from the "<name> connected" line.
 	Output  string        `json:"output"`
 	// Modes lists the resolutions found below that line, in input order.
 	Modes   []displayMode `json:"modes"`
 	// Current is the resolution of the last mode line carrying a '*' marker;
 	// it stays empty when no line was marked.
 	Current string        `json:"current"`
 }
 // xrandrOutputRE matches an output header line and captures the output name
 // that precedes the word "connected".
 var xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
 // xrandrModeRE matches a mode line (exactly three leading whitespace
 // characters) and captures its WxH resolution.
 var xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
 // xrandrCurrentRE matches any line containing a literal '*'; parseXrandrOutput
 // uses it to flag the current mode.
 var xrandrCurrentRE = regexp.MustCompile(`\*`)
 // parseXrandrOutput converts the text printed by xrandr into one displayInfo
 // per connected output, in the order the outputs appear. Mode lines seen
 // before the first connected output are ignored. The result is nil when no
 // connected output is found.
 func parseXrandrOutput(out string) []displayInfo {
 	var outputs []displayInfo
 	for _, row := range strings.Split(out, "\n") {
 		if head := xrandrOutputRE.FindStringSubmatch(row); head != nil {
 			// A new "<name> connected" header starts the next output.
 			outputs = append(outputs, displayInfo{Output: head[1]})
 			continue
 		}
 		if len(outputs) == 0 {
 			continue
 		}
 		res := xrandrModeRE.FindStringSubmatch(row)
 		if res == nil {
 			continue
 		}
 		// Mode lines always belong to the most recently seen output.
 		active := &outputs[len(outputs)-1]
 		marked := xrandrCurrentRE.MatchString(row)
 		active.Modes = append(active.Modes, displayMode{Output: active.Output, Mode: res[1], Current: marked})
 		if marked {
 			active.Current = res[1]
 		}
 	}
 	return outputs
 }
 // xrandrCommand builds an xrandr invocation with the given arguments. The
 // command environment is a copy of the process environment; "DISPLAY=:0" and
 // "XAUTHORITY=/home/bee/.Xauthority" are appended (in that order) whenever
 // the respective variable is missing or set to an empty value.
 func xrandrCommand(args ...string) *exec.Cmd {
 	// hasValue reports whether environ holds a non-empty assignment for key.
 	hasValue := func(environ []string, key string) bool {
 		prefix := key + "="
 		for _, entry := range environ {
 			if strings.HasPrefix(entry, prefix) && len(entry) > len(prefix) {
 				return true
 			}
 		}
 		return false
 	}
 	environ := append([]string{}, os.Environ()...)
 	displaySet := hasValue(environ, "DISPLAY")
 	xauthSet := hasValue(environ, "XAUTHORITY")
 	if !displaySet {
 		environ = append(environ, "DISPLAY=:0")
 	}
 	if !xauthSet {
 		environ = append(environ, "XAUTHORITY=/home/bee/.Xauthority")
 	}
 	cmd := exec.Command("xrandr", args...)
 	cmd.Env = environ
 	return cmd
 }
 // handleAPIDisplayResolutions runs xrandr without arguments and responds with
 // the parsed list of outputs and modes as JSON. If the command fails it
 // responds with HTTP 500 and the error text prefixed by "xrandr: ".
 func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
 	raw, err := xrandrCommand().Output()
 	if err == nil {
 		writeJSON(w, parseXrandrOutput(string(raw)))
 		return
 	}
 	writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
 }
 // Validation patterns for handleAPIDisplaySet, compiled once at package
 // initialisation instead of on every request.
 var (
 	// xrandrSetModeRE accepts a WxH resolution such as "1920x1080".
 	xrandrSetModeRE = regexp.MustCompile(`^\d+x\d+$`)
 	// xrandrSetOutputRE accepts output names made of letters, digits, '_' and '-'.
 	xrandrSetOutputRE = regexp.MustCompile(`^[A-Za-z0-9_\-]+$`)
 )
 // handleAPIDisplaySet switches an output to a new resolution via
 // "xrandr --output <output> --mode <mode>".
 //
 // The JSON request body must carry non-empty "output" and "mode" fields; both
 // are validated against strict patterns before being passed to xrandr.
 // Responses: 400 for a malformed body or invalid field, 500 when xrandr fails,
 // otherwise a JSON object echoing status "ok", the output and the mode.
 func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
 	var req struct {
 		Output string `json:"output"`
 		Mode   string `json:"mode"`
 	}
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
 		writeError(w, http.StatusBadRequest, "output and mode are required")
 		return
 	}
 	// Validate mode looks like WxH to prevent injection
 	if !xrandrSetModeRE.MatchString(req.Mode) {
 		writeError(w, http.StatusBadRequest, "invalid mode format")
 		return
 	}
 	// Validate output name (no special chars)
 	if !xrandrSetOutputRE.MatchString(req.Output) {
 		writeError(w, http.StatusBadRequest, "invalid output name")
 		return
 	}
 	if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
 		detail := strings.TrimSpace(string(out))
 		if detail == "" {
 			// xrandr printed nothing (e.g. it could not be started); report the
 			// Go error rather than an empty message.
 			detail = err.Error()
 		}
 		writeError(w, http.StatusInternalServerError, "xrandr: "+detail)
 		return
 	}
 	writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
 }
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -10,30 +10,6 @@ import (
 	"bee/audit/internal/platform"
 )
 // TestXrandrCommandAddsDefaultX11Env verifies that xrandrCommand injects the
 // default DISPLAY and XAUTHORITY assignments when both variables are empty.
 func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
 	t.Setenv("DISPLAY", "")
 	t.Setenv("XAUTHORITY", "")
 	cmd := xrandrCommand("--query")
 	present := make(map[string]bool, len(cmd.Env))
 	for _, kv := range cmd.Env {
 		present[kv] = true
 	}
 	if !present["DISPLAY=:0"] {
 		t.Fatalf("DISPLAY not injected: %v", cmd.Env)
 	}
 	if !present["XAUTHORITY=/home/bee/.Xauthority"] {
 		t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
 	}
 }
 func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -83,6 +83,10 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
 		}
 	}
 	// Downsample to at most ~1400 points (one per pixel) before building SVG.
 	times, datasets = downsampleTimeSeries(times, datasets, 1400)
 	pointCount = len(times)
 	statsLabel := chartStatsLabel(datasets)
 	legendItems := []metricChartSeries{}
@@ -196,6 +200,19 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
 		}
 	}
 	// Downsample to at most ~1400 points before building SVG.
 	{
 		datasets := make([][]float64, len(series))
 		for i := range series {
 			datasets[i] = series[i].Values
 		}
 		times, datasets = downsampleTimeSeries(times, datasets, 1400)
 		pointCount = len(times)
 		for i := range series {
 			series[i].Values = datasets[i]
 		}
 	}
 	scales := make([]chartScale, len(series))
 	for i := range series {
 		min, max := chartSeriesBounds(series[i].Values)
@@ -626,6 +643,87 @@ func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end
 	b.WriteString(`</g>` + "\n")
 }
 // downsampleTimeSeries thins a time series with min/max bucketing.
 //
 // The samples are divided into maxPts/2 buckets (at least one). For each
 // bucket the indices of the smallest and largest value of the reference
 // series are kept in ascending index order. The reference series is the first
 // dataset whose length equals len(times); if there is none, the first and last
 // index of each bucket are kept instead. Every dataset of matching length is
 // sampled at the chosen indices so the series stay aligned; datasets of any
 // other length are passed through untouched.
 //
 // The inputs are returned unchanged when maxPts <= 0 or len(times) <= maxPts.
 func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
 	total := len(times)
 	if maxPts <= 0 || total <= maxPts {
 		return times, datasets
 	}
 	bucketCount := maxPts / 2
 	if bucketCount < 1 {
 		bucketCount = 1
 	}
 	// Pick the series that decides which samples survive in each bucket.
 	var reference []float64
 	for i := range datasets {
 		if len(datasets[i]) == total {
 			reference = datasets[i]
 			break
 		}
 	}
 	keep := make([]int, 0, maxPts)
 	width := float64(total) / float64(bucketCount)
 	for bucket := 0; bucket < bucketCount; bucket++ {
 		start := int(math.Round(float64(bucket) * width))
 		end := int(math.Round(float64(bucket+1) * width))
 		if end > total {
 			end = total
 		}
 		if start >= end {
 			continue
 		}
 		// Without a reference series keep the bucket's edges.
 		first, second := start, end-1
 		if reference != nil {
 			lowest, highest := start, start
 			for i := start + 1; i < end; i++ {
 				if reference[i] < reference[lowest] {
 					lowest = i
 				}
 				if reference[i] > reference[highest] {
 					highest = i
 				}
 			}
 			first, second = lowest, highest
 			if first > second {
 				// Emit the two extremes in chronological order.
 				first, second = second, first
 			}
 		}
 		keep = append(keep, first)
 		if second != first {
 			keep = append(keep, second)
 		}
 	}
 	sampledTimes := make([]time.Time, len(keep))
 	for pos, idx := range keep {
 		sampledTimes[pos] = times[idx]
 	}
 	sampledData := make([][]float64, len(datasets))
 	for d := range datasets {
 		series := datasets[d]
 		if len(series) != total {
 			sampledData[d] = series
 			continue
 		}
 		picked := make([]float64, len(keep))
 		for pos, idx := range keep {
 			picked[pos] = series[idx]
 		}
 		sampledData[d] = picked
 	}
 	return sampledTimes, sampledData
 }
 func chartXForTime(ts, start, end time.Time, left, right int) float64 {
 	if !end.After(start) {
 		return float64(left+right) / 2
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -317,106 +317,326 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
 	if err != nil {
 		return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
 	}
-	// Parse just enough fields for the summary banner
+	var ingest schema.HardwareIngestRequest
-	var snap struct {
+	if err := json.Unmarshal(data, &ingest); err != nil {
 		Summary struct {
 			CPU     struct{ Model string }
 			Memory  struct{ TotalGB float64 }
 			Storage []struct{ Device, Model, Size string }
 			GPUs    []struct{ Model string }
 			PSUs    []struct{ Model string }
 		}
 		Network struct {
 			Interfaces []struct {
 				Name  string
 				IPv4  []string
 				State string
 			}
 		}
 	}
 	// Try to extract top-level fields loosely
 	var raw map[string]json.RawMessage
 	if err := json.Unmarshal(data, &raw); err != nil {
 		return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
 	}
-	_ = snap
+	hw := ingest.Hardware
-	// Also load runtime-health for badges
+	var records []app.ComponentStatusRecord
-	type componentHealth struct {
+	if db, err := app.OpenComponentStatusDB(filepath.Join(opts.ExportDir, "component-status.json")); err == nil {
-		FailCount int `json:"fail_count"`
+		records = db.All()
 		WarnCount int `json:"warn_count"`
 	}
 	type healthSummary struct {
 		CPU     componentHealth `json:"cpu"`
 		Memory  componentHealth `json:"memory"`
 		Storage componentHealth `json:"storage"`
 		GPU     componentHealth `json:"gpu"`
 		PSU     componentHealth `json:"psu"`
 		Network componentHealth `json:"network"`
 	}
 	var health struct {
 		HardwareHealth healthSummary `json:"hardware_health"`
 	}
 	if hdata, herr := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")); herr == nil {
 		_ = json.Unmarshal(hdata, &health)
 	}
 	badge := func(h componentHealth) string {
 		if h.FailCount > 0 {
 			return `<span class="badge badge-err">FAIL</span>`
 		}
 		if h.WarnCount > 0 {
 			return `<span class="badge badge-warn">WARN</span>`
 		}
 		return `<span class="badge badge-ok">OK</span>`
 	}
 	// Extract readable strings from raw JSON
 	getString := func(key string) string {
 		v, ok := raw[key]
 		if !ok {
 			return ""
 		}
 		var s string
 		if err := json.Unmarshal(v, &s); err == nil {
 			return s
 		}
 		return ""
 	}
 	cpuModel := getString("cpu_model")
 	memStr := getString("memory_summary")
 	gpuSummary := getString("gpu_summary")
 	var b strings.Builder
 	b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
 	// Server identity block above the component table.
 	{
 		var model, serial string
 		parts := []string{}
 		if hw.Board.Manufacturer != nil && strings.TrimSpace(*hw.Board.Manufacturer) != "" {
 			parts = append(parts, strings.TrimSpace(*hw.Board.Manufacturer))
 		}
 		if hw.Board.ProductName != nil && strings.TrimSpace(*hw.Board.ProductName) != "" {
 			parts = append(parts, strings.TrimSpace(*hw.Board.ProductName))
 		}
 		if len(parts) > 0 {
 			model = strings.Join(parts, " ")
 		}
 		serial = strings.TrimSpace(hw.Board.SerialNumber)
 		if model != "" || serial != "" {
 			b.WriteString(`<div style="margin-bottom:14px">`)
 			if model != "" {
 				fmt.Fprintf(&b, `<div style="font-size:16px;font-weight:700;margin-bottom:2px">%s</div>`, html.EscapeString(model))
 			}
 			if serial != "" {
 				fmt.Fprintf(&b, `<div style="font-size:12px;color:var(--muted)">S/N: %s</div>`, html.EscapeString(serial))
 			}
 			b.WriteString(`</div>`)
 		}
 	}
 	b.WriteString(`<table style="width:auto">`)
 	writeRow := func(label, value, badgeHTML string) {
-		b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
+		b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
 			html.EscapeString(label), html.EscapeString(value), badgeHTML))
 	}
-	if cpuModel != "" {
+
-		writeRow("CPU", cpuModel, badge(health.HardwareHealth.CPU))
+	cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
-	} else {
+	writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
-		writeRow("CPU", "—", badge(health.HardwareHealth.CPU))
+
 	memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
 	writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
 	storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
 	writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
 	gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
 	writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
 	psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
 	if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
 		psuRow.Status = hwPSUStatus(hw.PowerSupplies)
 	}
-	if memStr != "" {
+	writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
-		writeRow("Memory", memStr, badge(health.HardwareHealth.Memory))
+
-	} else {
+	if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
-		writeRow("Memory", "—", badge(health.HardwareHealth.Memory))
+		writeRow("Network", nicDesc, "")
 	}
-	if gpuSummary != "" {
+
 		writeRow("GPU", gpuSummary, badge(health.HardwareHealth.GPU))
 	} else {
 		writeRow("GPU", "—", badge(health.HardwareHealth.GPU))
 	}
 	writeRow("Storage", "—", badge(health.HardwareHealth.Storage))
 	writeRow("PSU", "—", badge(health.HardwareHealth.PSU))
 	b.WriteString(`</table>`)
 	b.WriteString(`</div></div>`)
 	return b.String()
 }
 // hwDescribeCPU summarises the CPUs of the snapshot grouped by model, in
 // first-seen order, e.g. "2× Intel Xeon Gold 6338". CPUs without a model are
 // grouped as "Unknown CPU". It returns "—" when the snapshot lists no CPUs.
 func hwDescribeCPU(hw schema.HardwareSnapshot) string {
 	type group struct {
 		name string
 		qty  int
 	}
 	var groups []group
 	slot := map[string]int{}
 	for _, cpu := range hw.CPUs {
 		name := "Unknown CPU"
 		if cpu.Model != nil && *cpu.Model != "" {
 			name = *cpu.Model
 		}
 		if at, seen := slot[name]; seen {
 			groups[at].qty++
 			continue
 		}
 		slot[name] = len(groups)
 		groups = append(groups, group{name: name, qty: 1})
 	}
 	if len(groups) == 0 {
 		return "—"
 	}
 	labels := make([]string, len(groups))
 	for i, g := range groups {
 		labels[i] = g.name
 		if g.qty > 1 {
 			labels[i] = fmt.Sprintf("%d× %s", g.qty, g.name)
 		}
 	}
 	return strings.Join(labels, ", ")
 }
 // hwDescribeMemory summarises populated DIMMs grouped by (size, type) in
 // first-seen order, e.g. "16× 32 GB DDR4". DIMMs with a nil or zero size are
 // skipped. Sizes are shown in whole GB (MB divided by 1024, truncated). It
 // returns "—" when no populated DIMM is present.
 func hwDescribeMemory(hw schema.HardwareSnapshot) string {
 	type dimmKind struct {
 		sizeMB int
 		typ    string
 	}
 	type group struct {
 		kind dimmKind
 		qty  int
 	}
 	var groups []group
 	slot := map[dimmKind]int{}
 	for _, dimm := range hw.Memory {
 		if dimm.SizeMB == nil || *dimm.SizeMB == 0 {
 			continue
 		}
 		kind := dimmKind{sizeMB: *dimm.SizeMB}
 		if dimm.Type != nil {
 			kind.typ = *dimm.Type
 		}
 		if at, seen := slot[kind]; seen {
 			groups[at].qty++
 			continue
 		}
 		slot[kind] = len(groups)
 		groups = append(groups, group{kind: kind, qty: 1})
 	}
 	if len(groups) == 0 {
 		return "—"
 	}
 	labels := make([]string, len(groups))
 	for i, g := range groups {
 		label := fmt.Sprintf("%d× %d GB", g.qty, g.kind.sizeMB/1024)
 		if g.kind.typ != "" {
 			label += " " + g.kind.typ
 		}
 		labels[i] = label
 	}
 	return strings.Join(labels, ", ")
 }
 // hwDescribeStorage summarises the disks of the snapshot grouped by
 // (size, interface) in first-seen order, e.g.
 // "4× 3.84 TB NVMe, 2× 1.92 TB SATA".
 //
 // The interface label is disk.Interface when set, otherwise disk.Type. Sizes
 // of 1000 GB and above are shown in TB with up to two decimals (trailing
 // zeros dropped), smaller sizes in GB, and unknown sizes as "?". It returns
 // "—" when the snapshot lists no disks.
 func hwDescribeStorage(hw schema.HardwareSnapshot) string {
 	type key struct {
 		sizeGB int
 		iface  string
 	}
 	counts := map[key]int{}
 	order := []key{}
 	for _, disk := range hw.Storage {
 		sz := 0
 		if disk.SizeGB != nil {
 			sz = *disk.SizeGB
 		}
 		iface := ""
 		if disk.Interface != nil {
 			iface = *disk.Interface
 		} else if disk.Type != nil {
 			iface = *disk.Type
 		}
 		k := key{sz, iface}
 		if counts[k] == 0 {
 			order = append(order, k)
 		}
 		counts[k]++
 	}
 	if len(order) == 0 {
 		return "—"
 	}
 	parts := make([]string, 0, len(order))
 	for _, k := range order {
 		var sizeStr string
 		switch {
 		case k.sizeGB >= 1000:
 			// Fixed-point formatting: %.2g would keep only two significant
 			// digits ("3.8", "15") and switch to exponent form at 100 TB.
 			tb := strings.TrimRight(fmt.Sprintf("%.2f", float64(k.sizeGB)/1000), "0")
 			sizeStr = strings.TrimSuffix(tb, ".") + " TB"
 		case k.sizeGB > 0:
 			sizeStr = fmt.Sprintf("%d GB", k.sizeGB)
 		default:
 			sizeStr = "?"
 		}
 		desc := fmt.Sprintf("%d× %s", counts[k], sizeStr)
 		if k.iface != "" {
 			desc += " " + k.iface
 		}
 		parts = append(parts, desc)
 	}
 	return strings.Join(parts, ", ")
 }
 // hwDescribeGPU summarises the PCIe devices whose class passes
 // isGPUDeviceClass, grouped by model in first-seen order, e.g.
 // "8× NVIDIA H100 80GB". Devices without a class are skipped; devices without
 // a model are grouped as "Unknown GPU". It returns "—" when no GPU is found.
 func hwDescribeGPU(hw schema.HardwareSnapshot) string {
 	type group struct {
 		name string
 		qty  int
 	}
 	var groups []group
 	slot := map[string]int{}
 	for _, dev := range hw.PCIeDevices {
 		if dev.DeviceClass == nil || !isGPUDeviceClass(*dev.DeviceClass) {
 			continue
 		}
 		name := "Unknown GPU"
 		if dev.Model != nil && *dev.Model != "" {
 			name = *dev.Model
 		}
 		if at, seen := slot[name]; seen {
 			groups[at].qty++
 			continue
 		}
 		slot[name] = len(groups)
 		groups = append(groups, group{name: name, qty: 1})
 	}
 	if len(groups) == 0 {
 		return "—"
 	}
 	labels := make([]string, len(groups))
 	for i, g := range groups {
 		labels[i] = g.name
 		if g.qty > 1 {
 			labels[i] = fmt.Sprintf("%d× %s", g.qty, g.name)
 		}
 	}
 	return strings.Join(labels, ", ")
 }
 // hwPSUStatus folds the per-PSU statuses of the audit snapshot into a single
 // value: "CRITICAL" if any PSU is critical, else "WARNING" if any PSU warns,
 // else "OK" if at least one PSU reports OK, else "UNKNOWN". Statuses are
 // compared case-insensitively after trimming; nil and unrecognised statuses
 // are ignored. It serves as a fallback when component-status.json has no psu:
 // records yet (e.g. first boot before audit writes them).
 func hwPSUStatus(psus []schema.HardwarePowerSupply) string {
 	sawWarning, sawOK := false, false
 	for i := range psus {
 		raw := psus[i].Status
 		if raw == nil {
 			continue
 		}
 		normalized := strings.ToUpper(strings.TrimSpace(*raw))
 		if normalized == "CRITICAL" {
 			// Nothing can outrank a critical PSU; stop scanning.
 			return "CRITICAL"
 		}
 		if normalized == "WARNING" {
 			sawWarning = true
 		} else if normalized == "OK" {
 			sawOK = true
 		}
 	}
 	if sawWarning {
 		return "WARNING"
 	}
 	if sawOK {
 		return "OK"
 	}
 	return "UNKNOWN"
 }
 // hwDescribePSU summarises the power supplies as "N× W W" (e.g. "2× 1600 W")
 // when every PSU reports a wattage and they agree on a positive value, and as
 // "N× PSU" otherwise. It returns "—" when the snapshot lists no PSUs.
 func hwDescribePSU(hw schema.HardwareSnapshot) string {
 	total := len(hw.PowerSupplies)
 	if total == 0 {
 		return "—"
 	}
 	// Look for one wattage shared by all units. A zero reading does not fix
 	// the common value; the next unit's reading replaces it.
 	common, uniform := 0, true
 	for i := range hw.PowerSupplies {
 		rated := hw.PowerSupplies[i].WattageW
 		if rated == nil || (common != 0 && *rated != common) {
 			uniform = false
 			break
 		}
 		if common == 0 {
 			common = *rated
 		}
 	}
 	if !uniform || common <= 0 {
 		return fmt.Sprintf("%d× PSU", total)
 	}
 	return fmt.Sprintf("%d× %d W", total, common)
 }
 // hwDescribeNIC summarises network adapters among the PCIe devices, grouped
 // by label in first-seen order, e.g. "2× Mellanox ConnectX-6".
 //
 // A device counts as a NIC when its class (lower-cased, trimmed) is
 // "ethernetcontroller" or "networkcontroller", contains "fibrechannel", or
 // when it carries at least one MAC address. The label is the model if set,
 // else "<manufacturer> NIC", else "NIC". It returns "" when no NIC is found.
 func hwDescribeNIC(hw schema.HardwareSnapshot) string {
 	type group struct {
 		label string
 		qty   int
 	}
 	var groups []group
 	slot := map[string]int{}
 	for _, dev := range hw.PCIeDevices {
 		networkClass := false
 		if dev.DeviceClass != nil {
 			class := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
 			switch {
 			case class == "ethernetcontroller", class == "networkcontroller":
 				networkClass = true
 			case strings.Contains(class, "fibrechannel"):
 				networkClass = true
 			}
 		}
 		if !networkClass && len(dev.MacAddresses) == 0 {
 			continue
 		}
 		var label string
 		switch {
 		case dev.Model != nil && *dev.Model != "":
 			label = *dev.Model
 		case dev.Manufacturer != nil && *dev.Manufacturer != "":
 			label = *dev.Manufacturer + " NIC"
 		default:
 			label = "NIC"
 		}
 		if at, seen := slot[label]; seen {
 			groups[at].qty++
 			continue
 		}
 		slot[label] = len(groups)
 		groups = append(groups, group{label: label, qty: 1})
 	}
 	if len(groups) == 0 {
 		return ""
 	}
 	labels := make([]string, len(groups))
 	for i, g := range groups {
 		labels[i] = g.label
 		if g.qty > 1 {
 			labels[i] = fmt.Sprintf("%d× %s", g.qty, g.label)
 		}
 	}
 	return strings.Join(labels, ", ")
 }
 // isGPUDeviceClass reports whether class, after trimming surrounding
 // whitespace, is exactly "VideoController", "DisplayController" or
 // "ProcessingAccelerator". The comparison is case-sensitive.
 func isGPUDeviceClass(class string) bool {
 	name := strings.TrimSpace(class)
 	return name == "VideoController" ||
 		name == "DisplayController" ||
 		name == "ProcessingAccelerator"
 }
 func renderAuditModal() string {
 	return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
  <div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:1100px;width:min(1100px,92vw);max-height:92vh;overflow:auto;position:relative">
@@ -481,8 +701,9 @@ func renderHealthCard(opts HandlerOptions) string {
 		buildRuntimeAccelerationRow(health),
 		buildRuntimeToolsRow(health),
 		buildRuntimeServicesRow(health),
 		buildRuntimeUSBExportRow(health),
 		buildRuntimeToRAMRow(health),
 	}
 	rows = append(rows, buildHardwareComponentRows(opts.ExportDir)...)
 	b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`)
 	for _, row := range rows {
 		b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`)
@@ -578,7 +799,13 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
 	nonActive := make([]string, 0)
 	for _, svc := range health.Services {
 		state := strings.TrimSpace(strings.ToLower(svc.Status))
-		if state != "active" {
+		// "activating" and "deactivating" are transient states for oneshot services
 		// (RemainAfterExit=yes) — the service is running normally, not failed.
 		// Only "failed" and "inactive" (after services should be running) are problems.
 		switch state {
 		case "active", "activating", "deactivating", "reloading":
 			// OK — service is running or transitioning normally
 		default:
 			nonActive = append(nonActive, svc.Name+"="+svc.Status)
 		}
 	}
@@ -591,6 +818,51 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
 	return runtimeHealthRow{Title: "Bee Services", Status: status, Source: "ServiceState", Issue: issue}
 }
 // buildRuntimeUSBExportRow produces the "USB Export Drive" health row.
 // When health.USBExportPath is non-blank the row is OK and carries the
 // trimmed path in Issue; otherwise it is a WARNING asking for a USB drive.
 func buildRuntimeUSBExportRow(health schema.RuntimeHealth) runtimeHealthRow {
 	// Start from the "nothing mounted" row and upgrade it if a path is set.
 	row := runtimeHealthRow{
 		Title:  "USB Export Drive",
 		Status: "WARNING",
 		Source: "/proc/mounts + lsblk",
 		Issue:  "No writable USB drive mounted. Plug in a USB drive to enable log export.",
 	}
 	if mount := strings.TrimSpace(health.USBExportPath); mount != "" {
 		row.Status = "OK"
 		row.Issue = mount
 	}
 	return row
 }
 // buildRuntimeToRAMRow produces the "LiveCD in RAM" health row from
 // health.ToRAMStatus (compared case-insensitively, ignoring surrounding
 // whitespace): "ok" yields OK with no issue text, "failed" yields FAILED,
 // and any other value yields a WARNING suggesting "Copy to RAM".
 func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow {
 	row := runtimeHealthRow{
 		Title:  "LiveCD in RAM",
 		Source: "live-boot / /proc/mounts",
 	}
 	state := strings.ToLower(strings.TrimSpace(health.ToRAMStatus))
 	if state == "ok" {
 		row.Status = "OK"
 		return row
 	}
 	if state == "failed" {
 		row.Status = "FAILED"
 		row.Issue = "toram boot parameter set but ISO is not mounted from RAM. Copy may have failed."
 		return row
 	}
 	// Any other status is treated as "not running from RAM": the ISO is
 	// still being read from the original boot media.
 	row.Status = "WARNING"
 	row.Issue = "ISO not copied to RAM. Use \u201cCopy to RAM\u201d to free the boot drive and improve performance."
 	return row
 }
 func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
 	path := filepath.Join(exportDir, "component-status.json")
 	db, err := app.OpenComponentStatusDB(path)
@@ -1031,25 +1303,23 @@ func renderValidate(opts HandlerOptions) string {
 	return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
-<div class="card" style="margin-bottom:16px">
+	<div class="card" style="margin-bottom:16px">
 	  <div class="card-head">Validate Profile</div>
 	  <div class="card-body validate-profile-body">
 	    <div class="validate-profile-col">
      <div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
 	      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
 	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
 	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
 	    </div>
 	    <div class="validate-profile-col validate-profile-action">
 	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
-      <button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
+	      <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
-    </div>
+	      <div style="margin-top:12px">
    <div class="validate-profile-col"></div>
  </div>
  <div class="card-body" style="padding-top:0;display:flex;justify-content:center">
 	        <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
 	      </div>
-</div>
+	    </div>
 	  </div>
 	</div>
 <div class="grid3">
 ` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
@@ -1085,12 +1355,6 @@ func renderValidate(opts HandlerOptions) string {
      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
    </div>
    <p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
    <div style="margin-top:10px;padding-top:10px;border-top:1px solid var(--border)">
      <label class="sat-gpu-row" title="When checked, multi-GPU tests (PSU Pulse, NCCL, NVBandwidth) run on ALL GPUs in the system regardless of the selection above.">
        <input type="checkbox" id="sat-multi-gpu-all" checked onchange="satUpdateGPUSelectionNote()">
        <span><strong>Multi-GPU tests</strong> — use all GPUs <span style="font-size:11px;color:var(--muted)">(PSU Pulse, NCCL, NVBandwidth)</span></span>
      </label>
    </div>
  </div>
 </div>
@@ -1143,16 +1407,6 @@ func renderValidate(opts HandlerOptions) string {
 		`</div>` +
 		`</div>
 <div class="grid3" style="margin-top:16px">
 ` + `<div id="sat-card-hpl">` +
 		renderSATCard("hpl", "LINPACK (HPL)", "runSAT('hpl')", "", renderValidateCardBody(
 			``,
 			`Standard High Performance LINPACK benchmark. Measures sustained FP64 GFLOPS and memory bandwidth of the CPU subsystem. Uses 80% of available RAM. Pass/fail based on HPL residual check.`,
 			`<code>xhpl</code> (HPL 2.3, OpenBLAS)`,
 			`Skipped in Validate mode. Runs in Stress mode only. Runtime scales with RAM — expect 5–30 min.<p id="sat-hpl-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
 		`</div>` +
 		`</div>
 <div class="grid3" style="margin-top:16px">
 ` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
 		inv.AMD,
 		`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
@@ -1166,7 +1420,7 @@ func renderValidate(opts HandlerOptions) string {
 </div>
 <style>
 .validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
-.validate-profile-col { min-width:0; }
+.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
 .validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
 .validate-card-body { padding:0; }
 .validate-card-section { padding:12px 16px 0; }
@@ -1188,7 +1442,6 @@ function satModeChanged() {
    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
    {card: 'sat-card-nvidia-interconnect',    hint: 'sat-ni-mode-hint'},
    {card: 'sat-card-nvidia-bandwidth',       hint: 'sat-nb-mode-hint'},
    {card: 'sat-card-hpl',                    hint: 'sat-hpl-mode-hint'},
  ].forEach(function(item) {
    const card = document.getElementById(item.card);
    if (card) {
@@ -1199,7 +1452,7 @@ function satModeChanged() {
  });
 }
 function satLabels() {
-  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', hpl:'LINPACK (HPL)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
 }
 let satNvidiaGPUsPromise = null;
 function loadSatNvidiaGPUs() {
@@ -1220,10 +1473,6 @@ function satSelectedGPUIndices() {
    .filter(function(v) { return !Number.isNaN(v); })
    .sort(function(a, b) { return a - b; });
 }
 function satMultiGPUAll() {
  const cb = document.getElementById('sat-multi-gpu-all');
  return cb ? cb.checked : true;
 }
 function satUpdateGPUSelectionNote() {
  const note = document.getElementById('sat-gpu-selection-note');
  if (!note) return;
@@ -1232,8 +1481,7 @@ function satUpdateGPUSelectionNote() {
    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
    return;
  }
-  const multiAll = satMultiGPUAll();
+  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests: ' + (multiAll ? 'all GPUs in system' : 'selected GPUs only') + '.';
 }
 function satRenderGPUList(gpus) {
  const root = document.getElementById('sat-gpu-list');
@@ -1347,15 +1595,8 @@ const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targete
 // pulse_test and fabric tests run on all selected GPUs simultaneously
 const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
 function satAllGPUIndicesForMulti() {
-  // If "Multi-GPU tests — all GPUs" is checked, return all detected GPUs.
+  // Multi-GPU tests always use the current GPU selection.
-  // Otherwise fall back to the per-GPU selection.
+  return Promise.resolve(satSelectedGPUIndices());
  if (satMultiGPUAll()) {
    return loadSatNvidiaGPUs().then(function(gpus) {
      return gpus.map(function(g) { return Number(g.index); });
    });
  }
  const sel = satSelectedGPUIndices();
  return Promise.resolve(sel);
 }
 function expandSATTarget(target) {
  if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
@@ -1445,11 +1686,11 @@ function runAMDValidateSet() {
  return runNext(0);
 }
 function runAllSAT() {
-  const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
+  const cycles = 1;
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth', 'hpl'];
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
-  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','hpl','memory','storage','cpu'].concat(selectedAMDValidateTargets());
+  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
    const btn = document.getElementById('sat-btn-' + target);
@@ -1623,6 +1864,11 @@ func formatValidateDeviceSummary(total int, models map[string]int, unit string)
 	if total != 1 {
 		label += "s"
 	}
 	// If there is only one model the leading count duplicates the per-model
 	// count already in parts (e.g. "4 GPU: 4 x RTX …" → "4 x RTX …").
 	if len(parts) == 1 {
 		return parts[0] + " " + label
 	}
 	return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
 }
@@ -1727,12 +1973,16 @@ func renderBenchmark(opts HandlerOptions) string {
        </div>
      </div>
      <label class="benchmark-cb-row">
-        <input type="checkbox" id="benchmark-parallel-gpus">
+        <input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
-        <span>Run all selected GPUs simultaneously (parallel mode)</span>
+        <span>Sequential — one GPU at a time</span>
      </label>
-      <label class="benchmark-cb-row">
+      <label class="benchmark-cb-row" id="benchmark-parallel-label">
-        <input type="checkbox" id="benchmark-run-nccl" checked>
+        <input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
-        <span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
+        <span>Parallel — all selected GPUs simultaneously</span>
      </label>
      <label class="benchmark-cb-row" id="benchmark-ramp-label">
        <input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
        <span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
      </label>
      <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
      <button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>&#9654; Run Benchmark</button>
@@ -1785,22 +2035,28 @@ function benchmarkSelectedGPUIndices() {
    .sort(function(a, b) { return a - b; });
 }
 function benchmarkMode() {
  const el = document.querySelector('input[name="benchmark-mode"]:checked');
  return el ? el.value : 'sequential';
 }
 function benchmarkUpdateSelectionNote() {
  const selected = benchmarkSelectedGPUIndices();
  const btn = document.getElementById('benchmark-run-btn');
  const note = document.getElementById('benchmark-selection-note');
  const nccl = document.getElementById('benchmark-run-nccl');
  if (!selected.length) {
    btn.disabled = true;
    note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
    return;
  }
  btn.disabled = false;
-  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '.';
+  const mode = benchmarkMode();
-  if (nccl && nccl.checked && selected.length < 2) {
+  if (mode === 'ramp-up') {
-    note.textContent += ' NCCL will be skipped because fewer than 2 GPUs are selected.';
+    note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL on final step.';
-  } else if (nccl && nccl.checked) {
+  } else if (mode === 'parallel') {
-    note.textContent += ' NCCL interconnect will use only these GPUs.';
+    note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously.' + (selected.length > 1 ? ' NCCL included.' : '');
  } else {
    note.textContent = 'Sequential: each GPU benchmarked separately.' + (selected.length > 1 ? ' NCCL included on each.' : '');
  }
 }
@@ -1818,6 +2074,33 @@ function benchmarkRenderGPUList(gpus) {
      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
      + '</label>';
  }).join('');
  benchmarkApplyMultiGPUState(gpus.length);
  benchmarkUpdateSelectionNote();
 }
 // Disable radio options that require multiple GPUs when only one is present.
 function benchmarkApplyMultiGPUState(gpuCount) {
  var multiValues = ['parallel', 'ramp-up'];
  var radios = document.querySelectorAll('input[name="benchmark-mode"]');
  radios.forEach(function(el) {
    var isMulti = multiValues.indexOf(el.value) >= 0;
    if (gpuCount < 2 && isMulti) {
      el.disabled = true;
      if (el.checked) {
        // fall back to sequential
        var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
        if (seq) seq.checked = true;
      }
      var label = el.closest('label');
      if (label) label.style.opacity = '0.4';
    } else {
      el.disabled = false;
      // restore default: ramp-up checked when ≥2 GPUs
      if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
      var label = el.closest('label');
      if (label) label.style.opacity = '';
    }
  });
  benchmarkUpdateSelectionNote();
 }
@@ -1855,12 +2138,15 @@ function runNvidiaBenchmark() {
    return;
  }
  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
-  const parallelGPUs = !!document.getElementById('benchmark-parallel-gpus').checked;
+  const mode = benchmarkMode();
  const rampUp = mode === 'ramp-up' && selected.length > 1;
  const parallelGPUs = mode === 'parallel';
  const body = {
    profile: document.getElementById('benchmark-profile').value || 'standard',
    gpu_indices: selected,
-    run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
+    run_nccl: selected.length > 1,
    parallel_gpus: parallelGPUs,
    ramp_up: rampUp,
    display_name: 'NVIDIA Benchmark'
  };
  document.getElementById('benchmark-output').style.display = 'block';
@@ -1915,7 +2201,6 @@ function runNvidiaBenchmark() {
  });
 }
 document.getElementById('benchmark-run-nccl').addEventListener('change', benchmarkUpdateSelectionNote);
 benchmarkLoadGPUs();
 </script>`
 }
@@ -2106,11 +2391,11 @@ func renderBurn() string {
      <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
    </div>
    <div class="burn-profile-col burn-profile-action">
-      <button class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
+      <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
      <p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
    </div>
    <div class="burn-profile-col burn-profile-action">
-      <button class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
+      <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
      <p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
    </div>
  </div>
@@ -2131,8 +2416,22 @@ func renderBurn() string {
 	      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
 	    </div>
 	    <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
 	    <div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
 	      <label class="cb-row">
 	        <input type="radio" name="burn-nvidia-mode" value="sequential" checked>
 	        <span>Sequential — selected GPUs one at a time</span>
 	      </label>
 	      <label class="cb-row" id="burn-parallel-label">
 	        <input type="radio" name="burn-nvidia-mode" value="parallel">
 	        <span>Parallel — all selected GPUs simultaneously</span>
 	      </label>
 	      <label class="cb-row" id="burn-ramp-label">
 	        <input type="radio" name="burn-nvidia-mode" value="ramp-up">
 	        <span>Ramp-up — add one GPU at a time</span>
 	      </label>
 	    </div>
 	  </div>
 	</div>
 </div>
 <div class="burn-section">Core Burn Paths</div>
 <div class="grid2 burn-grid" style="margin-bottom:16px">
@@ -2158,10 +2457,6 @@ func renderBurn() string {
 </div>
 </div>
 <div class="burn-section">GPU-Specific Tests</div>
 <div class="grid2 burn-grid" style="margin-bottom:16px">
 </div>
 <div id="bi-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Output <span id="bi-title"></span></div>
  <div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
@@ -2210,6 +2505,32 @@ function burnSelectedGPUIndices() {
    .sort(function(a, b) { return a - b; });
 }
 function burnNvidiaMode() {
  const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
  return el ? el.value : 'sequential';
 }
 function burnApplyMultiGPUState(gpuCount) {
  var multiValues = ['parallel', 'ramp-up'];
  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
  radios.forEach(function(el) {
    var isMulti = multiValues.indexOf(el.value) >= 0;
    if (gpuCount < 2 && isMulti) {
      el.disabled = true;
      if (el.checked) {
        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
        if (seq) seq.checked = true;
      }
      var label = el.closest('label');
      if (label) label.style.opacity = '0.4';
    } else {
      el.disabled = false;
      var label = el.closest('label');
      if (label) label.style.opacity = '';
    }
  });
 }
 function burnUpdateSelectionNote() {
  const note = document.getElementById('burn-selection-note');
  const selected = burnSelectedGPUIndices();
@@ -2234,6 +2555,7 @@ function burnRenderGPUList(gpus) {
      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
      + '</label>';
  }).join('');
  burnApplyMultiGPUState(gpus.length);
  burnUpdateSelectionNote();
 }
@@ -2269,6 +2591,12 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
    }
    body.gpu_indices = selected;
    const bMode = burnNvidiaMode();
    if (bMode === 'ramp-up' && selected.length > 1) {
      body.stagger_gpu_start = true;
    } else if (bMode === 'parallel' && selected.length > 1) {
      body.parallel_gpus = true;
    }
  }
  return fetch('/api/sat/' + target + '/run', {
    method: 'POST',
@@ -2860,56 +3188,6 @@ usbRefresh();
 </script>`
 }
 // ── Display Resolution ────────────────────────────────────────────────────────
 // renderDisplayInline returns the static HTML and inline script for the
 // display-resolution controls. The script fetches /api/display/resolutions,
 // renders one mode <select> plus an Apply button per reported display, and
 // exposes window.applyResolution, which POSTs the chosen output/mode as JSON
 // to /api/display/set and then reloads the list. When the fetch fails or no
 // displays are returned, a status message is shown instead of controls.
 func renderDisplayInline() string {
 	return `<div id="display-status" style="color:var(--muted);font-size:13px;margin-bottom:12px">Loading displays...</div>
 <div id="display-controls"></div>
 <script>
 (function(){
 function loadDisplays() {
   fetch('/api/display/resolutions').then(r=>r.json()).then(displays => {
     const status = document.getElementById('display-status');
     const ctrl = document.getElementById('display-controls');
     if (!displays || displays.length === 0) {
       status.textContent = 'No connected displays found or xrandr not available.';
       return;
     }
     status.textContent = '';
     ctrl.innerHTML = displays.map(d => {
       const opts = (d.modes||[]).map(m =>
         '<option value="'+m.mode+'"'+(m.current?' selected':'')+'>'+m.mode+(m.current?' (current)':'')+'</option>'
       ).join('');
       return '<div style="margin-bottom:12px">'
         +'<span style="font-weight:600;margin-right:8px">'+d.output+'</span>'
         +'<span style="color:var(--muted);font-size:12px;margin-right:12px">Current: '+d.current+'</span>'
         +'<select id="res-sel-'+d.output+'" style="margin-right:8px">'+opts+'</select>'
         +'<button class="btn btn-sm btn-primary" onclick="applyResolution(\''+d.output+'\')">Apply</button>'
         +'</div>';
     }).join('');
   }).catch(()=>{
     document.getElementById('display-status').textContent = 'xrandr not available on this system.';
   });
 }
 window.applyResolution = function(output) {
   const sel = document.getElementById('res-sel-'+output);
   if (!sel) return;
   const mode = sel.value;
   const btn = sel.nextElementSibling;
   btn.disabled = true;
   btn.textContent = 'Applying...';
   fetch('/api/display/set', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({output:output,mode:mode})})
     .then(r=>r.json()).then(d=>{
       if (d.error) { alert('Error: '+d.error); }
       loadDisplays();
     }).catch(e=>{ alert('Error: '+e); })
     .finally(()=>{ btn.disabled=false; btn.textContent='Apply'; });
 };
 loadDisplays();
 })();
 </script>`
 }
 func renderNvidiaSelfHealInline() string {
 	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
 <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
@@ -3097,8 +3375,6 @@ function installToRAM() {
 <div class="card"><div class="card-head">Services</div><div class="card-body">` +
 		renderServicesInline() + `</div></div>
 <div class="card"><div class="card-head">Display Resolution</div><div class="card-body">` +
 		renderDisplayInline() + `</div></div>
 <script>
 function checkTools() {
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -295,10 +295,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// Tools
 	mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
 	// Display
 	mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
 	mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
 	mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -1094,6 +1094,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
 	}
 	body := rec.Body.String()
 	for _, needle := range []string{
 		// Runtime Health card — LiveCD checks only
 		`Runtime Health`,
 		`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
 		`Export Directory`,
@@ -1102,16 +1103,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
 		`CUDA / ROCm`,
 		`Required Utilities`,
 		`Bee Services`,
 		`<td>CPU</td>`,
 		`<td>Memory</td>`,
 		`<td>Storage</td>`,
 		`<td>GPU</td>`,
 		`CUDA runtime is not ready for GPU SAT.`,
 		`Missing: nvidia-smi`,
 		`bee-nvidia=inactive`,
-		`cpu SAT: FAILED`,
+		// Hardware Summary card — component health badges
-		`storage SAT: FAILED`,
+		`Hardware Summary`,
-		`sat:nvidia`,
+		`>CPU<`,
 		`>Memory<`,
 		`>Storage<`,
 		`>GPU<`,
 		`>PSU<`,
 		`badge-warn`,   // cpu Warning badge
 		`badge-err`,    // storage Critical badge
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("dashboard missing %q: %s", needle, body)
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -39,7 +39,6 @@ var taskNames = map[string]string{
 	"nvidia-interconnect":    "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
 	"nvidia-bandwidth":       "NVIDIA Bandwidth Test (NVBandwidth)",
 	"nvidia-stress":          "NVIDIA GPU Stress",
 	"hpl":                    "LINPACK (HPL)",
 	"memory":                 "Memory SAT",
 	"storage":                "Storage SAT",
 	"cpu":                    "CPU SAT",
@@ -119,6 +118,7 @@ type taskParams struct {
 	StressMode         bool     `json:"stress_mode,omitempty"`
 	GPUIndices         []int    `json:"gpu_indices,omitempty"`
 	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
 	StaggerGPUStart    bool     `json:"stagger_gpu_start,omitempty"`
 	SizeMB             int      `json:"size_mb,omitempty"`
 	Passes             int      `json:"passes,omitempty"`
 	Loader             string   `json:"loader,omitempty"`
@@ -126,6 +126,9 @@ type taskParams struct {
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	RampStep           int      `json:"ramp_step,omitempty"`
 	RampTotal          int      `json:"ramp_total,omitempty"`
 	RampRunID          string   `json:"ramp_run_id,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
 	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
@@ -152,6 +155,12 @@ type burnPreset struct {
 	DurationSec int
 }
 // nvidiaRampSpec is the timing plan computed by resolveNvidiaRampPlan for an
 // NVIDIA burn run. All values are in seconds.
 type nvidiaRampSpec struct {
 	// DurationSec is the per-run burn duration; with staggering enabled the
 	// task log reports it as the "post-ramp hold".
 	DurationSec      int
 	// StaggerSeconds is the delay applied per additional GPU; it stays zero
 	// when staggering is disabled or only one GPU is selected.
 	StaggerSeconds   int
 	// TotalDurationSec is the overall runtime including the ramp-up phase.
 	TotalDurationSec int
 }
 func resolveBurnPreset(profile string) burnPreset {
 	switch profile {
 	case "overnight":
@@ -163,6 +172,45 @@ func resolveBurnPreset(profile string) burnPreset {
 	}
 }
 // resolveNvidiaRampPlan derives the timing plan for an NVIDIA burn run.
 //
 // The plan starts from the burn preset duration for profile. If staggering is
 // not enabled, or exactly one GPU is selected, that plan is returned with no
 // stagger. Enabling staggering with an empty selection is an error.
 //
 // With two or more GPUs the stagger per GPU depends on the profile:
 //   - "acceptance": 10 minutes; total = preset duration + stagger*(GPUs-1).
 //   - "overnight": 1 hour; total is 8 hours, raised to one hour per GPU when
 //     that is larger, and rejected above 10 hours. The hold duration is the
 //     total minus the ramp time.
 //   - any other profile: 2 minutes; total = preset duration + stagger*(GPUs-1).
 func resolveNvidiaRampPlan(profile string, enabled bool, selected []int) (nvidiaRampSpec, error) {
 	const (
 		minute = 60
 		hour   = 60 * minute
 	)
 	hold := resolveBurnPreset(profile).DurationSec
 	spec := nvidiaRampSpec{DurationSec: hold, TotalDurationSec: hold}
 	if !enabled {
 		return spec, nil
 	}
 	gpus := len(selected)
 	switch gpus {
 	case 0:
 		return nvidiaRampSpec{}, fmt.Errorf("staggered NVIDIA burn requires explicit GPU selection")
 	case 1:
 		// A single GPU has nothing to ramp; keep the plain preset plan.
 		return spec, nil
 	}
 	rampSteps := gpus - 1
 	if profile == "overnight" {
 		total := 8 * hour
 		if floor := gpus * hour; total < floor {
 			total = floor
 		}
 		if total > 10*hour {
 			return nvidiaRampSpec{}, fmt.Errorf("overnight staggered NVIDIA burn supports at most 10 GPUs")
 		}
 		spec.StaggerSeconds = hour
 		spec.TotalDurationSec = total
 		spec.DurationSec = total - hour*rampSteps
 		return spec, nil
 	}
 	spec.StaggerSeconds = 2 * minute
 	if profile == "acceptance" {
 		spec.StaggerSeconds = 10 * minute
 	}
 	spec.TotalDurationSec = spec.DurationSec + spec.StaggerSeconds*rampSteps
 	return spec, nil
 }
 func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
 	acceptanceCycles := []platform.PlatformStressCycle{
 		{LoadSec: 85, IdleSec: 5},
@@ -592,6 +640,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
 			RunNCCL:           t.params.RunNCCL,
 			ParallelGPUs:      t.params.ParallelGPUs,
 			RampStep:          t.params.RampStep,
 			RampTotal:         t.params.RampTotal,
 			RampRunID:         t.params.RampRunID,
 		}, j.append)
 	case "nvidia-compute":
 		if a == nil {
@@ -602,7 +653,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
-		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
+		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
 		if planErr != nil {
 			err = planErr
 			break
 		}
 		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
 			dur = rampPlan.DurationSec
 		}
 		if rampPlan.StaggerSeconds > 0 {
 			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
 		}
 		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
 	case "nvidia-targeted-power":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -652,11 +714,23 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
 		if planErr != nil {
 			err = planErr
 			break
 		}
 		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
 			dur = rampPlan.DurationSec
 		}
 		if rampPlan.StaggerSeconds > 0 {
 			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
 		}
 		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
 			DurationSec:       dur,
 			Loader:            t.params.Loader,
 			GPUIndices:        t.params.GPUIndices,
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
 			StaggerSeconds:    rampPlan.StaggerSeconds,
 		}, j.append)
 	case "memory":
 		if a == nil {
@@ -740,19 +814,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
 	case "hpl":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
 		opts := platform.HPLOptions{
 			MemFraction: 0.80,
 			NB:          256,
 		}
 		archive, err = func() (string, error) {
 			path, _, runErr := a.RunHPL(ctx, "", opts, j.append)
 			return path, runErr
 		}()
 	case "platform-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -491,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) {
 	}
 }
 // TestResolveNvidiaRampPlan runs resolveNvidiaRampPlan over a table of burn
 // profiles. Rows cover the disabled path, the per-profile stagger step, the
 // overnight total-duration handling, and the two rejection cases; a row with a
 // non-empty wantErr only checks that the error contains that substring.
 func TestResolveNvidiaRampPlan(t *testing.T) {
 	const (
 		minute = 60
 		hour   = 60 * minute
 	)
 	type rampCase struct {
 		name     string
 		profile  string
 		enabled  bool
 		selected []int
 		want     nvidiaRampSpec
 		wantErr  string
 	}
 	cases := []rampCase{
 		{
 			name:     "disabled uses base preset",
 			profile:  "acceptance",
 			selected: []int{0, 1},
 			want:     nvidiaRampSpec{DurationSec: hour, TotalDurationSec: hour},
 		},
 		{
 			name:     "smoke ramp uses two minute steps",
 			profile:  "smoke",
 			enabled:  true,
 			selected: []int{0, 1, 2},
 			want:     nvidiaRampSpec{DurationSec: 5 * minute, StaggerSeconds: 2 * minute, TotalDurationSec: 9 * minute},
 		},
 		{
 			name:     "acceptance ramp uses ten minute steps",
 			profile:  "acceptance",
 			enabled:  true,
 			selected: []int{0, 1, 2},
 			want:     nvidiaRampSpec{DurationSec: hour, StaggerSeconds: 10 * minute, TotalDurationSec: 80 * minute},
 		},
 		{
 			name:     "overnight stays at eight hours when possible",
 			profile:  "overnight",
 			enabled:  true,
 			selected: []int{0, 1, 2},
 			want:     nvidiaRampSpec{DurationSec: 6 * hour, StaggerSeconds: hour, TotalDurationSec: 8 * hour},
 		},
 		{
 			name:     "overnight extends to keep one hour after final gpu",
 			profile:  "overnight",
 			enabled:  true,
 			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
 			want:     nvidiaRampSpec{DurationSec: hour, StaggerSeconds: hour, TotalDurationSec: 9 * hour},
 		},
 		{
 			name:     "overnight rejects impossible gpu count",
 			profile:  "overnight",
 			enabled:  true,
 			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
 			wantErr:  "at most 10 GPUs",
 		},
 		{
 			name:    "enabled requires explicit selection",
 			profile: "smoke",
 			enabled: true,
 			wantErr: "requires explicit GPU selection",
 		},
 	}
 	for _, c := range cases {
 		t.Run(c.name, func(t *testing.T) {
 			got, err := resolveNvidiaRampPlan(c.profile, c.enabled, c.selected)
 			switch {
 			case c.wantErr != "":
 				// Error rows: only the message substring matters.
 				if err == nil || !strings.Contains(err.Error(), c.wantErr) {
 					t.Fatalf("err=%v want substring %q", err, c.wantErr)
 				}
 			case err != nil:
 				t.Fatalf("resolveNvidiaRampPlan error: %v", err)
 			case got != c.want:
 				t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", c.profile, c.enabled, c.selected, got, c.want)
 			}
 		})
 	}
 }
 func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
 	tests := []struct {
 		loader string
--- a/bible-local/docs/gpu-model-propagation.md
+++ b/bible-local/docs/gpu-model-propagation.md
@@ -0,0 +1,117 @@
 # GPU Model Name Propagation
 How GPU model names are detected, stored, and displayed throughout the project.
 ---
 ## Detection Sources
 There are **three separate pipelines** for GPU model names — they use different structs and don't share state.
 ### Pipeline A — Live / SAT (nvidia-smi query at runtime)
 **File:** `audit/internal/platform/sat.go`
 - `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
 - `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
 - Used by: GPU selection UI, live metrics labels, burn/stress test logic
 ### Pipeline B — Benchmark results
 **File:** `audit/internal/platform/benchmark.go`, line 124
 - `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
 - Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
 - Used by: benchmark history table, benchmark report
 ### Pipeline C — Hardware audit JSON (PCIe schema)
 **File:** `audit/internal/schema/hardware.go`
 - `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
 - For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
 - For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
 - Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
 ---
 ## Key Inconsistency: NVIDIA PCIe Model is Never Set
 `audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
 This means:
 - Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
 - AMD GPUs do have their model populated
 The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
 ---
 ## Benchmark History "Unknown GPU" Issue
 **Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
 **Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
 This happens for:
 - Older result files saved before the `Name` field was added
 - Runs where nvidia-smi query failed before the benchmark started
 ---
 ## Fallback Strings — Current State
 | Location | File | Fallback string |
 |---|---|---|
 | Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
 | Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
 | Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
 | Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
 | Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
 | Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
 | SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
 | GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
 **Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
 ---
 ## GPU Selection UI
 **File:** `audit/internal/webui/pages.go`
 - Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
 - Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
 - Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
 This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
 ---
 ## Data Flow Summary
 ```
 nvidia-smi (live)
  └─ ListNvidiaGPUs() → NvidiaGPU.Name
       ├─ GPU selection UI (always correct)
       ├─ Live metrics labels (charts_svg.go)
       └─ SAT/burn status file (sat.go)
 nvidia-smi (at benchmark start)
  └─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
       └─ BenchmarkGPUResult.Name (json:"name,omitempty")
            ├─ Benchmark report
            └─ Benchmark history table columns
 nvidia-smi / lspci (audit collection)
  └─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
       └─ Hardware summary page hwDescribeGPU()
 ```
 ---
 ## What Needs Fixing
 1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
 2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
 3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -19,7 +19,5 @@ ROCRAND_VERSION=3.2.0.60304-76~22.04
 HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
 HIPBLASLT_VERSION=0.10.0.60304-76~22.04
 COMGR_VERSION=2.8.0.60304-76~22.04
 HPL_VERSION=2.3
 HPL_SHA256=32c5c17d22330e6f2337b681aded51637fb6008d3f0eb7c277b163fadd612830
 GO_VERSION=1.24.0
 AUDIT_VERSION=1.0.0
--- a/iso/builder/build-hpl.sh
+++ b/iso/builder/build-hpl.sh
@@ -1,331 +0,0 @@
 #!/bin/sh
 # build-hpl.sh — build HPL (High Performance LINPACK) for the bee LiveCD.
 #
 # Downloads HPL 2.3 from netlib, downloads OpenBLAS runtime from the Debian 12
 # apt repo, and compiles xhpl using a minimal single-process MPI stub so that
 # no MPI package is required inside the ISO.
 #
 # The resulting xhpl binary is a standard HPL binary whose output is compatible
 # with the accepted HPL format (WR... Gflops lines).
 #
 # Output:
 #   $CACHE_DIR/bin/xhpl
 #   $CACHE_DIR/lib/libopenblas.so*   (runtime, injected into ISO /usr/lib/)
 set -e
 HPL_VERSION="$1"
 HPL_SHA256="$2"
 DIST_DIR="$3"
 [ -n "$HPL_VERSION" ] || { echo "usage: $0 <hpl-version> <sha256> <dist-dir>"; exit 1; }
 [ -n "$HPL_SHA256"  ] || { echo "usage: $0 <hpl-version> <sha256> <dist-dir>"; exit 1; }
 [ -n "$DIST_DIR"    ] || { echo "usage: $0 <hpl-version> <sha256> <dist-dir>"; exit 1; }
 echo "=== HPL ${HPL_VERSION} ==="
 CACHE_DIR="${DIST_DIR}/hpl-${HPL_VERSION}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/hpl-downloads"
 if [ -x "${CACHE_DIR}/bin/xhpl" ]; then
    echo "=== HPL cached, skipping build ==="
    echo "binary: ${CACHE_DIR}/bin/xhpl"
    exit 0
 fi
 mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/bin" "${CACHE_DIR}/lib"
 # ── download HPL source ────────────────────────────────────────────────────────
 HPL_TAR="${DOWNLOAD_CACHE_DIR}/hpl-${HPL_VERSION}.tar.gz"
 DEFAULT_HPL_URLS="
 https://www.netlib.org/benchmark/hpl/hpl-${HPL_VERSION}.tar.gz
 https://fossies.org/linux/privat/hpl-${HPL_VERSION}.tar.gz
 "
 HPL_GIT_URL="${HPL_GIT_URL:-https://github.com/icl-utk-edu/hpl.git}"
 DEFAULT_HPL_GIT_REFS="v${HPL_VERSION} ${HPL_VERSION} main"
 HPL_SOURCE_MODE="tarball"
 # download_to_file URL DEST
 # Fetch URL into DEST, using curl when it is installed and wget otherwise.
 # Returns the exit status of whichever tool ran.
 #
 # The arguments are used through the positional parameters only. POSIX sh has
 # no function-local variables, so copying them into names such as "url" or
 # "out" would overwrite identically named variables of a calling function
 # (a caller that keeps its final destination in "out" would then rename its
 # .part file onto itself).
 download_to_file() {
    if command -v curl >/dev/null 2>&1; then
        curl -fL \
            --connect-timeout 15 \
            --max-time 180 \
            --retry 2 \
            --retry-delay 2 \
            --output "$2" \
            "$1"
        return $?
    fi
    wget \
        --show-progress \
        --tries=2 \
        --timeout=30 \
        -O "$2" \
        "$1"
 }
 # download_hpl_tarball DEST
 # Try each mirror in HPL_URLS (default: DEFAULT_HPL_URLS) in order. Each
 # attempt downloads to DEST.part, which is renamed to DEST on success.
 # Returns 0 as soon as one mirror succeeds, 1 when all of them failed.
 #
 # Variables carry an hpl_tb_ prefix because sh functions share one global
 # namespace: a helper that assigns a common name such as "out" or "url"
 # would otherwise replace this function's destination path before the mv.
 download_hpl_tarball() {
    hpl_tb_dest="$1"
    hpl_tb_part="${hpl_tb_dest}.part"
    hpl_tb_urls="${HPL_URLS:-$DEFAULT_HPL_URLS}"
    rm -f "${hpl_tb_part}"
    for hpl_tb_url in ${hpl_tb_urls}; do
        [ -n "${hpl_tb_url}" ] || continue
        echo "=== trying HPL source: ${hpl_tb_url} ==="
        # Only report success when the rename also worked; a failed mv falls
        # through to the next mirror instead of returning 0 without DEST.
        if download_to_file "${hpl_tb_url}" "${hpl_tb_part}" \
            && mv "${hpl_tb_part}" "${hpl_tb_dest}"; then
            return 0
        fi
        rm -f "${hpl_tb_part}"
        echo "=== failed: ${hpl_tb_url} ==="
    done
    echo "ERROR: failed to download HPL ${HPL_VERSION} from all configured URLs" >&2
    return 1
 }
 # download_hpl_from_git_archive DEST
 # Fallback source: shallow-clone HPL_GIT_URL at each ref in HPL_GIT_REFS
 # (default: DEFAULT_HPL_GIT_REFS) and pack the first clone that succeeds into
 # DEST as a tar.gz whose top-level directory is hpl-${HPL_VERSION}.
 # On success sets HPL_SOURCE_MODE="git" and returns 0; otherwise prints the
 # list of sources that were tried and returns 1.
 # NOTE(review): HPL_SOURCE_MODE is a plain shell variable, so it describes
 # only the current run; nothing in this function records the origin of DEST
 # on disk.
 download_hpl_from_git_archive() {
    out="$1"
    refs="${HPL_GIT_REFS:-$DEFAULT_HPL_GIT_REFS}"
    tmp_root="$(mktemp -d)"
    repo_dir="${tmp_root}/repo"
    archive_dir="${tmp_root}/hpl-${HPL_VERSION}"
    archive_tmp="${out}.part"
    for ref in ${refs}; do
        [ -n "${ref}" ] || continue
        echo "=== trying HPL git source: ${HPL_GIT_URL} ref ${ref} ==="
        # Start every attempt from a clean slate (a failed clone may leave files).
        rm -rf "${repo_dir}" "${archive_dir}" "${archive_tmp}"
        if git clone --depth 1 --branch "${ref}" "${HPL_GIT_URL}" "${repo_dir}"; then
            # Rename the clone so the archive unpacks to hpl-<version>/, the
            # directory name the build step searches for.
            mv "${repo_dir}" "${archive_dir}"
            tar czf "${archive_tmp}" -C "${tmp_root}" "hpl-${HPL_VERSION}"
            mv "${archive_tmp}" "${out}"
            rm -rf "${tmp_root}"
            HPL_SOURCE_MODE="git"
            return 0
        fi
        echo "=== failed git ref: ${ref} ==="
    done
    rm -rf "${tmp_root}" "${archive_tmp}"
    echo "ERROR: failed to obtain HPL ${HPL_VERSION} from all configured sources" >&2
    echo "  looked for cache: ${out}" >&2
    echo "  tarball mirrors: ${HPL_URLS:-$DEFAULT_HPL_URLS}" >&2
    echo "  git fallback: ${HPL_GIT_URL} refs ${refs}" >&2
    echo "  override mirrors with HPL_URLS=\"https://mirror1/...\"" >&2
    echo "  override git refs with HPL_GIT_REFS=\"v${HPL_VERSION} ${HPL_VERSION} main\"" >&2
    return 1
 }
 # Fetch the HPL source unless a cached archive exists, then verify it.
 # An archive produced by the git fallback is a fresh tar of a clone and cannot
 # match HPL_SHA256, so its origin is recorded in a marker file next to it.
 # Without the marker, a later run that reuses the cached archive would start
 # with HPL_SOURCE_MODE="tarball", fail the checksum and delete the archive.
 HPL_GIT_MARKER="${HPL_TAR}.from-git"
 if [ ! -f "${HPL_TAR}" ]; then
    echo "=== downloading HPL ${HPL_VERSION} ==="
    # A marker without its archive is stale; drop it before fetching again.
    rm -f "${HPL_GIT_MARKER}"
    download_hpl_tarball "${HPL_TAR}" || download_hpl_from_git_archive "${HPL_TAR}"
    if [ "${HPL_SOURCE_MODE}" = "git" ]; then
        : > "${HPL_GIT_MARKER}"
    fi
 elif [ -f "${HPL_GIT_MARKER}" ]; then
    # Cached archive came from the git fallback on an earlier run.
    HPL_SOURCE_MODE="git"
 fi
 if [ "${HPL_SOURCE_MODE}" = "tarball" ]; then
    actual_sha="$(sha256sum "${HPL_TAR}" | awk '{print $1}')"
    if [ "${actual_sha}" != "${HPL_SHA256}" ]; then
        echo "ERROR: sha256 mismatch for hpl-${HPL_VERSION}.tar.gz" >&2
        echo "  expected: ${HPL_SHA256}" >&2
        echo "  actual:   ${actual_sha}" >&2
        # Remove the bad download so the next run fetches it again.
        rm -f "${HPL_TAR}"
        exit 1
    fi
    echo "sha256 OK: hpl-${HPL_VERSION}.tar.gz"
 else
    echo "=== HPL source obtained from git fallback; skipping tarball sha256 check ==="
 fi
 # ── download OpenBLAS from Debian 12 apt repo ─────────────────────────────────
 REPO_BASE="https://deb.debian.org/debian/pool/main/o/openblas"
 PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
 OPENBLAS_PKG="libopenblas0-openmp"
 echo "=== fetching Debian 12 Packages.gz ==="
 wget -q -O "${PACKAGES_GZ}" \
    "https://deb.debian.org/debian/dists/bookworm/main/binary-amd64/Packages.gz"
 # lookup_deb PKG
 # Scan the index at ${PACKAGES_GZ} and print one line "<Filename> <SHA256>"
 # for the first stanza whose Package field equals PKG. Prints nothing when
 # the package is not listed.
 lookup_deb() {
    pkg="$1"
    gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" '
        /^Package: / { cur=$2 }
        /^Filename: / { file=$2 }
        /^SHA256: /  { sha=$2 }
        /^$/ {
            # "exit" in a main rule still runs END, so remember that the
            # match was already printed; otherwise it is emitted twice.
            if (cur == pkg) { print file " " sha; found=1; exit }
            cur=""; file=""; sha=""
        }
        END {
            # Covers a final stanza that is not followed by a blank line.
            if (!found && cur == pkg) print file " " sha
        }'
 }
 meta="$(lookup_deb "${OPENBLAS_PKG}")"
 [ -n "$meta" ] || { echo "ERROR: ${OPENBLAS_PKG} not found in Packages.gz"; exit 1; }
 repo_file="$(printf '%s' "$meta" | awk '{print $1}')"
 repo_sha="$(printf '%s'  "$meta" | awk '{print $2}')"
 OPENBLAS_DEB="${DOWNLOAD_CACHE_DIR}/$(basename "${repo_file}")"
 if [ -f "${OPENBLAS_DEB}" ]; then
    actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
    [ "$actual" = "$repo_sha" ] || rm -f "${OPENBLAS_DEB}"
 fi
 if [ ! -f "${OPENBLAS_DEB}" ]; then
    echo "=== downloading ${OPENBLAS_PKG} ==="
    wget --show-progress -O "${OPENBLAS_DEB}" "https://deb.debian.org/debian/${repo_file}"
    actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
    [ "$actual" = "$repo_sha" ] || { echo "ERROR: sha256 mismatch for ${OPENBLAS_PKG}"; rm -f "${OPENBLAS_DEB}"; exit 1; }
 fi
 # extract libopenblas shared libs
 TMP_DEB=$(mktemp -d)
 trap 'rm -rf "${TMP_DEB}" "${BUILD_TMP:-}"' EXIT INT TERM
 (
    cd "${TMP_DEB}"
    ar x "${OPENBLAS_DEB}"
    tar xf data.tar.*
 )
 find "${TMP_DEB}" \( -name 'libopenblas*.so*' \) \( -type f -o -type l \) \
    -exec cp -a {} "${CACHE_DIR}/lib/" \;
 echo "=== OpenBLAS libs: $(ls "${CACHE_DIR}/lib/" | wc -l) files ==="
 # also need libopenblas-dev header for compilation (we only need the .so symlink)
 OPENBLAS_SO="$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libopenblas.so.*' -type f | sort | head -1)"
 [ -n "${OPENBLAS_SO}" ] || { echo "ERROR: libopenblas.so not extracted"; exit 1; }
 SONAME="$(basename "${OPENBLAS_SO}")"
 ln -sf "${SONAME}" "${CACHE_DIR}/lib/libopenblas.so" 2>/dev/null || true
 ln -sf "${SONAME}" "${CACHE_DIR}/lib/libblas.so" 2>/dev/null || true
 # ── build HPL ─────────────────────────────────────────────────────────────────
 BUILD_TMP=$(mktemp -d)
 cd "${BUILD_TMP}"
 tar xf "${HPL_TAR}"
 SRC_DIR="$(find . -maxdepth 1 -type d -name 'hpl-*' | head -1)"
 [ -n "${SRC_DIR}" ] || { echo "ERROR: HPL source dir not found"; exit 1; }
 cd "${SRC_DIR}"
 # Write a minimal single-process MPI stub so we don't need an MPI package.
 # HPL only needs these functions for single-process execution.
 cat > "${BUILD_TMP}/mpi_stub.c" <<'MPISTUB'
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
 typedef int MPI_Comm;
 typedef int MPI_Datatype;
 typedef int MPI_Op;
 typedef int MPI_Status;
 typedef int MPI_Request;
 #define MPI_COMM_WORLD 0
 #define MPI_SUCCESS    0
 #define MPI_DOUBLE     6
 #define MPI_INT        5
 #define MPI_SUM        0
 #define MPI_MAX        1
 #define MPI_MIN        2
 #define MPI_BYTE       1
 #define MPI_ANY_SOURCE -1
 #define MPI_ANY_TAG    -1
 #define MPI_STATUS_IGNORE ((MPI_Status*)0)
 int MPI_Init(int *argc, char ***argv)          { (void)argc; (void)argv; return MPI_SUCCESS; }
 int MPI_Finalize(void)                          { return MPI_SUCCESS; }
 int MPI_Comm_rank(MPI_Comm c, int *rank)        { (void)c; *rank = 0; return MPI_SUCCESS; }
 int MPI_Comm_size(MPI_Comm c, int *size)        { (void)c; *size = 1; return MPI_SUCCESS; }
 int MPI_Bcast(void *b, int n, MPI_Datatype t, int r, MPI_Comm c)
    { (void)b;(void)n;(void)t;(void)r;(void)c; return MPI_SUCCESS; }
 int MPI_Reduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, int root, MPI_Comm c) {
    (void)op;(void)root;(void)c;
    size_t sz = (t==MPI_DOUBLE)?sizeof(double):(t==MPI_INT)?sizeof(int):1;
    memcpy(r, s, (size_t)n * sz);
    return MPI_SUCCESS;
 }
 int MPI_Allreduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, MPI_Comm c)
    { return MPI_Reduce(s,r,n,t,op,0,c); }
 int MPI_Send(const void *b, int n, MPI_Datatype t, int d, int tag, MPI_Comm c)
    { (void)b;(void)n;(void)t;(void)d;(void)tag;(void)c; return MPI_SUCCESS; }
 int MPI_Recv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Status *st)
    { (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)st; return MPI_SUCCESS; }
 int MPI_Sendrecv(const void *sb, int sn, MPI_Datatype st2, int dest, int stag,
                 void *rb, int rn, MPI_Datatype rt, int src, int rtag,
                 MPI_Comm c, MPI_Status *status)
    { (void)sb;(void)sn;(void)st2;(void)dest;(void)stag;
      (void)rb;(void)rn;(void)rt;(void)src;(void)rtag;(void)c;(void)status;
      return MPI_SUCCESS; }
 int MPI_Irecv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Request *req)
    { (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)req; return MPI_SUCCESS; }
 int MPI_Wait(MPI_Request *req, MPI_Status *st)
    { (void)req;(void)st; return MPI_SUCCESS; }
 int MPI_Abort(MPI_Comm c, int code) { (void)c; exit(code); }
 double MPI_Wtime(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
 }
 MPISTUB
 # Write Make.bee — HPL makefile configuration
 cat > Make.bee <<MAKEFILE
 SHELL        = /bin/sh
 CD           = cd
 CP           = cp
 LN_S         = ln -s
 MKDIR        = mkdir -p
 RM           = /bin/rm -f
 TOUCH        = touch
 ARCH         = bee
 # Directories
 TOPdir       = \$(shell pwd)
 INCdir       = \$(TOPdir)/include
 BINdir       = \$(TOPdir)/bin/\$(ARCH)
 LIBdir       = \$(TOPdir)/lib/\$(ARCH)
 HPLlib       = \$(LIBdir)/libhpl.a
 # Compiler
 CC           = gcc
 CCNOOPT      = \$(HPL_DEFS)
 CCFLAGS      = \$(HPL_DEFS) -O3 -march=native -funroll-loops -fomit-frame-pointer
 # Linker
 LINKER       = gcc
 LINKFLAGS    = \$(CCFLAGS)
 # MPI (single-process stub — no actual MPI needed)
 MPdir        =
 MPinc        = -I${BUILD_TMP}
 MPlib        = ${BUILD_TMP}/mpi_stub.o
 # BLAS (OpenBLAS)
 LAdir        = ${CACHE_DIR}/lib
 LAinc        =
 LAlib        = -L\$(LAdir) -Wl,-rpath,/usr/lib -lopenblas
 HPL_OPTS     =
 HPL_DEFS     = \$(HPL_OPTS) -DHPL_CALL_CBLAS
 MAKEFILE
 echo "=== Make.bee written ==="
 # compile MPI stub
 gcc -O2 -c -o "${BUILD_TMP}/mpi_stub.o" "${BUILD_TMP}/mpi_stub.c"
 # build HPL
 echo "=== building HPL ${HPL_VERSION} ==="
 make -j"$(nproc)" arch=bee 2>&1 | tail -20
 XHPL_BIN="bin/bee/xhpl"
 [ -x "${XHPL_BIN}" ] || { echo "ERROR: xhpl not found after build"; exit 1; }
 cp "${XHPL_BIN}" "${CACHE_DIR}/bin/xhpl"
 chmod +x "${CACHE_DIR}/bin/xhpl"
 echo "=== HPL build complete ==="
 echo "binary: ${CACHE_DIR}/bin/xhpl"
 echo "libs:   $(ls "${CACHE_DIR}/lib/")"
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -1148,19 +1148,6 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    echo "=== john injected ==="
 fi
 # --- build HPL (CPU LINPACK) — runs on all variants ---
 run_step "build HPL ${HPL_VERSION}" "80-hpl" \
    sh "${BUILDER_DIR}/build-hpl.sh" "${HPL_VERSION}" "${HPL_SHA256}" "${DIST_DIR}"
 HPL_CACHE="${DIST_DIR}/hpl-${HPL_VERSION}"
 mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee"
 cp "${HPL_CACHE}/bin/xhpl" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
 chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
 chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-hpl" 2>/dev/null || true
 # Inject OpenBLAS runtime libs needed by xhpl
 cp "${HPL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
 echo "=== HPL injected: xhpl + $(ls "${HPL_CACHE}/lib/" | wc -l) OpenBLAS libs ==="
 # --- embed build metadata ---
 mkdir -p "${OVERLAY_STAGE_DIR}/etc"
 BUILD_DATE="$(date +%Y-%m-%d)"
@@ -1193,7 +1180,6 @@ BUILD_DATE=${BUILD_DATE}
 GIT_COMMIT=${GIT_COMMIT}
 DEBIAN_VERSION=${DEBIAN_VERSION}
 DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
 HPL_VERSION=${HPL_VERSION}
 ${GPU_VERSION_LINE}
 EOF
--- a/iso/builder/config/bootloaders/grub-pc/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg
@@ -11,18 +11,18 @@ echo "  Hardware Audit LiveCD"
 echo ""
 menuentry "EASY-BEE" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }
 submenu "EASY-BEE (advanced options) -->" {
    menuentry "EASY-BEE — GSP=off" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
    }
    menuentry "EASY-BEE — KMS (no nomodeset)" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
    }
--- a/iso/builder/config/bootloaders/grub-pc/theme.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/theme.cfg
@@ -1,9 +1,9 @@
 set color_normal=light-gray/black
-set color_highlight=white/dark-gray
+set color_highlight=yellow/black
 if [ -e /boot/grub/splash.png ]; then
    set theme=/boot/grub/live-theme/theme.txt
 else
-    set menu_color_normal=cyan/black
+    set menu_color_normal=yellow/black
-    set menu_color_highlight=white/dark-gray
+    set menu_color_highlight=white/brown
 fi
--- a/iso/builder/config/bootloaders/isolinux/live.cfg.in
+++ b/iso/builder/config/bootloaders/isolinux/live.cfg.in
@@ -3,31 +3,31 @@ label live-@FLAVOUR@-normal
    menu default
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.nvidia.mode=normal
+    append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
 label live-@FLAVOUR@-kms
    menu label EASY-BEE (^graphics/KMS)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
+    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
 label live-@FLAVOUR@-toram
    menu label EASY-BEE (^load to RAM)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ toram bee.nvidia.mode=normal
+    append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
 label live-@FLAVOUR@-gsp-off
    menu label EASY-BEE (^NVIDIA GSP=off)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
 label live-@FLAVOUR@-kms-gsp-off
    menu label EASY-BEE (g^raphics/KMS, GSP=off)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
+    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
 label live-@FLAVOUR@-failsafe
    menu label EASY-BEE (^fail-safe)
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -25,6 +25,7 @@ ensure_bee_console_user() {
 ensure_bee_console_user
 # Enable common bee services
 systemctl enable bee-hpc-tuning.service
 systemctl enable bee-network.service
 systemctl enable bee-preflight.service
 systemctl enable bee-audit.service
@@ -55,6 +56,7 @@ fi
 # nogpu: no GPU services needed
 # Ensure scripts are executable
 chmod +x /usr/local/bin/bee-hpc-tuning  2>/dev/null || true
 chmod +x /usr/local/bin/bee-network.sh  2>/dev/null || true
 chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -82,16 +82,22 @@ glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
 glow = glow.filter(ImageFilter.GaussianBlur(60))
 img = Image.alpha_composite(img.convert('RGBA'), glow)
-font_logo = load_font(MONO_FONT_CANDIDATES, 64)
+TARGET_LOGO_W = 400
 max_chars = max(len(line) for line in ASCII_ART)
 _probe_font = load_font(MONO_FONT_CANDIDATES, 64)
 _probe_cw, _ = mono_metrics(_probe_font)
 font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
 font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
 char_w, char_h = mono_metrics(font_logo)
-logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 8)
+logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
 logo_w, logo_h = logo_mask.size
 logo_x = (W - logo_w) // 2
-logo_y = 270
+logo_y = 380
-shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
+sh_off = max(1, font_size_logo // 6)
-img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
+shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
-img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
+img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
 img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
 img.paste(FG, (logo_x, logo_y), logo_mask)
 font_sub = load_font(SUB_FONT_CANDIDATES, 30)
--- a/iso/overlay/etc/systemd/system/bee-hpc-tuning.service
+++ b/iso/overlay/etc/systemd/system/bee-hpc-tuning.service
@@ -0,0 +1,14 @@
 [Unit]
 Description=Bee: HPC tuning (CPU governor, C-states)
 After=local-fs.target
 Before=bee-nvidia.service bee-audit.service
 [Service]
 Type=oneshot
 ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-hpc-tuning.log /usr/local/bin/bee-hpc-tuning
 StandardOutput=journal
 StandardError=journal
 RemainAfterExit=yes
 [Install]
 WantedBy=multi-user.target
--- a/iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
+++ b/iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
@@ -0,0 +1,110 @@
 #!/bin/sh
 # bee-dcgmproftester-staggered — start one dcgmproftester worker per selected
 # NVIDIA GPU, waiting STAGGER_SECONDS between starts, then wait for all of
 # them and exit non-zero if any worker failed.
 set -eu
 # Defaults; each can be overridden by the matching command-line option.
 # NOTE(review): SECONDS is a special auto-incrementing variable in bash and
 # ksh. If /bin/sh resolves to one of those shells, the value read later in the
 # launch loop also includes the time elapsed since this assignment — confirm
 # the target image uses a shell (e.g. dash) where SECONDS is an ordinary name.
 SECONDS=300
 STAGGER_SECONDS=180
 DEVICES=""
 EXCLUDE=""
 # usage: write the command-line synopsis to stderr and exit with status 2.
 usage() {
    printf '%s\n' "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
 }
 # normalize_list CSV
 # Print CSV with all whitespace removed, empty items dropped, and the
 # remaining items sorted numerically, de-duplicated and re-joined with commas.
 # A missing argument is treated as an empty list (safe under "set -u").
 normalize_list() {
    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
 }
 # contains_csv NEEDLE CSV
 # Succeed when NEEDLE is one of the comma-separated items of CSV.
 # Both sides are wrapped in commas so "1" does not match inside "10".
 # The quoted case pattern compares NEEDLE literally; it is never interpreted
 # as a regular expression, and no external process is started.
 contains_csv() {
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
        *) return 1 ;;
    esac
 }
 # resolve_dcgmproftester
 # Print what "command -v" reports for the first dcgmproftester variant that
 # exists, trying the unversioned name first and then 13, 12, 11.
 # Returns 1 without output when none of them is available.
 resolve_dcgmproftester() {
    for name in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
        command -v "${name}" >/dev/null 2>&1 || continue
        command -v "${name}"
        return 0
    done
    return 1
 }
 # Parse options. Every option takes exactly one value; anything else, or a
 # missing value, prints the synopsis and exits via usage.
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
 done
 PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
 # The pipeline's status is that of paste, so a failing nvidia-smi shows up as
 # an empty list and is reported by the check on the next line.
 ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
 [ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
 DEVICES=$(normalize_list "${DEVICES}")
 EXCLUDE=$(normalize_list "${EXCLUDE}")
 # No explicit --devices means "every GPU nvidia-smi listed".
 SELECTED="${DEVICES}"
 if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
 fi
 # FINAL = SELECTED minus EXCLUDE, kept as a comma-separated list.
 FINAL=""
 for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
 done
 [ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
 # Report the effective configuration as key=value lines.
 echo "loader=dcgmproftester-staggered"
 echo "selected_gpus=${FINAL}"
 echo "stagger_seconds=${STAGGER_SECONDS}"
 TMP_DIR=$(mktemp -d)
 # NOTE(review): the handler only removes the per-GPU log directory; it does
 # not stop background workers or exit on INT/TERM — confirm that the caller
 # terminates the workers itself.
 trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
 GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
 gpu_pos=0
 # Space-separated "pid:gpu:logfile" records, one per started worker.
 WORKERS=""
 for id in $(echo "${FINAL}" | tr ',' ' '); do
    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    # A GPU started earlier runs longer by one stagger interval for every GPU
    # still to be started, so all workers are scheduled to end together and
    # the last GPU runs for exactly SECONDS.
    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( SECONDS + extra_sec ))
    echo "starting gpu ${id} seconds=${gpu_seconds}"
    # Each worker sees only its own GPU; output is captured per GPU and
    # replayed after the worker has finished.
    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
    # No pause after the last GPU has been started.
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
        sleep "${STAGGER_SECONDS}"
    fi
 done
 # Collect the workers in start order; any failure makes the script exit 1.
 status=0
 for spec in ${WORKERS}; do
    # Split "pid:gpu:logfile" with parameter expansion.
    pid=${spec%%:*}
    rest=${spec#*:}
    id=${rest%%:*}
    log=${rest#*:}
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    # Replay the worker log with a GPU prefix; a missing log is not fatal.
    sed "s/^/[gpu ${id}] /" "${log}" || true
 done
 exit "${status}"
--- a/iso/overlay/usr/local/bin/bee-gpu-burn
+++ b/iso/overlay/usr/local/bin/bee-gpu-burn
@@ -2,13 +2,14 @@
 set -eu
 SECONDS=5
 STAGGER_SECONDS=0
 SIZE_MB=0
 DEVICES=""
 EXCLUDE=""
 WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
 usage() {
-    echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
 }
@@ -25,6 +26,7 @@ contains_csv() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
@@ -61,14 +63,18 @@ done
 echo "loader=bee-gpu-burn"
 echo "selected_gpus=${FINAL}"
 echo "stagger_seconds=${STAGGER_SECONDS}"
 export CUDA_DEVICE_ORDER="PCI_BUS_ID"
 TMP_DIR=$(mktemp -d)
 trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
 GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
 gpu_pos=0
 WORKERS=""
 for id in $(echo "${FINAL}" | tr ',' ' '); do
    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    gpu_size_mb="${SIZE_MB}"
    if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
            gpu_size_mb=512
        fi
    fi
-    echo "starting gpu ${id} size=${gpu_size_mb}MB"
+    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( SECONDS + extra_sec ))
    echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
    CUDA_VISIBLE_DEVICES="${id}" \
-        "${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
+        "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
        sleep "${STAGGER_SECONDS}"
    fi
 done
 status=0
--- a/iso/overlay/usr/local/bin/bee-hpc-tuning
+++ b/iso/overlay/usr/local/bin/bee-hpc-tuning
@@ -0,0 +1,41 @@
 #!/bin/sh
 # bee-hpc-tuning — apply HPC tuning for deterministic benchmarking
 # Called by bee-hpc-tuning.service at boot.
 # log MESSAGE... — write the arguments to stdout as a single line prefixed
 # with the "[bee-hpc-tuning]" tag.
 log() { echo "[bee-hpc-tuning] $*"; }
 # ── CPU governor ────────────────────────────────────────────────────────────
 # Set all CPU cores to performance governor via sysfs.
 # cpupower is not available; write directly to scaling_governor.
 # governor_ok counts paths where the write succeeded, governor_fail counts
 # paths where it did not; both drive the summary message below.
 governor_ok=0
 governor_fail=0
 for gov_path in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    # When the glob matches nothing the shell leaves the literal pattern in
    # place, which is not a regular file, so the loop body is skipped.
    [ -f "$gov_path" ] || continue
    if echo performance > "$gov_path" 2>/dev/null; then
        governor_ok=$((governor_ok + 1))
    else
        governor_fail=$((governor_fail + 1))
    fi
 done
 # Summarise the outcome in one log line. Failures are reported as WARN only;
 # this section never aborts the script.
 #   all writes succeeded      -> plain message with the core count
 #   some succeeded, some not  -> WARN with both counts
 #   every write failed        -> WARN with the failure count
 #   no path was found         -> WARN that no scaling_governor path exists
 if [ "$governor_ok" -gt 0 ] && [ "$governor_fail" -eq 0 ]; then
    log "CPU governor set to performance on ${governor_ok} core(s)"
 elif [ "$governor_ok" -gt 0 ]; then
    log "WARN: CPU governor: ${governor_ok} OK, ${governor_fail} failed"
 elif [ "$governor_fail" -gt 0 ]; then
    log "WARN: failed to set CPU governor on ${governor_fail} core(s)"
 else
    log "WARN: no cpufreq scaling_governor paths found (C-state governor or HW-controlled)"
 fi
 # ── Transparent Huge Pages ───────────────────────────────────────────────────
 # Kernel cmdline sets transparent_hugepage=always at boot, but confirm and log.
 # This section is read-only: it reports the sysfs value and never writes to it.
 thp_path=/sys/kernel/mm/transparent_hugepage/enabled
 if [ -f "$thp_path" ]; then
    # The file content is logged verbatim; if the read fails the logged value
    # is empty because cat's error output is discarded.
    current=$(cat "$thp_path" 2>/dev/null)
    log "transparent_hugepage: ${current}"
 else
    log "WARN: transparent_hugepage sysfs path not found"
 fi
 # Final marker line; it is emitted regardless of the warnings logged above.
 log "done"
--- a/iso/overlay/usr/local/bin/bee-hpl
+++ b/iso/overlay/usr/local/bin/bee-hpl
@@ -1,97 +0,0 @@
 #!/bin/sh
 # bee-hpl — run HPL (High Performance LINPACK) with auto-sized problem.
 #
 # Generates HPL.dat based on available RAM, runs xhpl, and prints standard
 # HPL output. The WR... line with Gflops is parsed by the bee audit tool.
 #
 # Usage: bee-hpl [--mem-fraction 0.80] [--nb 256] [--seconds N]
 #
 # --mem-fraction   fraction of total RAM to use for the matrix (default 0.80)
 # --nb             block size; 256 is good for modern CPUs (default 256)
 # --seconds        ignored — HPL runtime is determined by problem size; kept
 #                  for interface compatibility with other bee stress tools
 set -eu
 XHPL="/usr/local/lib/bee/xhpl"
 MEM_FRACTION="0.80"
 NB=256
 # usage — print the option synopsis to stderr and exit with status 2.
 usage() {
    echo "usage: $0 [--mem-fraction 0.80] [--nb 256] [--seconds N]" >&2
    exit 2
 }
 # Parse command-line options. Every option takes exactly one value; an option
 # given without a value, or any unrecognised argument, ends in usage() (exit 2).
 # The value of --seconds is consumed and discarded. Values are stored as given
 # and are not validated here.
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --mem-fraction) [ "$#" -ge 2 ] || usage; MEM_FRACTION="$2"; shift 2 ;;
        --nb)           [ "$#" -ge 2 ] || usage; NB="$2"; shift 2 ;;
        --seconds)      [ "$#" -ge 2 ] || usage; shift 2 ;;  # accepted, ignored
        *) usage ;;
    esac
 done
 # Fail early with exit status 1 when the xhpl binary is missing or not executable.
 [ -x "${XHPL}" ] || { echo "ERROR: xhpl not found at ${XHPL}" >&2; exit 1; }
 # Detect total RAM in bytes
 # The second field of the MemTotal line is taken as a kB count and scaled by 1024.
 TOTAL_KB=$(grep MemTotal /proc/meminfo | awk '{print $2}')
 [ -n "${TOTAL_KB}" ] || { echo "ERROR: cannot read MemTotal from /proc/meminfo" >&2; exit 1; }
 TOTAL_BYTES=$(( TOTAL_KB * 1024 ))
 # N = floor(sqrt(fraction * total_bytes / 8)) rounded down to multiple of NB
 # Use awk for floating-point sqrt
 # NOTE(review): the divisor 8 is presumably the size in bytes of one matrix
 # element (double precision) — confirm.
 # The result is clamped so that N is never smaller than one block (NB).
 N=$(awk -v total="${TOTAL_BYTES}" -v frac="${MEM_FRACTION}" -v nb="${NB}" '
 BEGIN {
    raw = int(sqrt(total * frac / 8.0))
    n   = int(raw / nb) * nb
    if (n < nb) n = nb
    print n
 }')
 # Emit the run parameters as key=value lines on stdout, ahead of xhpl's output.
 # total_ram_mb is derived from the kB figure by integer division.
 echo "loader=bee-hpl"
 echo "total_ram_mb=$(( TOTAL_KB / 1024 ))"
 echo "matrix_n=${N}"
 echo "block_nb=${NB}"
 echo "mem_fraction=${MEM_FRACTION}"
 # Generate HPL.dat in a temp directory and run from there
 # The trap body is single-quoted, so ${RUNDIR} is expanded when the trap
 # fires; the directory is removed on shell exit and on SIGINT/SIGTERM.
 RUNDIR=$(mktemp -d)
 trap 'rm -rf "${RUNDIR}"' EXIT INT TERM
 cat > "${RUNDIR}/HPL.dat" <<DAT
 HPLinpack benchmark input file
 Innovative Computing Laboratory, University of Tennessee
 HPL.out        output file name (if any)
 6              device out (6=stdout, 7=stderr, file)
 1              # of problems sizes (N)
 ${N}           Ns
 1              # of NBs
 ${NB}          NBs
 0              PMAP process mapping (0=Row-,1=Column-major)
 1              # of process grids (P x Q)
 1              Ps
 1              Qs
 16.0           threshold
 1              # of panel fact
 2              PFACTs (0=left, 1=Crout, 2=Right)
 1              # of recursive stopping criterium
 4              NBMINs (>= 1)
 1              # of panels in recursion
 2              NDIVs
 1              # of recursive panel fact.
 1              RFACTs (0=left, 1=Crout, 2=Right)
 1              # of broadcast
 1              BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
 1              # of lookahead depth
 1              DEPTHs (>=0)
 2              SWAP (0=bin-exch,1=long,2=mix)
 64             swapping threshold
 0              L1 in (0=transposed,1=no-transposed) form
 0              U  in (0=transposed,1=no-transposed) form
 1              Equilibration (0=no,1=yes)
 8              memory alignment in double (> 0)
 DAT
 # Switch to the directory that holds the generated HPL.dat before starting xhpl.
 # NOTE(review): xhpl presumably reads HPL.dat from the current directory — confirm.
 cd "${RUNDIR}"
 # Separator between the key=value header printed above and xhpl's own output.
 echo "---"
 # xhpl is the last command of the script.
 "${XHPL}"
--- a/iso/overlay/usr/local/bin/bee-john-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-john-gpu-stress
@@ -2,6 +2,7 @@
 set -eu
 DURATION_SEC=300
 STAGGER_SECONDS=0
 DEVICES=""
 EXCLUDE=""
 FORMAT=""
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
 export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
 usage() {
-    echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
    exit 2
 }
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -170,6 +172,7 @@ done
 echo "loader=john"
 echo "selected_gpus=${FINAL}"
 echo "john_devices=${JOHN_DEVICES}"
 echo "stagger_seconds=${STAGGER_SECONDS}"
 cd "${JOHN_DIR}"
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
 echo "format=${CHOSEN_FORMAT}"
 echo "target_seconds=${DURATION_SEC}"
 echo "slice_seconds=${TEST_SLICE_SECONDS}"
-DEADLINE=$(( $(date +%s) + DURATION_SEC ))
+TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
 _first=1
 pos=0
 for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
    pos=$((pos + 1))
    [ "${_first}" = "1" ] || sleep 3
    _first=0
-    run_john_loop "${opencl_id}" "${DEADLINE}" &
+    extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
    deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
    run_john_loop "${opencl_id}" "${deadline}" &
    pid=$!
    PIDS="${PIDS} ${pid}"
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
        sleep "${STAGGER_SECONDS}"
    fi
 done
 FAIL=0
 for pid in ${PIDS}; do
--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {
 log "kernel: $(uname -r)"
-# Skip if no NVIDIA GPU present (PCI vendor 10de)
+# Skip if no NVIDIA display/compute GPU is present.
-if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
+# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
 have_nvidia_gpu() {
    # Succeeds (status 0) when at least one lspci line has a second field of
    # "0300:" or "0302:" and a third field starting with "10de:"; otherwise
    # returns 1. awk stops at the first match and the END rule converts the
    # "found" flag into the exit status. lspci's stderr is discarded, so if
    # lspci produces no output the function returns 1.
    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
 }
 if ! have_nvidia_gpu; then
    log "no NVIDIA GPU detected — skipping module load"
    exit 0
 fi
--- a/iso/overlay/usr/local/bin/bee-selfheal
+++ b/iso/overlay/usr/local/bin/bee-selfheal
@@ -14,7 +14,7 @@ log() {
 }
 have_nvidia_gpu() {
-    lspci -nn 2>/dev/null | grep -qi '10de:'
+    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
 }
 service_active() {
Author	SHA1	Message	Date
Michael Chus	02e44b1172	Fix USB/RAM status checks; add server model+S/N to dashboard; remove cycles USB Export Drive: lsblk reports TRAN only for whole disks, not partitions (/dev/sdc1). Strip trailing partition digits to get parent disk before transport check. LiveCD in RAM: When RunInstallToRAM copies squashfs to /dev/shm/bee-live/ but bind-mount of /run/live/medium fails (CD-ROM boots), /run/live/medium still shows the CD-ROM fstype. Add fallback: if /dev/shm/bee-live/*.squashfs exists, the data is in RAM — report status OK. Dashboard Hardware Summary: Show server Manufacturer + ProductName as heading and S/N as subline above the component table, sourced from hw.Board (dmidecode system-type data). Validate: Remove Cycles input — always run once. cycles=1 hardcoded in runAllSAT(). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:46:42 +03:00
Michael Chus	2ceaa0d0ca	Include profile and mode in benchmark task names for task list clarity Task names now follow the pattern: NVIDIA Benchmark · <profile> · <mode> [· GPU <indices>] Examples: NVIDIA Benchmark · standard · sequential (GPU 0, RTX 6000 Pro) NVIDIA Benchmark · stability · parallel NVIDIA Benchmark · standard · ramp 1/4 · GPU 0 NVIDIA Benchmark · standard · ramp 2/4 · GPU 0,1 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:36:51 +03:00
Michael Chus	9482ba20a2	Remove NCCL checkbox — auto-enable interconnect step when >1 GPU selected NCCL all_reduce is always attempted when 2+ GPUs are selected; a failure leaves InterconnectScore=0 (no bonus, no penalty) and OverallStatus unaffected. Exposing the checkbox implied NCCL is optional and made a failed run look like a deliberate skip. - Remove benchmark-run-nccl checkbox and its change listener from pages.go - Client sends run_nccl: selected.length > 1 (automatic) - api.go default runNCCL=true is unchanged - Selection note now mentions NCCL automatically for multi-GPU runs Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:33:17 +03:00
Michael Chus	813e2f86a9	Add scalability/ramp-up labeling, ServerPower penalty in scoring, and report improvements - Add RampStep/RampTotal/RampRunID to NvidiaBenchmarkOptions, taskParams, and NvidiaBenchmarkResult so ramp-up steps can be correlated across result.json files - Add ScalabilityScore field to NvidiaBenchmarkResult (placeholder; computed externally by comparing ramp-up step results sharing the same ramp_run_id) - Propagate ramp fields through api.go (generates shared ramp_run_id at spawn time), tasks.go handler, and benchmark.go result population - Apply ServerPower penalty to CompositeScore when IPMI reporting_ratio < 0.75: factor = ratio/0.75, applied per-GPU with a note explaining the reduction - Add finding when server power delta exceeds GPU-reported sum by >25% (non-GPU draw) - Report header now shows ramp step N/M and run ID instead of "parallel" when in ramp mode; shows scalability_score when non-zero Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:30:47 +03:00
Michael Chus	58a6da9b44	Recover power limits and SM count from nvidia-smi -q in enrichGPUInfo When --query-gpu CSV fields fail (exit status 2 on some Blackwell + driver combos), enrichGPUInfoWithMaxClocks now also parses from the verbose nvidia-smi -q output already collected at benchmark start: - Default Power Limit → DefaultPowerLimitW - Current Power Limit → PowerLimitW (fallback) - Multiprocessor Count → MultiprocessorCount Fixes PowerSustainScore=0 on systems where all three CSV query variants fail but nvidia-smi -q succeeds (confirmed on RTX PRO 6000 Blackwell + driver 590.48.01). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:17:56 +03:00
Michael Chus	f4a19c0a00	Add power calibration step to benchmark; fix PowerSustainScore reference Before the per-GPU compute phases, run `dcgmi diag -r targeted_power` for 45 s while collecting nvidia-smi power metrics in parallel. The p95 power per GPU is stored as calibrated_peak_power_w and used as the denominator for PowerSustainScore instead of the hardware default limit, which bee-gpu-burn cannot reach because it is compute-only. Fallback chain: calibrated peak → default limit → enforced limit. If dcgmi is absent or the run fails, calibration is skipped silently. Adjust composite score weights to match the new honest power reference: base 0.35, thermal 0.25, stability 0.25, power 0.15, NCCL bonus 0.10. Power weight reduced (0.20→0.15) because even with a calibrated reference bee-gpu-burn reaches ~60-75% of TDP by design (no concurrent mem stress). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:06:46 +03:00
Michael Chus	9e3dcf9b4d	Record host CPU/RAM config in benchmark results; check CPU load - BenchmarkHostConfig captures CPU model, sockets, cores, threads, and total RAM from /proc/cpuinfo and /proc/meminfo at benchmark start. - BenchmarkCPULoad samples host CPU utilisation every 10 s throughout the GPU steady-state phase (sequential and parallel paths). - Summarises avg/max/p95 and classifies status as ok / high / unstable. - Adds a finding when CPU load is elevated (avg >20% or max >40%) or erratic (stddev >12%), with a plain-English description in the report. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 20:02:04 +03:00
Michael Chus	098e19f760	Add ramp-up mode to NVIDIA GPU benchmark Adds a new checkbox (enabled by default) in the benchmark section. In ramp-up mode N tasks are spawned simultaneously: 1 GPU, then 2, then 3, up to all selected GPUs — each step runs its GPUs in parallel. NCCL runs only on the final step. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 18:34:19 +03:00
Michael Chus	e16d0f34b5	Adjust burn GPU ramp timing by profile	2026-04-12 15:58:30 +03:00
Mikhail Chusavitin	525ed8b8fc	Fix GPU clock lock normalization for Blackwell (clocks.max.* unsupported) clocks.max.graphics / clocks.max.memory CSV fields return exit status 2 on RTX PRO 6000 Blackwell (driver 98.x), causing the entire gpu inventory query to fail and clock lock to be skipped → normalization: partial. Fix: - Add minimal fallback query (index,uuid,name,pci.bus_id,vbios_version, power.limit) that succeeds even without clock fields - Add enrichGPUInfoWithMaxClocks: parses "Max Clocks" section of nvidia-smi -q verbose output to fill MaxGraphicsClockMHz / MaxMemoryClockMHz when CSV fields fail - Move nvidia-smi -q execution before queryBenchmarkGPUInfo so its output is available for clock enrichment immediately after - Tests: cover enrichment and skip-if-populated cases Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 13:33:54 +03:00
Mikhail Chusavitin	4f94ebcb2c	Add HPC tuning: PCIe ASPM off, C-states, performance CPU governor - grub.cfg + isolinux/live.cfg.in: add pcie_aspm=off, intel_idle.max_cstate=1 and processor.max_cstate=1 to all non-failsafe boot entries - bee-hpc-tuning: new script that sets all CPU cores to performance governor via sysfs and logs THP state at boot - bee-hpc-tuning.service: runs before bee-nvidia and bee-audit - 9000-bee-setup.hook.chroot: enable service and mark script executable Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 13:07:32 +03:00
Mikhail Chusavitin	05c1fde233	Warn on PCIe link speed degradation and collect lspci -vvv in techdump - collector/pcie: add applyPCIeLinkSpeedWarning that sets status=Warning and ErrorDescription when current link speed is below maximum negotiated speed (e.g. Gen1 running on a Gen5 slot) - collector/pcie: add pcieLinkSpeedRank helper for Gen string comparison - collector/pcie_filter_test: cover degraded and healthy link speed cases - platform/techdump: collect lspci -vvv → lspci-vvv.txt for LnkCap/LnkSta Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 12:42:17 +03:00
Michael Chus	825ef6b98a	Add USB export drive and LiveCD-in-RAM checks to Runtime Health - schema: add ToRAMStatus and USBExportPath fields to RuntimeHealth - platform/runtime.go: collectToRAMHealth (ok/warning/failed based on IsLiveMediaInRAM + toramActive) and collectUSBExportHealth (scans /proc/mounts + lsblk for writable USB-backed filesystems) - pages.go: add USB Export Drive and LiveCD in RAM rows to the health table Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-11 10:05:27 +03:00
Michael Chus	ba16021cdb	Fix GPU model propagation, export filenames, PSU/service status, and chart perf - nvidia.go: add Name field to nvidiaGPUInfo, include model name in nvidia-smi query, set dev.Model in enrichPCIeWithNVIDIAData - pages.go: fix duplicate GPU count in validate card summary (4 GPU: 4 x … → 4 x … GPU); fix PSU UNKNOWN fallback from hw.PowerSupplies; treat activating/deactivating/reloading service states as OK in Runtime Health - support_bundle.go: use "150405" time format (no colons) for exFAT compat - sat.go / benchmark.go / platform_stress.go / sat_fan_stress.go: remove .tar.gz archive creation from export dirs — export packs everything itself - charts_svg.go: add min-max downsampling (1400 pt cap) for SVG chart perf - benchmark_report.go / sat.go: normalize GPU fallback to "Unknown GPU" Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-11 10:05:27 +03:00
Mikhail Chusavitin	bb1218ddd4	Fix GPU inventory: exclude BMC virtual VGA, show real NVIDIA model names Two issues: 1. BMC/management VGA chips (e.g. Huawei iBMC Hi171x, ASPEED) were included in GPU inventory because shouldIncludePCIeDevice only checked the PCI class, not the device name. Added a name-based filter for known BMC/management patterns when the class is VGA/display/3d. 2. New NVIDIA GPUs (e.g. RTX PRO 6000 Blackwell, device ID 2bb5) showed as "Device 2bb5" because lspci's database lags behind. Added "name" to the nvidia-smi query and use it to override dev.Model during enrichment. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-10 13:57:26 +03:00
Mikhail Chusavitin	65faae8ede	Remove hpl from SAT run-all targets — no backend route exists hpl was listed in baseTargets and stressOnlyTargets but /api/sat/hpl/run was never registered, causing a 405 Method Not Allowed (not valid JSON) error when Validate one by one was triggered in stress mode. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-10 13:30:32 +03:00
Michael Chus	05241f2e0e	Redesign dashboard: split Runtime Health and Hardware Summary - Runtime Health now shows only LiveCD system status (services, tools, drivers, network, CUDA/ROCm) — hardware component rows removed - Hardware Summary now shows server components with readable descriptions (model, count×size) and component-status.json health badges - Add Network Adapters row to Hardware Summary - SFP module static info (vendor, PN, SN, connector, type, wavelength) now collected via ethtool -m regardless of carrier state - PSU statuses from IPMI audit written to component-status.json so PSU badge shows actual status after first audit instead of UNKNOWN Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-09 23:41:23 +03:00
Mikhail Chusavitin	c1690a084b	Fix app tests that mutate global defaults	2026-04-09 15:28:25 +03:00
Mikhail Chusavitin	9481ca2805	Add staged NVIDIA burn ramp-up mode	2026-04-09 15:21:14 +03:00
Mikhail Chusavitin	a78fdadd88	Refine validate and burn profile layout	2026-04-09 15:14:48 +03:00
Mikhail Chusavitin	4ef403898f	Tighten NVIDIA GPU PCI detection	2026-04-09 15:14:48 +03:00
Michael Chus	025548ab3c	UI: amber accents, smaller wallpaper logo, new support bundle name, drop display resolution - Bootloader: GRUB fallback text colors → yellow/brown (amber tone) - CLI charts: all GPU metric series use single amber color (xterm-256 #214) - Wallpaper: logo width scaled to 400 px dynamically, shadow scales with font size - Support bundle: renamed to YYYY-MM-DD (BEE-SP vX.X) SRV_MODEL SRV_SN ToD.tar.gz using dmidecode for server model (spaces→underscores) and serial number - Remove display resolution feature (UI card, API routes, handlers, tests) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 21:37:01 +03:00
Mikhail Chusavitin	e0d94d7f47	Remove HPL from build and audit flows	2026-04-08 10:00:23 +03:00
Mikhail Chusavitin	13899aa864	Drop incompatible HPL git fallback	2026-04-08 09:50:58 +03:00
Mikhail Chusavitin	f345d8a89d	Build HPL serially to avoid upstream make races	2026-04-08 09:47:35 +03:00
Mikhail Chusavitin	4715059ac0	Fix HPL MPI stub header and keep full build logs	2026-04-08 09:45:14 +03:00
Mikhail Chusavitin	0660a40287	Harden HPL builder cache and runtime libs	2026-04-08 09:40:18 +03:00
Mikhail Chusavitin	67369d9b7b	Fix OpenBLAS package lookup in HPL build	2026-04-08 09:32:49 +03:00