Fix USB/RAM status checks; add server model+S/N to dashboard; remove cycles

USB Export Drive: lsblk reports TRAN only for whole disks, not for partitions (e.g. /dev/sdc1). Strip the trailing partition digits to get the parent disk before the transport check. LiveCD in RAM: When RunInstallToRAM copies the squashfs to /dev/shm/bee-live/ but the bind-mount of /run/live/medium fails (as happens on CD-ROM boots), /run/live/medium still shows the CD-ROM fstype. Add a fallback: if /dev/shm/bee-live/*.squashfs exists, the data is in RAM — report status OK. Dashboard Hardware Summary: Show the server Manufacturer + ProductName as a heading and the S/N as a subline above the component table, sourced from hw.Board (dmidecode system-type data). Validate: Remove the Cycles input — always run once; cycles=1 is now hardcoded in runAllSAT(). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Include profile and mode in benchmark task names for task list clarity
2026-04-12 22:46:42 +03:00 · 2026-04-12 22:36:51 +03:00 · 2026-04-12 22:33:17 +03:00 · 2026-04-12 22:30:47 +03:00 · 2026-04-12 22:17:56 +03:00 · 2026-04-12 22:06:46 +03:00
46 changed files with 2332 additions and 570 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -117,7 +117,7 @@ type satRunner interface {
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
-	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
@@ -190,6 +190,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 	}
 	result := collector.Run(runtimeMode)
 	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
+	writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
@@ -566,11 +567,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }

-func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
 }

 func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -926,6 +927,41 @@ func bodyOr(body, fallback string) string {
 	return body
 }

+// writePSUStatusesToDB records PSU statuses collected during audit into the
+// component-status DB so they are visible in the Hardware Summary card.
+// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
+//
+// Every PSU that reports a status is recorded under "psu:<slot>" ("psu:?"
+// when the slot is unknown) together with its error description, if any.
+// The worst status seen is then recorded under "psu:all": "Critical" beats
+// "Warning", and any other value counts as "OK". PSUs with a nil Status are
+// skipped. When no PSU reported a status at all, nothing is recorded: an
+// aggregate "OK" would claim health that was never actually observed.
+func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
+	if db == nil || len(psus) == 0 {
+		return
+	}
+	const source = "audit:ipmi"
+	worstStatus := "OK"
+	recorded := false
+	for _, psu := range psus {
+		if psu.Status == nil {
+			continue
+		}
+		slot := "?"
+		if psu.Slot != nil {
+			slot = *psu.Slot
+		}
+		st := *psu.Status
+		detail := ""
+		if psu.ErrorDescription != nil {
+			detail = *psu.ErrorDescription
+		}
+		db.Record("psu:"+slot, source, st, detail)
+		recorded = true
+		switch st {
+		case "Critical":
+			worstStatus = "Critical"
+		case "Warning":
+			// Never downgrade an already-seen Critical.
+			if worstStatus != "Critical" {
+				worstStatus = "Warning"
+			}
+		}
+	}
+	if !recorded {
+		// No per-PSU data was written, so there is nothing to aggregate.
+		return
+	}
+	db.Record("psu:all", source, worstStatus, "")
+}
+
 func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
 	raw, err := os.ReadFile(path)
 	if err != nil {
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
 	return f.runNvidiaFn(baseDir)
 }

-func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
 	if f.runNvidiaComputeFn != nil {
 		return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
 	}
@@ -542,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
 }

 func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
-	t.Parallel()
-
 	tmp := t.TempDir()
 	oldExportDir := DefaultExportDir
 	DefaultExportDir = tmp
@@ -580,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
 }

 func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
-	t.Parallel()
-
 	tmp := t.TempDir()
 	oldExportDir := DefaultExportDir
 	DefaultExportDir = tmp
@@ -643,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
 }

 func TestRunSATDefaultsToExportDir(t *testing.T) {
-	t.Parallel()
-
 	oldSATBaseDir := DefaultSATBaseDir
 	DefaultSATBaseDir = "/tmp/export/bee-sat"
 	t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -54,7 +54,7 @@ if ! command -v lspci >/dev/null 2>&1; then
  exit 0
 fi
 found=0
-for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
+	for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
  found=1
  echo "=== GPU $gpu ==="
  lspci -s "$gpu" -vv 2>&1 || true
@@ -73,8 +73,13 @@ fi
 	{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
 for d in /sys/bus/pci/devices/*/; do
  vendor=$(cat "$d/vendor" 2>/dev/null)
-  [ "$vendor" = "0x10de" ] || continue
-  dev=$(basename "$d")
+	  [ "$vendor" = "0x10de" ] || continue
+	  class=$(cat "$d/class" 2>/dev/null)
+	  case "$class" in
+	    0x030000|0x030200) ;;
+	    *) continue ;;
+	  esac
+	  dev=$(basename "$d")
  echo "=== $dev ==="
  for f in current_link_speed current_link_width max_link_speed max_link_width; do
    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
@@ -192,7 +197,7 @@ var supportBundleOptionalFiles = []struct {
 	{name: "system/syslog.txt", src: "/var/log/syslog"},
 }

-const supportBundleGlob = "bee-support-*.tar.gz"
+const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"

 func BuildSupportBundle(exportDir string) (string, error) {
 	exportDir = strings.TrimSpace(exportDir)
@@ -206,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}

-	host := sanitizeFilename(hostnameOr("unknown"))
-	ts := time.Now().UTC().Format("20060102-150405")
-	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
+	now := time.Now().UTC()
+	date := now.Format("2006-01-02")
+	tod := now.Format("150405")
+	ver := bundleVersion()
+	model := serverModelForBundle()
+	sn := serverSerialForBundle()
+
+	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
 	if err := os.MkdirAll(stageRoot, 0755); err != nil {
 		return "", err
 	}
@@ -240,7 +250,8 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}

-	archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
+	archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
+	archivePath := filepath.Join(os.TempDir(), archiveName)
 	if err := createSupportTarGz(archivePath, stageRoot); err != nil {
 		return "", err
 	}
@@ -397,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
 	return os.WriteFile(dst, []byte(body.String()), 0644)
 }

+// bundleVersion returns the build version for use in the support-bundle file
+// name. A leading "v" and then a leading "V" are stripped from buildVersion's
+// result; an empty or "unknown" version is reported as "0.0".
+func bundleVersion() string {
+	ver := strings.TrimPrefix(strings.TrimPrefix(buildVersion(), "v"), "V")
+	switch ver {
+	case "", "unknown":
+		return "0.0"
+	}
+	return ver
+}
+
+// serverModelForBundle returns the DMI system "Product Name" for use in the
+// support-bundle file name, with spaces replaced by underscores. It returns
+// "unknown" when dmidecode fails or the field is absent or empty.
+func serverModelForBundle() string {
+	return strings.ReplaceAll(dmiSystemFieldForBundle("Product Name"), " ", "_")
+}
+
+// serverSerialForBundle returns the DMI system "Serial Number" for use in the
+// support-bundle file name. It returns "unknown" when dmidecode fails or the
+// field is absent or empty.
+func serverSerialForBundle() string {
+	return dmiSystemFieldForBundle("Serial Number")
+}
+
+// dmiSystemFieldForBundle runs `dmidecode -t 1` and returns the trimmed value
+// of the first "field: value" line whose key equals field. It returns
+// "unknown" when the command fails, the field is missing, or the value is
+// empty.
+//
+// The value ends up inside a file name, so path separators are replaced with
+// "-". Otherwise a value such as "N/A" would make the archive path point into
+// a directory that does not exist.
+func dmiSystemFieldForBundle(field string) string {
+	raw, err := exec.Command("dmidecode", "-t", "1").Output()
+	if err != nil {
+		return "unknown"
+	}
+	for _, line := range strings.Split(string(raw), "\n") {
+		key, val, ok := strings.Cut(strings.TrimSpace(line), ": ")
+		if !ok || strings.TrimSpace(key) != field {
+			continue
+		}
+		val = strings.TrimSpace(val)
+		if val == "" {
+			return "unknown"
+		}
+		return strings.NewReplacer("/", "-", "\\", "-").Replace(val)
+	}
+	return "unknown"
+}
+
 func buildVersion() string {
 	raw, err := exec.Command("bee", "version").CombinedOutput()
 	if err != nil {
--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
 	defer cancel()
 	return exec.CommandContext(ctx, name, args...).Output()
 }
-
-func interfaceHasCarrier(iface string) bool {
-	raw, err := readNetCarrierFile(iface)
-	if err != nil {
-		return false
-	}
-	return strings.TrimSpace(raw) == "1"
-}
--- a/audit/internal/collector/nic_telemetry.go
+++ b/audit/internal/collector/nic_telemetry.go
@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
 			}
 		}

-		if interfaceHasCarrier(iface) {
-			if out, err := ethtoolModuleQuery(iface); err == nil {
-				if injectSFPDOMTelemetry(&devs[i], out) {
-					enriched++
-					continue
-				}
+		if out, err := ethtoolModuleQuery(iface); err == nil {
+			if injectSFPDOMTelemetry(&devs[i], out) {
+				enriched++
+				continue
 			}
 		}
 		if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
 		}
 		key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
 		val := strings.TrimSpace(trimmed[idx+1:])
+		if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
+			continue
+		}

 		switch {
+		case key == "identifier":
+			s := parseSFPIdentifier(val)
+			dev.SFPIdentifier = &s
+			t := true
+			dev.SFPPresent = &t
+			changed = true
+		case key == "connector":
+			s := parseSFPConnector(val)
+			dev.SFPConnector = &s
+			changed = true
+		case key == "vendor name":
+			s := strings.TrimSpace(val)
+			dev.SFPVendor = &s
+			changed = true
+		case key == "vendor pn":
+			s := strings.TrimSpace(val)
+			dev.SFPPartNumber = &s
+			changed = true
+		case key == "vendor sn":
+			s := strings.TrimSpace(val)
+			dev.SFPSerialNumber = &s
+			changed = true
+		case strings.Contains(key, "laser wavelength"):
+			if f, ok := firstFloat(val); ok {
+				dev.SFPWavelengthNM = &f
+				changed = true
+			}
 		case strings.Contains(key, "module temperature"):
 			if f, ok := firstFloat(val); ok {
 				dev.SFPTemperatureC = &f
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
 	return changed
 }

+// parseSFPIdentifier returns the human-readable transceiver type from the raw
+// ethtool identifier value: the text inside the first pair of parentheses,
+// e.g. "0x03 (SFP)" → "SFP". When there is no non-empty parenthesised text the
+// value is returned unchanged.
+func parseSFPIdentifier(val string) string {
+	name := extractParens(val)
+	if name == "" {
+		return val
+	}
+	return name
+}
+
+// parseSFPConnector returns the connector type from the raw ethtool connector
+// value: the text inside the first pair of parentheses, e.g. "0x07 (LC)" →
+// "LC". When there is no non-empty parenthesised text the value is returned
+// unchanged.
+func parseSFPConnector(val string) string {
+	connector := extractParens(val)
+	if connector == "" {
+		return val
+	}
+	return connector
+}
+
+// parenRe matches the first non-empty run of characters enclosed in
+// parentheses and captures its contents.
+var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
+
+// extractParens returns the whitespace-trimmed text inside the first pair of
+// parentheses in s, or "" when s contains no non-empty parenthesised text.
+func extractParens(s string) string {
+	if groups := parenRe.FindStringSubmatch(s); len(groups) >= 2 {
+		return strings.TrimSpace(groups[1])
+	}
+	return ""
+}
+
 func parseSFPDOM(raw string) map[string]any {
 	dev := schema.HardwarePCIeDevice{}
 	if !injectSFPDOMTelemetry(&dev, raw) {
 		return map[string]any{}
 	}
 	out := map[string]any{}
+	if dev.SFPPresent != nil {
+		out["sfp_present"] = *dev.SFPPresent
+	}
+	if dev.SFPIdentifier != nil {
+		out["sfp_identifier"] = *dev.SFPIdentifier
+	}
+	if dev.SFPConnector != nil {
+		out["sfp_connector"] = *dev.SFPConnector
+	}
+	if dev.SFPVendor != nil {
+		out["sfp_vendor"] = *dev.SFPVendor
+	}
+	if dev.SFPPartNumber != nil {
+		out["sfp_part_number"] = *dev.SFPPartNumber
+	}
+	if dev.SFPSerialNumber != nil {
+		out["sfp_serial_number"] = *dev.SFPSerialNumber
+	}
+	if dev.SFPWavelengthNM != nil {
+		out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
+	}
 	if dev.SFPTemperatureC != nil {
 		out["sfp_temperature_c"] = *dev.SFPTemperatureC
 	}
--- a/audit/internal/collector/nic_telemetry_test.go
+++ b/audit/internal/collector/nic_telemetry_test.go
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
 	readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
 	readNetCarrierFile = func(string) (string, error) { return "0", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
-	ethtoolModuleQuery = func(string) (string, error) {
-		t.Fatal("ethtool -m should not be called without carrier")
-		return "", nil
-	}
+	ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }

 	class := "EthernetController"
 	bdf := "0000:18:00.0"
--- a/audit/internal/collector/nvidia.go
+++ b/audit/internal/collector/nvidia.go
@@ -15,6 +15,7 @@ const nvidiaVendorID = 0x10de
 type nvidiaGPUInfo struct {
 	Index              int
 	BDF                string
+	Name               string
 	Serial             string
 	VBIOS              string
 	TemperatureC       *float64
@@ -73,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 			continue
 		}

+		if v := strings.TrimSpace(info.Name); v != "" {
+			devs[i].Model = &v
+		}
 		if v := strings.TrimSpace(info.Serial); v != "" {
 			devs[i].SerialNumber = &v
 		}
@@ -99,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
 	out, err := exec.Command(
 		"nvidia-smi",
-		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
+		"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
 		"--format=csv,noheader,nounits",
 	).Output()
 	if err != nil {
@@ -123,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		if len(rec) == 0 {
 			continue
 		}
-		if len(rec) < 13 {
-			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
+		if len(rec) < 14 {
+			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
 		}

 		bdf := normalizePCIeBDF(rec[1])
@@ -135,17 +139,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		info := nvidiaGPUInfo{
 			Index:              parseRequiredInt(rec[0]),
 			BDF:                bdf,
-			Serial:             strings.TrimSpace(rec[2]),
-			VBIOS:              strings.TrimSpace(rec[3]),
-			TemperatureC:       parseMaybeFloat(rec[4]),
-			PowerW:             parseMaybeFloat(rec[5]),
-			ECCUncorrected:     parseMaybeInt64(rec[6]),
-			ECCCorrected:       parseMaybeInt64(rec[7]),
-			HWSlowdown:         parseMaybeBool(rec[8]),
-			PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
-			PCIeLinkGenMax:     parseMaybeInt(rec[10]),
-			PCIeLinkWidthCur:   parseMaybeInt(rec[11]),
-			PCIeLinkWidthMax:   parseMaybeInt(rec[12]),
+			Name:               strings.TrimSpace(rec[2]),
+			Serial:             strings.TrimSpace(rec[3]),
+			VBIOS:              strings.TrimSpace(rec[4]),
+			TemperatureC:       parseMaybeFloat(rec[5]),
+			PowerW:             parseMaybeFloat(rec[6]),
+			ECCUncorrected:     parseMaybeInt64(rec[7]),
+			ECCCorrected:       parseMaybeInt64(rec[8]),
+			HWSlowdown:         parseMaybeBool(rec[9]),
+			PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
+			PCIeLinkGenMax:     parseMaybeInt(rec[11]),
+			PCIeLinkWidthCur:   parseMaybeInt(rec[12]),
+			PCIeLinkWidthMax:   parseMaybeInt(rec[13]),
 		}
 		result[bdf] = info
 	}
--- a/audit/internal/collector/nvidia_test.go
+++ b/audit/internal/collector/nvidia_test.go
@@ -6,7 +6,7 @@ import (
 )

 func TestParseNVIDIASMIQuery(t *testing.T) {
-	raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
+	raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
 	byBDF, err := parseNVIDIASMIQuery(raw)
 	if err != nil {
 		t.Fatalf("parse failed: %v", err)
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
 	if !ok {
 		t.Fatalf("gpu by normalized bdf not found")
 	}
+	if gpu.Name != "NVIDIA H100 80GB HBM3" {
+		t.Fatalf("name: got %q", gpu.Name)
+	}
 	if gpu.Serial != "GPU-SERIAL-1" {
 		t.Fatalf("serial: got %q", gpu.Serial)
 	}
--- a/audit/internal/collector/pcie.go
+++ b/audit/internal/collector/pcie.go
@@ -2,6 +2,7 @@ package collector

 import (
 	"bee/audit/internal/schema"
+	"fmt"
 	"log/slog"
 	"os/exec"
 	"strconv"
@@ -79,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
 		}
 	}

+	// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
+	// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
+	if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
+		bmcPatterns := []string{
+			"management system chip",
+			"management controller",
+			"ibmc",
+			"idrac",
+			"ilo vga",
+			"aspeed",
+			"matrox",
+		}
+		for _, bad := range bmcPatterns {
+			if strings.Contains(d, bad) {
+				return false
+			}
+		}
+	}
+
 	if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
 		internalAMDPatterns := []string{
 			"dummy function",
@@ -153,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {

 	// SVendor/SDevice available but not in schema — skip

+	// Warn if PCIe link is running below its maximum negotiated speed.
+	applyPCIeLinkSpeedWarning(&dev)
+
 	return dev
 }

@@ -222,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
 	return value, true
 }

+// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
+// speed is below the maximum link speed, and stores a description naming both
+// speeds in ErrorDescription.
+//
+// The device is left untouched when either speed is missing or is not a
+// recognised "GenN" string. pcieLinkSpeedRank ranks unrecognised values as 0,
+// so without the explicit check an unknown current speed would compare below
+// any valid maximum and raise a false "degraded" warning.
+func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
+	if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
+		return
+	}
+	cur := pcieLinkSpeedRank(*dev.LinkSpeed)
+	limit := pcieLinkSpeedRank(*dev.MaxLinkSpeed)
+	if cur == 0 || limit == 0 || cur >= limit {
+		return
+	}
+	warn := statusWarning
+	dev.Status = &warn
+	desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
+	dev.ErrorDescription = &desc
+}
+
+// pcieLinkSpeedRank maps a normalized generation string to its number:
+// "Gen1" → 1 through "Gen6" → 6. The match is exact and case-sensitive; any
+// other value yields 0.
+func pcieLinkSpeedRank(gen string) int {
+	known := [...]string{"Gen1", "Gen2", "Gen3", "Gen4", "Gen5", "Gen6"}
+	for i, name := range known {
+		if gen == name {
+			return i + 1
+		}
+	}
+	return 0
+}
+
 func normalizePCILinkSpeed(raw string) string {
 	raw = strings.TrimSpace(strings.ToLower(raw))
 	switch {
--- a/audit/internal/collector/pcie_filter_test.go
+++ b/audit/internal/collector/pcie_filter_test.go
@@ -1,6 +1,7 @@
 package collector

 import (
+	"bee/audit/internal/schema"
 	"encoding/json"
 	"strings"
 	"testing"
@@ -29,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
 		{name: "raid", class: "RAID bus controller", want: true},
 		{name: "nvme", class: "Non-Volatile memory controller", want: true},
 		{name: "vga", class: "VGA compatible controller", want: true},
+		{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
+		{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
 		{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
 	}

@@ -139,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
 		}
 	}
 }
+
+// TestApplyPCIeLinkSpeedWarning checks that applyPCIeLinkSpeedWarning turns an
+// OK device into Warning, with an ErrorDescription mentioning the current
+// speed, only when both speeds are present and the current one is lower than
+// the maximum. In every other case the status must not become Warning and
+// ErrorDescription must stay nil.
+func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
+	// ptr returns a pointer to a copy of s, for filling optional string fields.
+	ptr := func(s string) *string { return &s }
+
+	tests := []struct {
+		name        string
+		linkSpeed   *string
+		maxSpeed    *string
+		wantWarning bool
+		wantGenIn   string // substring expected in ErrorDescription when warning
+	}{
+		{
+			name:        "degraded Gen1 vs Gen5",
+			linkSpeed:   ptr("Gen1"),
+			maxSpeed:    ptr("Gen5"),
+			wantWarning: true,
+			wantGenIn:   "Gen1",
+		},
+		{
+			name:        "at max Gen5",
+			linkSpeed:   ptr("Gen5"),
+			maxSpeed:    ptr("Gen5"),
+			wantWarning: false,
+		},
+		{
+			name:        "degraded Gen4 vs Gen5",
+			linkSpeed:   ptr("Gen4"),
+			maxSpeed:    ptr("Gen5"),
+			wantWarning: true,
+			wantGenIn:   "Gen4",
+		},
+		{
+			name:        "missing current speed — no warning",
+			linkSpeed:   nil,
+			maxSpeed:    ptr("Gen5"),
+			wantWarning: false,
+		},
+		{
+			name:        "missing max speed — no warning",
+			linkSpeed:   ptr("Gen1"),
+			maxSpeed:    nil,
+			wantWarning: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Start every case from an OK device so a Warning can only come
+			// from the function under test.
+			dev := schema.HardwarePCIeDevice{}
+			ok := statusOK
+			dev.Status = &ok
+			dev.LinkSpeed = tt.linkSpeed
+			dev.MaxLinkSpeed = tt.maxSpeed
+
+			applyPCIeLinkSpeedWarning(&dev)
+
+			gotWarn := dev.Status != nil && *dev.Status == statusWarning
+			if gotWarn != tt.wantWarning {
+				t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
+			}
+			if tt.wantWarning {
+				if dev.ErrorDescription == nil {
+					t.Fatal("expected ErrorDescription to be set")
+				}
+				if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
+					t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
+				}
+			} else {
+				// Without a warning the description must remain unset.
+				if dev.ErrorDescription != nil {
+					t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
+				}
+			}
+		})
+	}
+}
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"math"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"regexp"
 	"sort"
@@ -108,7 +109,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		ServerModel:        readServerModel(),
 		BenchmarkProfile:   spec.Name,
 		ParallelGPUs:       opts.ParallelGPUs,
+		RampStep:           opts.RampStep,
+		RampTotal:          opts.RampTotal,
+		RampRunID:          opts.RampRunID,
 		SelectedGPUIndices: append([]int(nil), selected...),
+		HostConfig:         readBenchmarkHostConfig(),
 		Normalization: BenchmarkNormalization{
 			Status: "full",
 		},
@@ -121,15 +126,22 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 	var serverIdleOK, serverLoadedOK bool
 	var serverLoadedSamples int

+	// Run nvidia-smi -q first: used both for the log file and as a fallback
+	// source of max clock values when CSV clock fields are unsupported.
+	var nvsmiQOut []byte
+	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
+		nvsmiQOut = out
+		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
+	}
+
 	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
 	if infoErr != nil {
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
 		result.Normalization.Status = "partial"
 	}
-
-	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
-		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
-	}
+	// Enrich with max clocks from verbose output — covers GPUs where
+	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
+	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)

 	activeApps, err := queryActiveComputeApps(selected)
 	if err == nil && len(activeApps) > 0 {
@@ -145,8 +157,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}()

+	// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
+	// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
+	calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc)
+
+	// Start background CPU load sampler — samples every 10s during GPU phases.
+	cpuStopCh := make(chan struct{})
+	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
+
 	if opts.ParallelGPUs {
-		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
+		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
 	} else {

 	for _, idx := range selected {
@@ -166,6 +186,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
+		if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
+			gpuResult.CalibratedPeakPowerW = w
+		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
 			gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -303,6 +326,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}

+	// Stop CPU load sampler and attach results.
+	close(cpuStopCh)
+	if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
+		result.CPULoad = summarizeCPULoad(cpuSamples)
+		if result.CPULoad != nil && result.CPULoad.Status != "ok" {
+			logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
+				result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
+		}
+	}
+
 	// Compute server power characterization from accumulated IPMI samples.
 	var gpuReportedSumW float64
 	for _, gpu := range result.GPUs {
@@ -314,6 +347,20 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 	}
 	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)

+	// Apply server-power penalty when IPMI reports the server delta is much
+	// lower than GPU-reported sum: GPU power telemetry is over-stated, making
+	// CalibratedPeakPowerW and PowerSustainScore unreliable.
+	// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
+	if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
+		factor := sp.ReportingRatio / 0.75
+		for i := range result.GPUs {
+			result.GPUs[i].Scores.CompositeScore *= factor
+			result.GPUs[i].Notes = append(result.GPUs[i].Notes,
+				fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%",
+					sp.ReportingRatio, factor*100))
+		}
+	}
+
 	result.Findings = buildBenchmarkFindings(result)
 	result.OverallStatus = benchmarkOverallStatus(result)

@@ -335,11 +382,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		return "", fmt.Errorf("write summary.txt: %w", err)
 	}

-	archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", fmt.Errorf("pack benchmark archive: %w", err)
-	}
-	return archive, nil
+	return runDir, nil
 }

 func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
@@ -374,9 +417,13 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 // Fields are tried in order; the first successful query wins. Extended fields
 // (attribute.multiprocessor_count, power.default_limit) are not supported on
 // all driver versions, so we fall back to the base set if the full query fails.
+// The minimal fallback omits clock fields entirely — clocks.max.* returns
+// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
+// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
 var benchmarkGPUInfoQueries = []struct {
 	fields   string
 	extended bool // whether this query includes optional extended fields
+	minimal  bool // clock fields omitted; max clocks must be filled separately
 }{
 	{
 		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
@@ -386,6 +433,104 @@ var benchmarkGPUInfoQueries = []struct {
 		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
 		extended: false,
 	},
+	{
+		fields:  "index,uuid,name,pci.bus_id,vbios_version,power.limit",
+		minimal: true,
+	},
+}
+
+// enrichGPUInfoWithMaxClocks backfills fields the CSV query left at zero from
+// the verbose nvidia-smi -q output in nvsmiQ: max graphics/memory clocks,
+// default and current power limit, and multiprocessor count. Values that are
+// already non-zero are kept. This is the fallback for GPUs (e.g. Blackwell)
+// where clocks.max.* CSV fields return exit status 2 but -q works fine.
+func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
+	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
+		return
+	}
+
+	// Build bus_id → index map for matching verbose sections to GPU indices.
+	busToBenchIdx := make(map[string]int, len(infoByIndex))
+	for idx, info := range infoByIndex {
+		if info.BusID != "" {
+			// nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain),
+			// while --query-gpu returns the same format; normalise to lower.
+			busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx
+		}
+	}
+
+	// Split the verbose output into per-GPU sections on "^GPU " lines.
+	gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
+	maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
+	maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
+	defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
+	currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
+	smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
+
+	sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
+	for i, loc := range sectionStarts {
+		busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]]))
+		benchIdx, ok := busToBenchIdx[busID]
+		if !ok {
+			// Bus IDs from verbose output may have a different domain prefix;
+			// try suffix match on the slot portion (XX:XX.X).
+			for k, v := range busToBenchIdx {
+				if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) {
+					benchIdx = v
+					ok = true
+					break
+				}
+			}
+		}
+		if !ok {
+			continue
+		}
+
+		end := len(nvsmiQ)
+		if i+1 < len(sectionStarts) {
+			end = sectionStarts[i+1][0]
+		}
+		section := nvsmiQ[loc[0]:end]
+
+		info := infoByIndex[benchIdx]
+
+		if info.MaxGraphicsClockMHz == 0 {
+			if m := maxGfxRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
+					info.MaxGraphicsClockMHz = v
+				}
+			}
+		}
+		if info.MaxMemoryClockMHz == 0 {
+			if m := maxMemRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
+					info.MaxMemoryClockMHz = v
+				}
+			}
+		}
+		if info.DefaultPowerLimitW == 0 {
+			if m := defaultPwrRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
+					info.DefaultPowerLimitW = v
+				}
+			}
+		}
+		if info.PowerLimitW == 0 {
+			if m := currentPwrRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
+					info.PowerLimitW = v
+				}
+			}
+		}
+		if info.MultiprocessorCount == 0 {
+			if m := smCountRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
+					info.MultiprocessorCount = v
+				}
+			}
+		}
+		infoByIndex[benchIdx] = info
+	}
 }

 func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
@@ -413,9 +558,13 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
 			continue
 		}

+		minFields := 6
+		if !q.minimal {
+			minFields = 9
+		}
 		infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
 		for _, row := range rows {
-			if len(row) < 9 {
+			if len(row) < minFields {
 				continue
 			}
 			idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
@@ -423,24 +572,26 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
 				continue
 			}
 			info := benchmarkGPUInfo{
-				Index:               idx,
-				UUID:                strings.TrimSpace(row[1]),
-				Name:                strings.TrimSpace(row[2]),
-				BusID:               strings.TrimSpace(row[3]),
-				VBIOS:               strings.TrimSpace(row[4]),
-				PowerLimitW:         parseBenchmarkFloat(row[5]),
-				MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
-				MaxMemoryClockMHz:   parseBenchmarkFloat(row[7]),
+				Index:       idx,
+				UUID:        strings.TrimSpace(row[1]),
+				Name:        strings.TrimSpace(row[2]),
+				BusID:       strings.TrimSpace(row[3]),
+				VBIOS:       strings.TrimSpace(row[4]),
+				PowerLimitW: parseBenchmarkFloat(row[5]),
 			}
-			if len(row) >= 9 {
-				info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
-			}
-			if q.extended {
-				if len(row) >= 10 {
-					info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
+			if !q.minimal {
+				info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6])
+				info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7])
+				if len(row) >= 9 {
+					info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
 				}
-				if len(row) >= 11 {
-					info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
+				if q.extended {
+					if len(row) >= 10 {
+						info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
+					}
+					if len(row) >= 11 {
+						info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
+					}
 				}
 			}
 			infoByIndex[idx] = info
@@ -744,14 +895,22 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 			score.ComputeScore += precision.TeraOpsPerSec
 		}
 	}
-	// Use default power limit for sustain score so a manually reduced limit
-	// does not inflate the score. Fall back to enforced limit if default unknown.
-	referencePowerW := gpu.DefaultPowerLimitW
-	if referencePowerW <= 0 {
-		referencePowerW = gpu.PowerLimitW
-	}
-	if referencePowerW > 0 {
-		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100)
+	// PowerSustainScore: measures how close the GPU came to its rated TDP under
+	// a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP.
+	// Penalty applied symmetrically for both under- and over-TDP deviations:
+	//   score = max(0, 100 − |measured − rated| / rated × 100)
+	// Under-TDP → power delivery / cooling issue.
+	// Over-TDP  → power limit not properly enforced / power regulation fault.
+	// Falls back to 0 if calibration was not performed (dcgmi unavailable).
+	{
+		ref := gpu.DefaultPowerLimitW
+		if ref <= 0 {
+			ref = gpu.PowerLimitW
+		}
+		if gpu.CalibratedPeakPowerW > 0 && ref > 0 {
+			deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100
+			score.PowerSustainScore = clampScore(100 - deviationPct)
+		}
 	}
 	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
 	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
@@ -765,7 +924,15 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 }

 func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
-	quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
+	// Weights after introducing calibrated power reference:
+	//   base        0.35 — floor so a GPU that fails all sustain checks still scores
+	//   thermal     0.25 — heaviest: throttle counters are the most reliable signal
+	//   stability   0.25 — clock/power variance matters for reproducibility
+	//   power       0.15 — GPU reaches rated TDP under targeted_power? lower weight
+	//                       because calibration may be absent (dcgmi not installed)
+	//   NCCL bonus  0.10 — interconnect health
+	//   cap         1.10
+	quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
 	if score.InterconnectScore > 0 {
 		quality += 0.10
 	}
@@ -985,16 +1152,57 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 				gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
 			))
 		}
+		// Flag significant TDP deviation (over or under) from calibration.
+		if gpu.CalibratedPeakPowerW > 0 {
+			ref := gpu.DefaultPowerLimitW
+			if ref <= 0 {
+				ref = gpu.PowerLimitW
+			}
+			if ref > 0 {
+				deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100
+				switch {
+				case deviationPct < -10:
+					findings = append(findings, fmt.Sprintf(
+						"GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.",
+						gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref,
+					))
+				case deviationPct > 5:
+					findings = append(findings, fmt.Sprintf(
+						"GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.",
+						gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct,
+					))
+				}
+			}
+		}
 	}
 	if result.Interconnect != nil && result.Interconnect.Supported {
 		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
 	}
+	if cl := result.CPULoad; cl != nil {
+		switch cl.Status {
+		case "high":
+			findings = append(findings, fmt.Sprintf(
+				"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
+				cl.AvgPct, cl.MaxPct,
+			))
+		case "unstable":
+			findings = append(findings, fmt.Sprintf(
+				"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
+				cl.AvgPct, cl.P95Pct,
+			))
+		}
+	}
 	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
 		if sp.ReportingRatio < 0.75 {
 			findings = append(findings, fmt.Sprintf(
-				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
+				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
 				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
 			))
+		} else if sp.ReportingRatio > 1.25 {
+			findings = append(findings, fmt.Sprintf(
+				"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
+				sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100,
+			))
 		}
 	}
 	return dedupeStrings(findings)
@@ -1299,6 +1507,7 @@ func runNvidiaBenchmarkParallel(
 	spec benchmarkProfileSpec,
 	logFunc func(string),
 	result *NvidiaBenchmarkResult,
+	calibPowerByIndex map[int]float64,
 	serverIdleW *float64, serverLoadedWSum *float64,
 	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
 ) {
@@ -1320,6 +1529,9 @@ func runNvidiaBenchmarkParallel(
 			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
+		if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
+			r.CalibratedPeakPowerW = w
+		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
 			r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -1481,3 +1693,225 @@ func runNvidiaBenchmarkParallel(
 		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
 	}
 }
+
+// readBenchmarkHostConfig reads static CPU and memory configuration from
+// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source yields data.
+//
+// From /proc/cpuinfo: CPUModel is the first "model name", CPUThreads counts
+// "processor" entries, CPUSockets counts distinct "physical id" values (1 if
+// none are reported but processors exist) and CPUCores sums one "cpu cores"
+// value per socket, so it stays 0 when no "physical id" lines are present.
+// From /proc/meminfo: MemTotalGiB is the MemTotal value converted to GiB.
+func readBenchmarkHostConfig() *BenchmarkHostConfig {
+	cfg := &BenchmarkHostConfig{}
+	populated := false
+
+	// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
+	if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
+		socketIDs := map[string]struct{}{}
+		coresPerSocket := map[string]int{}
+		var modelName string
+		threads := 0
+		for _, line := range strings.Split(string(data), "\n") {
+			kv := strings.SplitN(line, ":", 2)
+			if len(kv) != 2 {
+				continue
+			}
+			key := strings.TrimSpace(kv[0])
+			val := strings.TrimSpace(kv[1])
+			switch key {
+			case "processor":
+				threads++
+			case "model name":
+				if modelName == "" {
+					modelName = val
+				}
+			case "physical id":
+				socketIDs[val] = struct{}{}
+			}
+		}
+		// Second pass: per-socket core count. "cpu cores" is repeated in every
+		// processor stanza, so only the first parseable value per socket is kept.
+		var curSocket string
+		for _, line := range strings.Split(string(data), "\n") {
+			kv := strings.SplitN(line, ":", 2)
+			if len(kv) != 2 {
+				continue
+			}
+			key := strings.TrimSpace(kv[0])
+			val := strings.TrimSpace(kv[1])
+			switch key {
+			case "physical id":
+				curSocket = val
+			case "cpu cores":
+				if curSocket != "" {
+					if _, seen := coresPerSocket[curSocket]; !seen {
+						if v, err := strconv.Atoi(val); err == nil {
+							coresPerSocket[curSocket] = v
+						}
+					}
+				}
+			}
+		}
+		totalCores := 0
+		for _, c := range coresPerSocket {
+			totalCores += c
+		}
+		cfg.CPUModel = modelName
+		cfg.CPUSockets = len(socketIDs)
+		if cfg.CPUSockets == 0 && threads > 0 {
+			cfg.CPUSockets = 1
+		}
+		cfg.CPUCores = totalCores
+		cfg.CPUThreads = threads
+		if modelName != "" || threads > 0 {
+			populated = true
+		}
+	}
+
+	// Parse /proc/meminfo for total physical RAM.
+	if data, err := os.ReadFile("/proc/meminfo"); err == nil {
+		for _, line := range strings.Split(string(data), "\n") {
+			if strings.HasPrefix(line, "MemTotal:") {
+				fields := strings.Fields(line)
+				if len(fields) >= 2 {
+					if kb, err := strconv.ParseUint(fields[1], 10, 64); err == nil {
+						cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
+						populated = true
+					}
+				}
+				break
+			}
+		}
+	}
+
+	if !populated {
+		return nil
+	}
+	return cfg
+}
+
+// startCPULoadSampler starts a goroutine that calls sampleCPULoadPct every
+// intervalSec seconds (must be > 0, time.NewTicker panics otherwise) and keeps
+// readings above zero; closing stopCh makes it send the samples and exit.
+func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
+	ch := make(chan []float64, 1)
+	go func() {
+		var samples []float64
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- samples
+				return
+			case <-ticker.C:
+				if pct := sampleCPULoadPct(); pct > 0 {
+					samples = append(samples, pct)
+				}
+			}
+		}
+	}()
+	return ch
+}
+
+// summarizeCPULoad returns avg/max/p95 (rounded to 0.1) of samples, or nil when samples is empty.
+// Status is "high" when avg > 20 or max > 40, "unstable" when stddev > 12, otherwise "ok".
+func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
+	if len(samples) == 0 {
+		return nil
+	}
+	sorted := append([]float64(nil), samples...)
+	sort.Float64s(sorted)
+	var sum float64
+	for _, v := range sorted {
+		sum += v
+	}
+	avg := sum / float64(len(sorted))
+	p95 := sorted[int(float64(len(sorted))*0.95)]
+	max := sorted[len(sorted)-1]
+
+	cl := &BenchmarkCPULoad{
+		AvgPct:  math.Round(avg*10) / 10,
+		MaxPct:  math.Round(max*10) / 10,
+		P95Pct:  math.Round(p95*10) / 10,
+		Samples: len(sorted),
+	}
+
+	// Population standard deviation (divides by N) is used to detect instability.
+	var variance float64
+	for _, v := range sorted {
+		d := v - avg
+		variance += d * d
+	}
+	stdDev := math.Sqrt(variance / float64(len(sorted)))
+
+	switch {
+	case avg > 20 || max > 40:
+		cl.Status = "high"
+		cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max)
+	case stdDev > 12:
+		cl.Status = "unstable"
+		cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
+	default:
+		cl.Status = "ok"
+	}
+	return cl
+}
+
+// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test through
+// runBenchmarkCommandWithMetrics and evaluates the metric rows it returns. The
+// result maps each GPU index to the p95 of that GPU's positive power samples
+// (watts); GPUs without any usable sample are left out of the map.
+//
+// If dcgmi is not on PATH or the run fails, the reason is logged and an empty
+// map is returned. Calibration is best-effort: it reports no error, so it
+// cannot fail the main benchmark.
+func runBenchmarkPowerCalibration(
+	ctx context.Context,
+	verboseLog, runDir string,
+	gpuIndices []int,
+	logFunc func(string),
+) map[int]float64 {
+	const calibDurationSec = 45
+
+	// dcgmi must be present.
+	if _, err := exec.LookPath("dcgmi"); err != nil {
+		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
+		return map[int]float64{}
+	}
+
+	logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
+
+	cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
+	out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc)
+	_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
+	if err != nil {
+		logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
+		return map[int]float64{}
+	}
+
+	// Group rows by GPU index and compute p95 over the positive power readings.
+	result := make(map[int]float64, len(gpuIndices))
+	for _, idx := range gpuIndices {
+		perGPU := filterRowsByGPU(rows, idx)
+		if len(perGPU) == 0 {
+			continue
+		}
+		powers := make([]float64, 0, len(perGPU))
+		for _, r := range perGPU {
+			if r.PowerW > 0 {
+				powers = append(powers, r.PowerW)
+			}
+		}
+		if len(powers) == 0 {
+			continue
+		}
+		p95 := benchmarkPercentile(powers, 95)
+		if p95 > 0 {
+			result[idx] = p95
+			logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers)))
+		}
+	}
+	return result
+}
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -60,9 +60,17 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "**App version:** %s  \n", result.BenchmarkVersion)
 	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
-	if result.ParallelGPUs {
+	if result.RampStep > 0 && result.RampTotal > 0 {
+		fmt.Fprintf(&b, "**Ramp-up step:** %d of %d  \n", result.RampStep, result.RampTotal)
+		if result.RampRunID != "" {
+			fmt.Fprintf(&b, "**Ramp-up run ID:** %s  \n", result.RampRunID)
+		}
+	} else if result.ParallelGPUs {
 		fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously)  \n")
 	}
+	if result.ScalabilityScore > 0 {
+		fmt.Fprintf(&b, "**Scalability score:** %.1f%%  \n", result.ScalabilityScore)
+	}
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
 	b.WriteString("\n")

@@ -90,7 +98,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 	for _, gpu := range result.GPUs {
 		name := strings.TrimSpace(gpu.Name)
 		if name == "" {
-			name = "Unknown"
+			name = "Unknown GPU"
 		}
 		interconnect := "-"
 		if gpu.Scores.InterconnectScore > 0 {
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -178,3 +178,67 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
 		t.Fatalf("report should not contain ANSI escapes\n%s", report)
 	}
 }
+
+func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
+	t.Parallel()
+	// GPU 0's fixture also has a current "Clocks" section; only "Max Clocks" values may be used.
+	nvsmiQ := []byte(`
+GPU 00000000:4E:00.0
+    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
+    Clocks
+        Graphics                          : 2422 MHz
+        Memory                            : 12481 MHz
+    Max Clocks
+        Graphics                          : 2430 MHz
+        SM                                : 2430 MHz
+        Memory                            : 12481 MHz
+        Video                             : 2107 MHz
+
+GPU 00000000:4F:00.0
+    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
+    Max Clocks
+        Graphics                          : 2430 MHz
+        Memory                            : 12481 MHz
+`)
+
+	infoByIndex := map[int]benchmarkGPUInfo{
+		0: {Index: 0, BusID: "00000000:4E:00.0"},
+		1: {Index: 1, BusID: "00000000:4F:00.0"},
+	}
+
+	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+
+	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
+		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
+	}
+	if infoByIndex[0].MaxMemoryClockMHz != 12481 {
+		t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
+	}
+	if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
+		t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
+	}
+	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
+		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
+	}
+}
+
+func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
+	t.Parallel()
+
+	nvsmiQ := []byte(`
+GPU 00000000:4E:00.0
+    Max Clocks
+        Graphics                          : 9999 MHz
+        Memory                            : 9999 MHz
+`)
+	// Both clocks are already populated — neither may be replaced by the 9999 MHz fixture values.
+	infoByIndex := map[int]benchmarkGPUInfo{
+		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
+	}
+
+	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+
+	if got := infoByIndex[0]; got.MaxGraphicsClockMHz != 2430 || got.MaxMemoryClockMHz != 12481 {
+		t.Errorf("expected existing values to be preserved, got graphics=%v memory=%v", got.MaxGraphicsClockMHz, got.MaxMemoryClockMHz)
+	}
+}
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -2,6 +2,29 @@ package platform

 import "time"

+// BenchmarkHostConfig holds static CPU and memory configuration of the host, as
+// filled in by readBenchmarkHostConfig. Useful for correlating runs across hardware.
+type BenchmarkHostConfig struct {
+	CPUModel    string  `json:"cpu_model,omitempty"`     // first "model name" entry in /proc/cpuinfo
+	CPUSockets  int     `json:"cpu_sockets,omitempty"`   // distinct "physical id" values; 1 if none are reported
+	CPUCores    int     `json:"cpu_cores,omitempty"`     // sum of "cpu cores" over sockets; 0 if unknown
+	CPUThreads  int     `json:"cpu_threads,omitempty"`   // number of "processor" entries
+	MemTotalGiB float64 `json:"mem_total_gib,omitempty"` // MemTotal from /proc/meminfo, converted to GiB
+}
+
+// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
+// steady-state phase. High or unstable CPU load during a GPU benchmark may
+// indicate a competing workload or a CPU-bound driver bottleneck.
+type BenchmarkCPULoad struct {
+	AvgPct  float64 `json:"avg_pct"` // mean of the samples, in percent
+	MaxPct  float64 `json:"max_pct"` // largest sample, in percent
+	P95Pct  float64 `json:"p95_pct"` // 95th-percentile sample, in percent
+	Samples int     `json:"samples"` // number of samples summarised
+	// Status is "ok", "high", or "unstable".
+	Status string `json:"status"`
+	Note   string `json:"note,omitempty"` // explanation; summarizeCPULoad sets it only when Status is not "ok"
+}
+
 const (
 	NvidiaBenchmarkProfileStandard  = "standard"
 	NvidiaBenchmarkProfileStability = "stability"
@@ -14,7 +37,10 @@ type NvidiaBenchmarkOptions struct {
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	RunNCCL           bool
-	ParallelGPUs      bool // run all selected GPUs simultaneously instead of sequentially
+	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
+	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
+	RampTotal         int    // total number of ramp-up steps in this run
+	RampRunID         string // shared identifier across all steps of the same ramp-up run
 }


@@ -25,11 +51,17 @@ type NvidiaBenchmarkResult struct {
 	ServerModel        string                       `json:"server_model,omitempty"`
 	BenchmarkProfile   string                       `json:"benchmark_profile"`
 	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
+	RampStep           int                          `json:"ramp_step,omitempty"`
+	RampTotal          int                          `json:"ramp_total,omitempty"`
+	RampRunID          string                       `json:"ramp_run_id,omitempty"`
+	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
 	OverallStatus      string                       `json:"overall_status"`
 	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
 	Findings           []string                     `json:"findings,omitempty"`
 	Warnings           []string                     `json:"warnings,omitempty"`
 	Normalization      BenchmarkNormalization       `json:"normalization"`
+	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
+	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
 	GPUs               []BenchmarkGPUResult         `json:"gpus"`
 	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
 	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
@@ -63,6 +95,11 @@ type BenchmarkGPUResult struct {
 	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
 	MultiprocessorCount    int                        `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW     float64                    `json:"default_power_limit_w,omitempty"`
+	// CalibratedPeakPowerW is the p95 power measured during a short
+	// dcgmi targeted_power calibration run before the main benchmark.
+	// PowerSustainScore compares it against the rated limit (default power
+	// limit, else the enforced limit); zero means no calibration data.
+	CalibratedPeakPowerW   float64                    `json:"calibrated_peak_power_w,omitempty"`
 	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
 	BaseGraphicsClockMHz   float64                    `json:"base_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -383,10 +383,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 }

 const (
-	ansiRed    = "\033[31m"
-	ansiBlue   = "\033[34m"
-	ansiGreen  = "\033[32m"
-	ansiYellow = "\033[33m"
+	ansiAmber  = "\033[38;5;214m"
 	ansiReset  = "\033[0m"
 )

@@ -415,10 +412,10 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
 		fn      func(GPUMetricRow) float64
 	}
 	defs := []seriesDef{
-		{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
-		{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
-		{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
-		{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
+		{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
+		{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
+		{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
+		{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
 	}

 	var b strings.Builder
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -14,9 +14,17 @@ import (
 func (s *System) IsLiveMediaInRAM() bool {
 	fsType := mountFSType("/run/live/medium")
 	if fsType == "" {
+		// No medium mount at all — fall back to toram kernel parameter.
 		return toramActive()
 	}
-	return strings.EqualFold(fsType, "tmpfs")
+	if strings.EqualFold(fsType, "tmpfs") {
+		return true
+	}
+	// When RunInstallToRAM copies squashfs to /dev/shm/bee-live but the bind
+	// mount of /run/live/medium fails (common for CD-ROM boots), the medium
+	// fstype still shows the CD-ROM type. Check whether the RAM copy exists.
+	files, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
+	return len(files) > 0
 }

 func (s *System) LiveBootSource() LiveBootSource {
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 			"--seconds", strconv.Itoa(opts.DurationSec),
 			"--size-mb", strconv.Itoa(opts.SizeMB),
 		}
+		if opts.StaggerSeconds > 0 && len(selected) > 1 {
+			cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
+		}
 		if len(selected) > 0 {
 			cmd = append(cmd, "--devices", joinIndexList(selected))
 		}
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 			"bee-john-gpu-stress",
 			"--seconds", strconv.Itoa(opts.DurationSec),
 		}
+		if opts.StaggerSeconds > 0 && len(selected) > 1 {
+			cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
+		}
 		if len(selected) > 0 {
 			cmd = append(cmd, "--devices", joinIndexList(selected))
 		}
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
 	}
 	_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)

-	// Pack tar.gz
-	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
-	if err := packPlatformDir(runDir, archivePath); err != nil {
-		return "", fmt.Errorf("pack archive: %w", err)
-	}
-	_ = os.RemoveAll(runDir)
-	return archivePath, nil
+	return runDir, nil
 }

 // collectPhase samples live metrics every second until ctx is done.
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -1,6 +1,7 @@
 package platform

 import (
+	"bufio"
 	"os"
 	"os/exec"
 	"strings"
@@ -114,6 +115,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
 	}

 	s.collectGPURuntimeHealth(vendor, &health)
+	s.collectToRAMHealth(&health)
+	s.collectUSBExportHealth(&health)

 	if health.Status != "FAILED" && len(health.Issues) > 0 {
 		health.Status = "PARTIAL"
@@ -168,6 +171,96 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
 	return ToolStatus{Name: display}
 }

+// collectToRAMHealth checks whether the LiveCD ISO has been copied to RAM.
+// Status values: "ok" = in RAM, "warning" = toram not active (no copy attempted),
+// "failed" = toram was requested but medium is not in RAM (copy failed or in progress).
+func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
+	inRAM := s.IsLiveMediaInRAM()
+	active := toramActive()
+	switch {
+	case inRAM:
+		health.ToRAMStatus = "ok"
+	case active:
+		// toram was requested but medium is not yet/no longer in RAM
+		health.ToRAMStatus = "failed"
+		health.Issues = append(health.Issues, schema.RuntimeIssue{
+			Code:        "toram_copy_failed",
+			Severity:    "warning",
+			Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
+		})
+	default:
+		health.ToRAMStatus = "warning"
+	}
+}
+
+// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
+// suitable for log export. Sets USBExportPath to the first match found.
+func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
+	health.USBExportPath = findUSBExportMount()
+}
+
+// findUSBExportMount returns the mount point of the first writable filesystem
+// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs, ntfs3, fuseblk) whose backing
+// block device has USB transport. Returns "" if none found or /proc/mounts is unreadable.
+func findUSBExportMount() string {
+	f, err := os.Open("/proc/mounts")
+	if err != nil {
+		return ""
+	}
+	defer f.Close()
+
+	// fs types that are expected on USB export drives
+	exportFSTypes := map[string]bool{
+		"vfat":  true,
+		"exfat": true,
+		"ext2":  true,
+		"ext3":  true,
+		"ext4":  true,
+		"ntfs":  true,
+		"ntfs3": true,
+		"fuseblk": true,
+	}
+
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		// fields: device mountpoint fstype options dump pass
+		fields := strings.Fields(scanner.Text())
+		if len(fields) < 4 {
+			continue
+		}
+		device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
+		if !exportFSTypes[strings.ToLower(fsType)] {
+			continue
+		}
+		// Skip read-only mounts
+		opts := strings.Split(options, ",")
+		readOnly := false
+		for _, o := range opts {
+			if strings.TrimSpace(o) == "ro" {
+				readOnly = true
+				break
+			}
+		}
+		if readOnly {
+			continue
+		}
+		// Check USB transport via lsblk on the device (or its parent disk for partitions).
+		if !strings.HasPrefix(device, "/dev/") {
+			continue
+		}
+		checkDev := device
+		// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
+		// Strip trailing digits to get the parent disk; this only suits sdXN-style names (not nvme0n1p1).
+		if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
+			checkDev = trimmed
+		}
+		if blockDeviceTransport(checkDev) == "usb" {
+			return mountPoint
+		}
+	}
+	return ""
+}
+
 func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
 	lsmodText := commandText("lsmod")

--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -384,25 +384,39 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 	), logFunc)
 }

-func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	selected, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
-	profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
-	if err != nil {
-		return "", err
+	var (
+		profCmd []string
+		profEnv []string
+	)
+	if staggerSec > 0 && len(selected) > 1 {
+		profCmd = []string{
+			"bee-dcgmproftester-staggered",
+			"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
+			"--stagger-seconds", strconv.Itoa(staggerSec),
+			"--devices", joinIndexList(selected),
+		}
+	} else {
+		profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
+		if err != nil {
+			return "", err
+		}
+		profEnv = nvidiaVisibleDevicesEnv(selected)
 	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
-		satJob{
-			name:       "03-dcgmproftester.log",
-			cmd:        profCmd,
-			env:        nvidiaVisibleDevicesEnv(selected),
-			collectGPU: true,
-			gpuIndices: selected,
-		},
+			satJob{
+				name:       "03-dcgmproftester.log",
+				cmd:        profCmd,
+				env:        profEnv,
+				collectGPU: true,
+				gpuIndices: selected,
+			},
 		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
 	), logFunc)
 }
@@ -648,11 +662,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
 	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
 		return "", err
 	}
-	archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", err
-	}
-	return archive, nil
+	return runDir, nil
 }

 type satJob struct {
@@ -838,11 +848,7 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		}
 	}

-	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", err
-	}
-	return archive, nil
+	return runDir, nil
 }

 func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
@@ -905,7 +911,7 @@ func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPU
 			entry.Health = "UNKNOWN"
 		}
 		if entry.Name == "" {
-			entry.Name = "unknown"
+			entry.Name = "Unknown GPU"
 		}
 		var body strings.Builder
 		fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -223,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
 		return "", err
 	}

-	archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
-	if err := createTarGz(archive, runDir); err != nil {
-		return "", err
-	}
-	return archive, nil
+	return runDir, nil
 }

 func applyFanStressDefaults(opts *FanStressOptions) {
--- a/audit/internal/platform/techdump.go
+++ b/audit/internal/platform/techdump.go
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
 	{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
 	{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
 	{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
+	{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
 	{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
 	{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
 	{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
 	Loader            string
 	GPUIndices        []int
 	ExcludeGPUIndices []int
+	StaggerSeconds    int
 }

 func New() *System {
--- a/audit/internal/schema/hardware.go
+++ b/audit/internal/schema/hardware.go
@@ -22,6 +22,10 @@ type RuntimeHealth struct {
 	CUDAReady     bool                   `json:"cuda_ready,omitempty"`
 	NvidiaGSPMode string                 `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
 	NetworkStatus string                 `json:"network_status,omitempty"`
+	// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
+	ToRAMStatus   string `json:"toram_status,omitempty"`
+	// USBExportPath: mount point of the first writable USB drive found, empty if none.
+	USBExportPath string `json:"usb_export_path,omitempty"`
 	Issues        []RuntimeIssue         `json:"issues,omitempty"`
 	Tools         []RuntimeToolStatus    `json:"tools,omitempty"`
 	Services      []RuntimeServiceStatus `json:"services,omitempty"`
@@ -183,6 +187,13 @@ type HardwarePCIeDevice struct {
 	BatteryTemperatureC    *float64       `json:"battery_temperature_c,omitempty"`
 	BatteryVoltageV        *float64       `json:"battery_voltage_v,omitempty"`
 	BatteryReplaceRequired *bool          `json:"battery_replace_required,omitempty"`
+	SFPPresent             *bool          `json:"sfp_present,omitempty"`
+	SFPIdentifier          *string        `json:"sfp_identifier,omitempty"`
+	SFPConnector           *string        `json:"sfp_connector,omitempty"`
+	SFPVendor              *string        `json:"sfp_vendor,omitempty"`
+	SFPPartNumber          *string        `json:"sfp_part_number,omitempty"`
+	SFPSerialNumber        *string        `json:"sfp_serial_number,omitempty"`
+	SFPWavelengthNM        *float64       `json:"sfp_wavelength_nm,omitempty"`
 	SFPTemperatureC        *float64       `json:"sfp_temperature_c,omitempty"`
 	SFPTXPowerDBM          *float64       `json:"sfp_tx_power_dbm,omitempty"`
 	SFPRXPowerDBM          *float64       `json:"sfp_rx_power_dbm,omitempty"`
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -12,6 +12,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"sort"
+	"strconv"
 	"strings"
 	"sync/atomic"
 	"syscall"
@@ -209,6 +210,14 @@ func joinTaskIndices(indices []int) string {
 	return strings.Join(parts, ",")
 }

+func formatGPUIndexList(indices []int) string {
+	parts := make([]string, len(indices))
+	for i, idx := range indices {
+		parts[i] = strconv.Itoa(idx)
+	}
+	return strings.Join(parts, ",")
+}
+
 func formatSplitTaskName(baseName, selectionLabel string) string {
 	baseName = strings.TrimSpace(baseName)
 	selectionLabel = strings.TrimSpace(selectionLabel)
@@ -482,12 +491,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 			return
 		}

-		var body struct {
-			Duration           int      `json:"duration"`
-			StressMode         bool     `json:"stress_mode"`
-			GPUIndices         []int    `json:"gpu_indices"`
-			ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
-			Loader             string   `json:"loader"`
+			var body struct {
+				Duration           int      `json:"duration"`
+				StressMode         bool     `json:"stress_mode"`
+				GPUIndices         []int    `json:"gpu_indices"`
+				ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
+				StaggerGPUStart    bool     `json:"stagger_gpu_start"`
+				Loader             string   `json:"loader"`
 			Profile            string   `json:"profile"`
 			DisplayName        string   `json:"display_name"`
 			PlatformComponents []string `json:"platform_components"`
@@ -503,12 +513,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 		if strings.TrimSpace(body.DisplayName) != "" {
 			name = body.DisplayName
 		}
-		params := taskParams{
-			Duration:           body.Duration,
-			StressMode:         body.StressMode,
-			GPUIndices:         body.GPUIndices,
-			ExcludeGPUIndices:  body.ExcludeGPUIndices,
-			Loader:             body.Loader,
+			params := taskParams{
+				Duration:           body.Duration,
+				StressMode:         body.StressMode,
+				GPUIndices:         body.GPUIndices,
+				ExcludeGPUIndices:  body.ExcludeGPUIndices,
+				StaggerGPUStart:    body.StaggerGPUStart,
+				Loader:             body.Loader,
 			BurnProfile:        body.Profile,
 			DisplayName:        body.DisplayName,
 			PlatformComponents: body.PlatformComponents,
@@ -538,6 +549,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
 		RunNCCL           *bool  `json:"run_nccl"`
 		ParallelGPUs      *bool  `json:"parallel_gpus"`
+		RampUp            *bool  `json:"ramp_up"`
 		DisplayName       string `json:"display_name"`
 	}
 	if r.Body != nil {
@@ -555,10 +567,82 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 	if body.ParallelGPUs != nil {
 		parallelGPUs = *body.ParallelGPUs
 	}
+	rampUp := false
+	if body.RampUp != nil {
+		rampUp = *body.RampUp
+	}
+	// Build a descriptive base name that includes profile and mode so the task
+	// list is self-explanatory without opening individual task detail pages.
+	profile := strings.TrimSpace(body.Profile)
+	if profile == "" {
+		profile = "standard"
+	}
 	name := taskDisplayName("nvidia-benchmark", "", "")
 	if strings.TrimSpace(body.DisplayName) != "" {
 		name = body.DisplayName
 	}
+	// Append profile tag.
+	name = fmt.Sprintf("%s · %s", name, profile)
+
+	if rampUp && len(body.GPUIndices) > 1 {
+		// Ramp-up mode: resolve GPU list, then create one task per prefix
+		// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1]; each task runs its GPUs in parallel.
+		gpus, err := apiListNvidiaGPUs(h.opts.App)
+		if err != nil {
+			writeError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
+		if err != nil {
+			writeError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		if len(resolved) < 2 {
+			// Fall through to normal single-task path.
+			rampUp = false
+		} else {
+			now := time.Now()
+			rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
+			var allTasks []*Task
+			for step := 1; step <= len(resolved); step++ {
+				subset := resolved[:step]
+				stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
+				t := &Task{
+					ID:        newJobID("benchmark-nvidia"),
+					Name:      stepName,
+					Target:    "nvidia-benchmark",
+					Priority:  15,
+					Status:    TaskPending,
+					CreatedAt: now,
+					params: taskParams{
+						GPUIndices:       append([]int(nil), subset...),
+						SizeMB:           body.SizeMB,
+						BenchmarkProfile: body.Profile,
+						RunNCCL:          runNCCL && step == len(resolved),
+						ParallelGPUs:     true,
+						RampStep:         step,
+						RampTotal:        len(resolved),
+						RampRunID:        rampRunID,
+						DisplayName:      stepName,
+					},
+				}
+				allTasks = append(allTasks, t)
+			}
+			for _, t := range allTasks {
+				globalQueue.enqueue(t)
+			}
+			writeTaskRunResponse(w, allTasks)
+			return
+		}
+	}
+
+	// For non-ramp tasks append mode tag.
+	if parallelGPUs {
+		name = fmt.Sprintf("%s · parallel", name)
+	} else {
+		name = fmt.Sprintf("%s · sequential", name)
+	}
+
 	tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
 		GPUIndices:        body.GPUIndices,
 		ExcludeGPUIndices: body.ExcludeGPUIndices,
@@ -1376,107 +1460,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
 	return nil
 }

-// ── Display / Screen Resolution ───────────────────────────────────────────────
-
-type displayMode struct {
-	Output  string `json:"output"`
-	Mode    string `json:"mode"`
-	Current bool   `json:"current"`
-}
-
-type displayInfo struct {
-	Output  string        `json:"output"`
-	Modes   []displayMode `json:"modes"`
-	Current string        `json:"current"`
-}
-
-var xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
-var xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
-var xrandrCurrentRE = regexp.MustCompile(`\*`)
-
-func parseXrandrOutput(out string) []displayInfo {
-	var infos []displayInfo
-	var cur *displayInfo
-	for _, line := range strings.Split(out, "\n") {
-		if m := xrandrOutputRE.FindStringSubmatch(line); m != nil {
-			if cur != nil {
-				infos = append(infos, *cur)
-			}
-			cur = &displayInfo{Output: m[1]}
-			continue
-		}
-		if cur == nil {
-			continue
-		}
-		if m := xrandrModeRE.FindStringSubmatch(line); m != nil {
-			isCurrent := xrandrCurrentRE.MatchString(line)
-			mode := displayMode{Output: cur.Output, Mode: m[1], Current: isCurrent}
-			cur.Modes = append(cur.Modes, mode)
-			if isCurrent {
-				cur.Current = m[1]
-			}
-		}
-	}
-	if cur != nil {
-		infos = append(infos, *cur)
-	}
-	return infos
-}
-
-func xrandrCommand(args ...string) *exec.Cmd {
-	cmd := exec.Command("xrandr", args...)
-	env := append([]string{}, os.Environ()...)
-	hasDisplay := false
-	hasXAuthority := false
-	for _, kv := range env {
-		if strings.HasPrefix(kv, "DISPLAY=") && strings.TrimPrefix(kv, "DISPLAY=") != "" {
-			hasDisplay = true
-		}
-		if strings.HasPrefix(kv, "XAUTHORITY=") && strings.TrimPrefix(kv, "XAUTHORITY=") != "" {
-			hasXAuthority = true
-		}
-	}
-	if !hasDisplay {
-		env = append(env, "DISPLAY=:0")
-	}
-	if !hasXAuthority {
-		env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
-	}
-	cmd.Env = env
-	return cmd
-}
-
-func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
-	out, err := xrandrCommand().Output()
-	if err != nil {
-		writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
-		return
-	}
-	writeJSON(w, parseXrandrOutput(string(out)))
-}
-
-func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
-	var req struct {
-		Output string `json:"output"`
-		Mode   string `json:"mode"`
-	}
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
-		writeError(w, http.StatusBadRequest, "output and mode are required")
-		return
-	}
-	// Validate mode looks like WxH to prevent injection
-	if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
-		writeError(w, http.StatusBadRequest, "invalid mode format")
-		return
-	}
-	// Validate output name (no special chars)
-	if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
-		writeError(w, http.StatusBadRequest, "invalid output name")
-		return
-	}
-	if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
-		writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
-		return
-	}
-	writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
-}
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -10,30 +10,6 @@ import (
 	"bee/audit/internal/platform"
 )

-func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
-	t.Setenv("DISPLAY", "")
-	t.Setenv("XAUTHORITY", "")
-
-	cmd := xrandrCommand("--query")
-
-	var hasDisplay bool
-	var hasXAuthority bool
-	for _, kv := range cmd.Env {
-		if kv == "DISPLAY=:0" {
-			hasDisplay = true
-		}
-		if kv == "XAUTHORITY=/home/bee/.Xauthority" {
-			hasXAuthority = true
-		}
-	}
-	if !hasDisplay {
-		t.Fatalf("DISPLAY not injected: %v", cmd.Env)
-	}
-	if !hasXAuthority {
-		t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
-	}
-}
-
 func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -83,6 +83,10 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
 		}
 	}

+	// Downsample to at most ~1400 points (one per pixel) before building SVG.
+	times, datasets = downsampleTimeSeries(times, datasets, 1400)
+	pointCount = len(times)
+
 	statsLabel := chartStatsLabel(datasets)

 	legendItems := []metricChartSeries{}
@@ -196,6 +200,19 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
 		}
 	}

+	// Downsample to at most ~1400 points before building SVG.
+	{
+		datasets := make([][]float64, len(series))
+		for i := range series {
+			datasets[i] = series[i].Values
+		}
+		times, datasets = downsampleTimeSeries(times, datasets, 1400)
+		pointCount = len(times)
+		for i := range series {
+			series[i].Values = datasets[i]
+		}
+	}
+
 	scales := make([]chartScale, len(series))
 	for i := range series {
 		min, max := chartSeriesBounds(series[i].Values)
@@ -626,6 +643,87 @@ func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end
 	b.WriteString(`</g>` + "\n")
 }

+// downsampleTimeSeries reduces the time series to at most maxPts points using
+// min-max bucketing. Each bucket contributes the index of its min and max value
+// (using the first full-length dataset as the reference series). All parallel
+// datasets are sampled at those same indices so all series stay aligned.
+// If len(times) <= maxPts or maxPts <= 0 the inputs are returned unchanged; datasets whose length differs from len(times) are passed through as-is.
+func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
+	n := len(times)
+	if n <= maxPts || maxPts <= 0 {
+		return times, datasets
+	}
+	buckets := maxPts / 2
+	if buckets < 1 {
+		buckets = 1
+	}
+	// Use the first dataset that has the same length as times as the reference
+	// for deciding which two indices to keep per bucket.
+	var ref []float64
+	for _, ds := range datasets {
+		if len(ds) == n {
+			ref = ds
+			break
+		}
+	}
+	selected := make([]int, 0, maxPts)
+	bucketSize := float64(n) / float64(buckets)
+	for b := 0; b < buckets; b++ {
+		lo := int(math.Round(float64(b) * bucketSize))
+		hi := int(math.Round(float64(b+1) * bucketSize))
+		if hi > n {
+			hi = n
+		}
+		if lo >= hi {
+			continue
+		}
+		if ref == nil {
+			selected = append(selected, lo)
+			if hi-1 != lo {
+				selected = append(selected, hi-1)
+			}
+			continue
+		}
+		minIdx, maxIdx := lo, lo
+		for i := lo + 1; i < hi; i++ {
+			if ref[i] < ref[minIdx] {
+				minIdx = i
+			}
+			if ref[i] > ref[maxIdx] {
+				maxIdx = i
+			}
+		}
+		if minIdx <= maxIdx {
+			selected = append(selected, minIdx)
+			if maxIdx != minIdx {
+				selected = append(selected, maxIdx)
+			}
+		} else {
+			selected = append(selected, maxIdx)
+			if minIdx != maxIdx {
+				selected = append(selected, minIdx)
+			}
+		}
+	}
+	outTimes := make([]time.Time, len(selected))
+	for i, idx := range selected {
+		outTimes[i] = times[idx]
+	}
+	outDatasets := make([][]float64, len(datasets))
+	for d, ds := range datasets {
+		if len(ds) != n {
+			outDatasets[d] = ds
+			continue
+		}
+		out := make([]float64, len(selected))
+		for i, idx := range selected {
+			out[i] = ds[idx]
+		}
+		outDatasets[d] = out
+	}
+	return outTimes, outDatasets
+}
+
 func chartXForTime(ts, start, end time.Time, left, right int) float64 {
 	if !end.After(start) {
 		return float64(left+right) / 2
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -317,106 +317,326 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
 	if err != nil {
 		return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
 	}
-	// Parse just enough fields for the summary banner
-	var snap struct {
-		Summary struct {
-			CPU     struct{ Model string }
-			Memory  struct{ TotalGB float64 }
-			Storage []struct{ Device, Model, Size string }
-			GPUs    []struct{ Model string }
-			PSUs    []struct{ Model string }
-		}
-		Network struct {
-			Interfaces []struct {
-				Name  string
-				IPv4  []string
-				State string
-			}
-		}
-	}
-	// Try to extract top-level fields loosely
-	var raw map[string]json.RawMessage
-	if err := json.Unmarshal(data, &raw); err != nil {
+	var ingest schema.HardwareIngestRequest
+	if err := json.Unmarshal(data, &ingest); err != nil {
 		return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
 	}
-	_ = snap
+	hw := ingest.Hardware

-	// Also load runtime-health for badges
-	type componentHealth struct {
-		FailCount int `json:"fail_count"`
-		WarnCount int `json:"warn_count"`
+	var records []app.ComponentStatusRecord
+	if db, err := app.OpenComponentStatusDB(filepath.Join(opts.ExportDir, "component-status.json")); err == nil {
+		records = db.All()
 	}
-	type healthSummary struct {
-		CPU     componentHealth `json:"cpu"`
-		Memory  componentHealth `json:"memory"`
-		Storage componentHealth `json:"storage"`
-		GPU     componentHealth `json:"gpu"`
-		PSU     componentHealth `json:"psu"`
-		Network componentHealth `json:"network"`
-	}
-	var health struct {
-		HardwareHealth healthSummary `json:"hardware_health"`
-	}
-	if hdata, herr := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")); herr == nil {
-		_ = json.Unmarshal(hdata, &health)
-	}
-
-	badge := func(h componentHealth) string {
-		if h.FailCount > 0 {
-			return `<span class="badge badge-err">FAIL</span>`
-		}
-		if h.WarnCount > 0 {
-			return `<span class="badge badge-warn">WARN</span>`
-		}
-		return `<span class="badge badge-ok">OK</span>`
-	}
-
-	// Extract readable strings from raw JSON
-	getString := func(key string) string {
-		v, ok := raw[key]
-		if !ok {
-			return ""
-		}
-		var s string
-		if err := json.Unmarshal(v, &s); err == nil {
-			return s
-		}
-		return ""
-	}
-
-	cpuModel := getString("cpu_model")
-	memStr := getString("memory_summary")
-	gpuSummary := getString("gpu_summary")

 	var b strings.Builder
 	b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
+
+	// Server identity block above the component table.
+	{
+		var model, serial string
+		parts := []string{}
+		if hw.Board.Manufacturer != nil && strings.TrimSpace(*hw.Board.Manufacturer) != "" {
+			parts = append(parts, strings.TrimSpace(*hw.Board.Manufacturer))
+		}
+		if hw.Board.ProductName != nil && strings.TrimSpace(*hw.Board.ProductName) != "" {
+			parts = append(parts, strings.TrimSpace(*hw.Board.ProductName))
+		}
+		if len(parts) > 0 {
+			model = strings.Join(parts, " ")
+		}
+		serial = strings.TrimSpace(hw.Board.SerialNumber)
+		if model != "" || serial != "" {
+			b.WriteString(`<div style="margin-bottom:14px">`)
+			if model != "" {
+				fmt.Fprintf(&b, `<div style="font-size:16px;font-weight:700;margin-bottom:2px">%s</div>`, html.EscapeString(model))
+			}
+			if serial != "" {
+				fmt.Fprintf(&b, `<div style="font-size:12px;color:var(--muted)">S/N: %s</div>`, html.EscapeString(serial))
+			}
+			b.WriteString(`</div>`)
+		}
+	}
+
 	b.WriteString(`<table style="width:auto">`)
 	writeRow := func(label, value, badgeHTML string) {
-		b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
+		b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
 			html.EscapeString(label), html.EscapeString(value), badgeHTML))
 	}
-	if cpuModel != "" {
-		writeRow("CPU", cpuModel, badge(health.HardwareHealth.CPU))
-	} else {
-		writeRow("CPU", "—", badge(health.HardwareHealth.CPU))
+
+	cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
+	writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
+
+	memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
+	writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
+
+	storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
+	writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
+
+	gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
+	writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
+
+	psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
+	if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
+		psuRow.Status = hwPSUStatus(hw.PowerSupplies)
 	}
-	if memStr != "" {
-		writeRow("Memory", memStr, badge(health.HardwareHealth.Memory))
-	} else {
-		writeRow("Memory", "—", badge(health.HardwareHealth.Memory))
+	writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
+
+	if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
+		writeRow("Network", nicDesc, "")
 	}
-	if gpuSummary != "" {
-		writeRow("GPU", gpuSummary, badge(health.HardwareHealth.GPU))
-	} else {
-		writeRow("GPU", "—", badge(health.HardwareHealth.GPU))
-	}
-	writeRow("Storage", "—", badge(health.HardwareHealth.Storage))
-	writeRow("PSU", "—", badge(health.HardwareHealth.PSU))
+
 	b.WriteString(`</table>`)
 	b.WriteString(`</div></div>`)
 	return b.String()
 }

+// hwDescribeCPU returns a human-readable CPU summary, e.g. "2× Intel Xeon Gold 6338".
+func hwDescribeCPU(hw schema.HardwareSnapshot) string {
+	counts := map[string]int{}
+	order := []string{}
+	for _, cpu := range hw.CPUs {
+		model := "Unknown CPU"
+		if cpu.Model != nil && *cpu.Model != "" {
+			model = *cpu.Model
+		}
+		if counts[model] == 0 {
+			order = append(order, model)
+		}
+		counts[model]++
+	}
+	if len(order) == 0 {
+		return "—"
+	}
+	parts := make([]string, 0, len(order))
+	for _, m := range order {
+		if counts[m] > 1 {
+			parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
+		} else {
+			parts = append(parts, m)
+		}
+	}
+	return strings.Join(parts, ", ")
+}
+
+// hwDescribeMemory returns a summary like "16× 32 GB DDR4".
+func hwDescribeMemory(hw schema.HardwareSnapshot) string {
+	type key struct {
+		sizeMB int
+		typ    string
+	}
+	counts := map[key]int{}
+	order := []key{}
+	for _, dimm := range hw.Memory {
+		if dimm.SizeMB == nil || *dimm.SizeMB == 0 {
+			continue
+		}
+		t := ""
+		if dimm.Type != nil {
+			t = *dimm.Type
+		}
+		k := key{*dimm.SizeMB, t}
+		if counts[k] == 0 {
+			order = append(order, k)
+		}
+		counts[k]++
+	}
+	if len(order) == 0 {
+		return "—"
+	}
+	parts := make([]string, 0, len(order))
+	for _, k := range order {
+		gb := k.sizeMB / 1024
+		desc := fmt.Sprintf("%d× %d GB", counts[k], gb)
+		if k.typ != "" {
+			desc += " " + k.typ
+		}
+		parts = append(parts, desc)
+	}
+	return strings.Join(parts, ", ")
+}
+
+// hwDescribeStorage returns a summary like "4× 3.8 TB NVMe, 2× 1.9 TB SATA" (sizes >= 1000 GB are shown in TB with two significant digits).
+func hwDescribeStorage(hw schema.HardwareSnapshot) string {
+	type key struct {
+		sizeGB int
+		iface  string
+	}
+	counts := map[key]int{}
+	order := []key{}
+	for _, disk := range hw.Storage {
+		sz := 0
+		if disk.SizeGB != nil {
+			sz = *disk.SizeGB
+		}
+		iface := ""
+		if disk.Interface != nil {
+			iface = *disk.Interface
+		} else if disk.Type != nil {
+			iface = *disk.Type
+		}
+		k := key{sz, iface}
+		if counts[k] == 0 {
+			order = append(order, k)
+		}
+		counts[k]++
+	}
+	if len(order) == 0 {
+		return "—"
+	}
+	parts := make([]string, 0, len(order))
+	for _, k := range order {
+		var sizeStr string
+		if k.sizeGB >= 1000 {
+			sizeStr = fmt.Sprintf("%.2g TB", float64(k.sizeGB)/1000)
+		} else if k.sizeGB > 0 {
+			sizeStr = fmt.Sprintf("%d GB", k.sizeGB)
+		} else {
+			sizeStr = "?"
+		}
+		desc := fmt.Sprintf("%d× %s", counts[k], sizeStr)
+		if k.iface != "" {
+			desc += " " + k.iface
+		}
+		parts = append(parts, desc)
+	}
+	return strings.Join(parts, ", ")
+}
+
+// hwDescribeGPU returns a summary like "8× NVIDIA H100 80GB".
+func hwDescribeGPU(hw schema.HardwareSnapshot) string {
+	counts := map[string]int{}
+	order := []string{}
+	for _, dev := range hw.PCIeDevices {
+		if dev.DeviceClass == nil {
+			continue
+		}
+		if !isGPUDeviceClass(*dev.DeviceClass) {
+			continue
+		}
+		model := "Unknown GPU"
+		if dev.Model != nil && *dev.Model != "" {
+			model = *dev.Model
+		}
+		if counts[model] == 0 {
+			order = append(order, model)
+		}
+		counts[model]++
+	}
+	if len(order) == 0 {
+		return "—"
+	}
+	parts := make([]string, 0, len(order))
+	for _, m := range order {
+		if counts[m] > 1 {
+			parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
+		} else {
+			parts = append(parts, m)
+		}
+	}
+	return strings.Join(parts, ", ")
+}
+
+// hwPSUStatus returns "OK", "CRITICAL", "WARNING", or "UNKNOWN" based on
+// PSU statuses from the audit snapshot. Used as fallback when component-status.json
+// has no psu: records yet (e.g. first boot before audit writes them).
+func hwPSUStatus(psus []schema.HardwarePowerSupply) string {
+	worst := "UNKNOWN"
+	for _, psu := range psus {
+		if psu.Status == nil {
+			continue
+		}
+		switch strings.ToUpper(strings.TrimSpace(*psu.Status)) {
+		case "CRITICAL":
+			return "CRITICAL"
+		case "WARNING":
+			if worst != "CRITICAL" {
+				worst = "WARNING"
+			}
+		case "OK":
+			if worst == "UNKNOWN" {
+				worst = "OK"
+			}
+		}
+	}
+	return worst
+}
+
+// hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU".
+func hwDescribePSU(hw schema.HardwareSnapshot) string {
+	n := len(hw.PowerSupplies)
+	if n == 0 {
+		return "—"
+	}
+	// Try to get a consistent wattage
+	watt := 0
+	consistent := true
+	for _, psu := range hw.PowerSupplies {
+		if psu.WattageW == nil {
+			consistent = false
+			break
+		}
+		if watt == 0 {
+			watt = *psu.WattageW
+		} else if *psu.WattageW != watt {
+			consistent = false
+			break
+		}
+	}
+	if consistent && watt > 0 {
+		return fmt.Sprintf("%d× %d W", n, watt)
+	}
+	return fmt.Sprintf("%d× PSU", n)
+}
+
+// hwDescribeNIC returns a summary like "2× Mellanox ConnectX-6".
+func hwDescribeNIC(hw schema.HardwareSnapshot) string {
+	counts := map[string]int{}
+	order := []string{}
+	for _, dev := range hw.PCIeDevices {
+		isNIC := false
+		if dev.DeviceClass != nil {
+			c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
+			isNIC = c == "ethernetcontroller" || c == "networkcontroller" || strings.Contains(c, "fibrechannel")
+		}
+		if !isNIC && len(dev.MacAddresses) == 0 {
+			continue
+		}
+		model := ""
+		if dev.Model != nil && *dev.Model != "" {
+			model = *dev.Model
+		} else if dev.Manufacturer != nil && *dev.Manufacturer != "" {
+			model = *dev.Manufacturer + " NIC"
+		} else {
+			model = "NIC"
+		}
+		if counts[model] == 0 {
+			order = append(order, model)
+		}
+		counts[model]++
+	}
+	if len(order) == 0 {
+		return ""
+	}
+	parts := make([]string, 0, len(order))
+	for _, m := range order {
+		if counts[m] > 1 {
+			parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
+		} else {
+			parts = append(parts, m)
+		}
+	}
+	return strings.Join(parts, ", ")
+}
+
+func isGPUDeviceClass(class string) bool {
+	switch strings.TrimSpace(class) {
+	case "VideoController", "DisplayController", "ProcessingAccelerator":
+		return true
+	default:
+		return false
+	}
+}
+
 func renderAuditModal() string {
 	return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
  <div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:1100px;width:min(1100px,92vw);max-height:92vh;overflow:auto;position:relative">
@@ -481,8 +701,9 @@ func renderHealthCard(opts HandlerOptions) string {
 		buildRuntimeAccelerationRow(health),
 		buildRuntimeToolsRow(health),
 		buildRuntimeServicesRow(health),
+		buildRuntimeUSBExportRow(health),
+		buildRuntimeToRAMRow(health),
 	}
-	rows = append(rows, buildHardwareComponentRows(opts.ExportDir)...)
 	b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`)
 	for _, row := range rows {
 		b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`)
@@ -578,7 +799,13 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
 	nonActive := make([]string, 0)
 	for _, svc := range health.Services {
 		state := strings.TrimSpace(strings.ToLower(svc.Status))
-		if state != "active" {
+		// "activating", "deactivating" and "reloading" are transient states (seen
+		// e.g. for oneshot services with RemainAfterExit=yes) — they are not failures.
+		// Any other non-"active" state, such as "failed" or "inactive", is reported as a problem.
+		switch state {
+		case "active", "activating", "deactivating", "reloading":
+			// OK — service is running or transitioning normally
+		default:
 			nonActive = append(nonActive, svc.Name+"="+svc.Status)
 		}
 	}
@@ -591,6 +818,51 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
 	return runtimeHealthRow{Title: "Bee Services", Status: status, Source: "ServiceState", Issue: issue}
 }

+func buildRuntimeUSBExportRow(health schema.RuntimeHealth) runtimeHealthRow { // builds the "USB Export Drive" row: OK when health.USBExportPath is non-blank, WARNING with a hint otherwise
+	path := strings.TrimSpace(health.USBExportPath) // a whitespace-only path counts as not set
+	if path != "" {
+		return runtimeHealthRow{
+			Title:  "USB Export Drive",
+			Status: "OK",
+			Source: "/proc/mounts + lsblk",
+			Issue:  path, // in the OK case the Issue column carries the export path
+		}
+	}
+	return runtimeHealthRow{
+		Title:  "USB Export Drive",
+		Status: "WARNING",
+		Source: "/proc/mounts + lsblk",
+		Issue:  "No writable USB drive mounted. Plug in a USB drive to enable log export.",
+	}
+}
+
+func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow { // builds the "LiveCD in RAM" row from health.ToRAMStatus: "ok" gives OK, "failed" gives FAILED, anything else gives WARNING
+	switch strings.ToLower(strings.TrimSpace(health.ToRAMStatus)) { // status is matched trimmed and case-insensitively
+	case "ok":
+		return runtimeHealthRow{
+			Title:  "LiveCD in RAM",
+			Status: "OK",
+			Source: "live-boot / /proc/mounts",
+			Issue:  "",
+		}
+	case "failed":
+		return runtimeHealthRow{
+			Title:  "LiveCD in RAM",
+			Status: "FAILED",
+			Source: "live-boot / /proc/mounts",
+			Issue:  "toram boot parameter set but ISO is not mounted from RAM. Copy may have failed.",
+		}
+	default:
+		// any other status, including empty: toram not active — ISO still on original boot media (USB/CD)
+		return runtimeHealthRow{
+			Title:  "LiveCD in RAM",
+			Status: "WARNING",
+			Source: "live-boot / /proc/mounts",
+			Issue:  "ISO not copied to RAM. Use \u201cCopy to RAM\u201d to free the boot drive and improve performance.",
+		}
+	}
+}
+
 func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
 	path := filepath.Join(exportDir, "component-status.json")
 	db, err := app.OpenComponentStatusDB(path)
@@ -1031,25 +1303,23 @@ func renderValidate(opts HandlerOptions) string {
 	return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

-<div class="card" style="margin-bottom:16px">
-  <div class="card-head">Validate Profile</div>
-  <div class="card-body validate-profile-body">
-    <div class="validate-profile-col">
-      <div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
-      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
-      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
-      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
-    </div>
-    <div class="validate-profile-col validate-profile-action">
-      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
-      <button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
-    </div>
-    <div class="validate-profile-col"></div>
-  </div>
-  <div class="card-body" style="padding-top:0;display:flex;justify-content:center">
-    <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
-  </div>
-</div>
+	<div class="card" style="margin-bottom:16px">
+	  <div class="card-head">Validate Profile</div>
+	  <div class="card-body validate-profile-body">
+	    <div class="validate-profile-col">
+	      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
+	    </div>
+	    <div class="validate-profile-col validate-profile-action">
+	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
+	      <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
+	      <div style="margin-top:12px">
+	        <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
+	      </div>
+	    </div>
+	  </div>
+	</div>

 <div class="grid3">
 ` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
@@ -1085,22 +1355,16 @@ func renderValidate(opts HandlerOptions) string {
      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
    </div>
    <p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
-    <div style="margin-top:10px;padding-top:10px;border-top:1px solid var(--border)">
-      <label class="sat-gpu-row" title="When checked, multi-GPU tests (PSU Pulse, NCCL, NVBandwidth) run on ALL GPUs in the system regardless of the selection above.">
-        <input type="checkbox" id="sat-multi-gpu-all" checked onchange="satUpdateGPUSelectionNote()">
-        <span><strong>Multi-GPU tests</strong> — use all GPUs <span style="font-size:11px;color:var(--muted)">(PSU Pulse, NCCL, NVBandwidth)</span></span>
-      </label>
-    </div>
  </div>
 </div>

 <div class="grid3">
 ` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
-			inv.NVIDIA,
-			`Runs NVIDIA diagnostics and board inventory checks.`,
-			`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
-			`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
-		)) +
+		inv.NVIDIA,
+		`Runs NVIDIA diagnostics and board inventory checks.`,
+		`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
+		`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
+	)) +
 		`<div id="sat-card-nvidia-targeted-stress">` +
 		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
 			inv.NVIDIA,
@@ -1156,7 +1420,7 @@ func renderValidate(opts HandlerOptions) string {
 </div>
 <style>
 .validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
-.validate-profile-col { min-width:0; }
+.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
 .validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
 .validate-card-body { padding:0; }
 .validate-card-section { padding:12px 16px 0; }
@@ -1188,7 +1452,7 @@ function satModeChanged() {
  });
 }
 function satLabels() {
-  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
 }
 let satNvidiaGPUsPromise = null;
 function loadSatNvidiaGPUs() {
@@ -1209,10 +1473,6 @@ function satSelectedGPUIndices() {
    .filter(function(v) { return !Number.isNaN(v); })
    .sort(function(a, b) { return a - b; });
 }
-function satMultiGPUAll() {
-  const cb = document.getElementById('sat-multi-gpu-all');
-  return cb ? cb.checked : true;
-}
 function satUpdateGPUSelectionNote() {
  const note = document.getElementById('sat-gpu-selection-note');
  if (!note) return;
@@ -1221,8 +1481,7 @@ function satUpdateGPUSelectionNote() {
    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
    return;
  }
-  const multiAll = satMultiGPUAll();
-  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests: ' + (multiAll ? 'all GPUs in system' : 'selected GPUs only') + '.';
+  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
 }
 function satRenderGPUList(gpus) {
  const root = document.getElementById('sat-gpu-list');
@@ -1336,15 +1595,8 @@ const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targete
 // pulse_test and fabric tests run on all selected GPUs simultaneously
 const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
 function satAllGPUIndicesForMulti() {
-  // If "Multi-GPU tests — all GPUs" is checked, return all detected GPUs.
-  // Otherwise fall back to the per-GPU selection.
-  if (satMultiGPUAll()) {
-    return loadSatNvidiaGPUs().then(function(gpus) {
-      return gpus.map(function(g) { return Number(g.index); });
-    });
-  }
-  const sel = satSelectedGPUIndices();
-  return Promise.resolve(sel);
+  // Multi-GPU tests always use the current GPU selection.
+  return Promise.resolve(satSelectedGPUIndices());
 }
 function expandSATTarget(target) {
  if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
@@ -1434,7 +1686,7 @@ function runAMDValidateSet() {
  return runNext(0);
 }
 function runAllSAT() {
-  const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
+  const cycles = 1;
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
@@ -1612,6 +1864,11 @@ func formatValidateDeviceSummary(total int, models map[string]int, unit string)
 	if total != 1 {
 		label += "s"
 	}
+	// If there is only one model the leading count duplicates the per-model
+	// count already in parts (e.g. "4 GPUs: 4 x RTX …" → "4 x RTX … GPUs").
+	if len(parts) == 1 {
+		return parts[0] + " " + label
+	}
 	return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
 }

@@ -1716,12 +1973,16 @@ func renderBenchmark(opts HandlerOptions) string {
        </div>
      </div>
      <label class="benchmark-cb-row">
-        <input type="checkbox" id="benchmark-parallel-gpus">
-        <span>Run all selected GPUs simultaneously (parallel mode)</span>
+        <input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
+        <span>Sequential — one GPU at a time</span>
      </label>
-      <label class="benchmark-cb-row">
-        <input type="checkbox" id="benchmark-run-nccl" checked>
-        <span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
+      <label class="benchmark-cb-row" id="benchmark-parallel-label">
+        <input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
+        <span>Parallel — all selected GPUs simultaneously</span>
+      </label>
+      <label class="benchmark-cb-row" id="benchmark-ramp-label">
+        <input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
+        <span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
      </label>
      <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
      <button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>&#9654; Run Benchmark</button>
@@ -1774,22 +2035,28 @@ function benchmarkSelectedGPUIndices() {
    .sort(function(a, b) { return a - b; });
 }

+function benchmarkMode() {
+  const el = document.querySelector('input[name="benchmark-mode"]:checked');
+  return el ? el.value : 'sequential';
+}
+
 function benchmarkUpdateSelectionNote() {
  const selected = benchmarkSelectedGPUIndices();
  const btn = document.getElementById('benchmark-run-btn');
  const note = document.getElementById('benchmark-selection-note');
-  const nccl = document.getElementById('benchmark-run-nccl');
  if (!selected.length) {
    btn.disabled = true;
    note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
    return;
  }
  btn.disabled = false;
-  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '.';
-  if (nccl && nccl.checked && selected.length < 2) {
-    note.textContent += ' NCCL will be skipped because fewer than 2 GPUs are selected.';
-  } else if (nccl && nccl.checked) {
-    note.textContent += ' NCCL interconnect will use only these GPUs.';
+  const mode = benchmarkMode();
+  if (mode === 'ramp-up') {
+    note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL on final step.';
+  } else if (mode === 'parallel') {
+    note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously.' + (selected.length > 1 ? ' NCCL included.' : '');
+  } else {
+    note.textContent = 'Sequential: each GPU benchmarked separately.' + (selected.length > 1 ? ' NCCL included on each.' : '');
  }
 }

@@ -1807,6 +2074,33 @@ function benchmarkRenderGPUList(gpus) {
      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
      + '</label>';
  }).join('');
+  benchmarkApplyMultiGPUState(gpus.length);
+  benchmarkUpdateSelectionNote();
+}
+
+// Disable radio options that require multiple GPUs when only one is present.
+function benchmarkApplyMultiGPUState(gpuCount) {
+  var multiValues = ['parallel', 'ramp-up'];
+  var radios = document.querySelectorAll('input[name="benchmark-mode"]');
+  radios.forEach(function(el) {
+    var isMulti = multiValues.indexOf(el.value) >= 0;
+    if (gpuCount < 2 && isMulti) {
+      el.disabled = true;
+      if (el.checked) {
+        // fall back to sequential
+        var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
+        if (seq) seq.checked = true;
+      }
+      var label = el.closest('label');
+      if (label) label.style.opacity = '0.4';
+    } else {
+      el.disabled = false;
+      // restore default: ramp-up checked when ≥2 GPUs
+      if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
+      var label = el.closest('label');
+      if (label) label.style.opacity = '';
+    }
+  });
  benchmarkUpdateSelectionNote();
 }

@@ -1844,12 +2138,15 @@ function runNvidiaBenchmark() {
    return;
  }
  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
-  const parallelGPUs = !!document.getElementById('benchmark-parallel-gpus').checked;
+  const mode = benchmarkMode();
+  const rampUp = mode === 'ramp-up' && selected.length > 1;
+  const parallelGPUs = mode === 'parallel';
  const body = {
    profile: document.getElementById('benchmark-profile').value || 'standard',
    gpu_indices: selected,
-    run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
+    run_nccl: selected.length > 1,
    parallel_gpus: parallelGPUs,
+    ramp_up: rampUp,
    display_name: 'NVIDIA Benchmark'
  };
  document.getElementById('benchmark-output').style.display = 'block';
@@ -1904,7 +2201,6 @@ function runNvidiaBenchmark() {
  });
 }

-document.getElementById('benchmark-run-nccl').addEventListener('change', benchmarkUpdateSelectionNote);
 benchmarkLoadGPUs();
 </script>`
 }
@@ -2082,7 +2378,7 @@ func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) strin

 func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
-<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics and ` + "targeted_stress" + ` remain in <a href="/validate">Validate</a>. Burn exposes official NVIDIA load recipes by test goal plus separate custom stress tools.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

 <div class="card" style="margin-bottom:16px">
@@ -2095,11 +2391,11 @@ func renderBurn() string {
      <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
    </div>
    <div class="burn-profile-col burn-profile-action">
-      <button class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
+      <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
      <p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
    </div>
    <div class="burn-profile-col burn-profile-action">
-      <button class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
+      <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
      <p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
    </div>
  </div>
@@ -2116,12 +2412,26 @@ func renderBurn() string {
      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
    </div>
-    <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
-      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
-    </div>
-    <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
-  </div>
-</div>
+	    <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+	      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+	    </div>
+	    <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
+	    <div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
+	      <label class="cb-row">
+	        <input type="radio" name="burn-nvidia-mode" value="sequential" checked>
+	        <span>Sequential — selected GPUs one at a time</span>
+	      </label>
+	      <label class="cb-row" id="burn-parallel-label">
+	        <input type="radio" name="burn-nvidia-mode" value="parallel">
+	        <span>Parallel — all selected GPUs simultaneously</span>
+	      </label>
+	      <label class="cb-row" id="burn-ramp-label">
+	        <input type="radio" name="burn-nvidia-mode" value="ramp-up">
+	        <span>Ramp-up — add one GPU at a time</span>
+	      </label>
+	    </div>
+	  </div>
+	</div>

 <div class="burn-section">Core Burn Paths</div>
 <div class="grid2 burn-grid" style="margin-bottom:16px">
@@ -2147,10 +2457,6 @@ func renderBurn() string {
 </div>
 </div>

-<div class="burn-section">GPU-Specific Tests</div>
-<div class="grid2 burn-grid" style="margin-bottom:16px">
-</div>
-
 <div id="bi-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Output <span id="bi-title"></span></div>
  <div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
@@ -2199,6 +2505,32 @@ function burnSelectedGPUIndices() {
    .sort(function(a, b) { return a - b; });
 }

+function burnNvidiaMode() {
+  const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
+  return el ? el.value : 'sequential';
+}
+
+function burnApplyMultiGPUState(gpuCount) {
+  var multiValues = ['parallel', 'ramp-up'];
+  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
+  radios.forEach(function(el) {
+    var isMulti = multiValues.indexOf(el.value) >= 0;
+    if (gpuCount < 2 && isMulti) {
+      el.disabled = true;
+      if (el.checked) {
+        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
+        if (seq) seq.checked = true;
+      }
+      var label = el.closest('label');
+      if (label) label.style.opacity = '0.4';
+    } else {
+      el.disabled = false;
+      var label = el.closest('label');
+      if (label) label.style.opacity = '';
+    }
+  });
+}
+
 function burnUpdateSelectionNote() {
  const note = document.getElementById('burn-selection-note');
  const selected = burnSelectedGPUIndices();
@@ -2223,6 +2555,7 @@ function burnRenderGPUList(gpus) {
      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
      + '</label>';
  }).join('');
+  burnApplyMultiGPUState(gpus.length);
  burnUpdateSelectionNote();
 }

@@ -2258,6 +2591,12 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
    }
    body.gpu_indices = selected;
+    const bMode = burnNvidiaMode();
+    if (bMode === 'ramp-up' && selected.length > 1) {
+      body.stagger_gpu_start = true;
+    } else if (bMode === 'parallel' && selected.length > 1) {
+      body.parallel_gpus = true;
+    }
  }
  return fetch('/api/sat/' + target + '/run', {
    method: 'POST',
@@ -2849,56 +3188,6 @@ usbRefresh();
 </script>`
 }

-// ── Display Resolution ────────────────────────────────────────────────────────
-
-func renderDisplayInline() string {
-	return `<div id="display-status" style="color:var(--muted);font-size:13px;margin-bottom:12px">Loading displays...</div>
-<div id="display-controls"></div>
-<script>
-(function(){
-function loadDisplays() {
-  fetch('/api/display/resolutions').then(r=>r.json()).then(displays => {
-    const status = document.getElementById('display-status');
-    const ctrl = document.getElementById('display-controls');
-    if (!displays || displays.length === 0) {
-      status.textContent = 'No connected displays found or xrandr not available.';
-      return;
-    }
-    status.textContent = '';
-    ctrl.innerHTML = displays.map(d => {
-      const opts = (d.modes||[]).map(m =>
-        '<option value="'+m.mode+'"'+(m.current?' selected':'')+'>'+m.mode+(m.current?' (current)':'')+'</option>'
-      ).join('');
-      return '<div style="margin-bottom:12px">'
-        +'<span style="font-weight:600;margin-right:8px">'+d.output+'</span>'
-        +'<span style="color:var(--muted);font-size:12px;margin-right:12px">Current: '+d.current+'</span>'
-        +'<select id="res-sel-'+d.output+'" style="margin-right:8px">'+opts+'</select>'
-        +'<button class="btn btn-sm btn-primary" onclick="applyResolution(\''+d.output+'\')">Apply</button>'
-        +'</div>';
-    }).join('');
-  }).catch(()=>{
-    document.getElementById('display-status').textContent = 'xrandr not available on this system.';
-  });
-}
-window.applyResolution = function(output) {
-  const sel = document.getElementById('res-sel-'+output);
-  if (!sel) return;
-  const mode = sel.value;
-  const btn = sel.nextElementSibling;
-  btn.disabled = true;
-  btn.textContent = 'Applying...';
-  fetch('/api/display/set', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({output:output,mode:mode})})
-    .then(r=>r.json()).then(d=>{
-      if (d.error) { alert('Error: '+d.error); }
-      loadDisplays();
-    }).catch(e=>{ alert('Error: '+e); })
-    .finally(()=>{ btn.disabled=false; btn.textContent='Apply'; });
-};
-loadDisplays();
-})();
-</script>`
-}
-
 func renderNvidiaSelfHealInline() string {
 	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
 <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
@@ -3086,8 +3375,6 @@ function installToRAM() {
 <div class="card"><div class="card-head">Services</div><div class="card-body">` +
 		renderServicesInline() + `</div></div>

-<div class="card"><div class="card-head">Display Resolution</div><div class="card-body">` +
-		renderDisplayInline() + `</div></div>

 <script>
 function checkTools() {
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -295,10 +295,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// Tools
 	mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)

-	// Display
-	mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
-	mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
-
 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
 	mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -741,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	for _, needle := range []string{
 		`NVIDIA Max Compute Load`,
 		`dcgmproftester`,
-		`targeted_stress remain in <a href="/validate">Validate</a>`,
-		`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
+		`NCCL`,
+		`Validate → Stress mode`,
 		`id="burn-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
@@ -1094,6 +1094,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
 	}
 	body := rec.Body.String()
 	for _, needle := range []string{
+		// Runtime Health card — LiveCD checks only
 		`Runtime Health`,
 		`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
 		`Export Directory`,
@@ -1102,16 +1103,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
 		`CUDA / ROCm`,
 		`Required Utilities`,
 		`Bee Services`,
-		`<td>CPU</td>`,
-		`<td>Memory</td>`,
-		`<td>Storage</td>`,
-		`<td>GPU</td>`,
 		`CUDA runtime is not ready for GPU SAT.`,
 		`Missing: nvidia-smi`,
 		`bee-nvidia=inactive`,
-		`cpu SAT: FAILED`,
-		`storage SAT: FAILED`,
-		`sat:nvidia`,
+		// Hardware Summary card — component health badges
+		`Hardware Summary`,
+		`>CPU<`,
+		`>Memory<`,
+		`>Storage<`,
+		`>GPU<`,
+		`>PSU<`,
+		`badge-warn`,   // cpu Warning badge
+		`badge-err`,    // storage Critical badge
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("dashboard missing %q: %s", needle, body)
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -118,6 +118,7 @@ type taskParams struct {
 	StressMode         bool     `json:"stress_mode,omitempty"`
 	GPUIndices         []int    `json:"gpu_indices,omitempty"`
 	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
+	StaggerGPUStart    bool     `json:"stagger_gpu_start,omitempty"`
 	SizeMB             int      `json:"size_mb,omitempty"`
 	Passes             int      `json:"passes,omitempty"`
 	Loader             string   `json:"loader,omitempty"`
@@ -125,6 +126,9 @@ type taskParams struct {
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
+	RampStep           int      `json:"ramp_step,omitempty"`
+	RampTotal          int      `json:"ramp_total,omitempty"`
+	RampRunID          string   `json:"ramp_run_id,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
 	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
@@ -151,6 +155,12 @@ type burnPreset struct {
 	DurationSec int
 }

+type nvidiaRampSpec struct { // timing plan returned by resolveNvidiaRampPlan for an NVIDIA burn, with or without a staggered GPU start
+	DurationSec      int // duration in seconds; the burn preset's value unless the overnight plan recomputes it
+	StaggerSeconds   int // stagger interval per GPU in seconds; stays 0 when staggering is off or only one GPU is selected
+	TotalDurationSec int // overall runtime in seconds; equals DurationSec + StaggerSeconds*(GPU count - 1) for a staggered plan
+}
+
 func resolveBurnPreset(profile string) burnPreset {
 	switch profile {
 	case "overnight":
@@ -162,6 +172,45 @@ func resolveBurnPreset(profile string) burnPreset {
 	}
 }

+func resolveNvidiaRampPlan(profile string, enabled bool, selected []int) (nvidiaRampSpec, error) { // computes ramp timings for a burn profile; returns the plain preset plan when staggering is disabled or one GPU is selected, and an error when staggering is enabled with no GPUs or the overnight plan would exceed 10 hours
+	base := resolveBurnPreset(profile).DurationSec // preset duration for the profile
+	plan := nvidiaRampSpec{
+		DurationSec:      base,
+		TotalDurationSec: base,
+	}
+	if !enabled { // staggering off: StaggerSeconds stays 0
+		return plan, nil
+	}
+	count := len(selected)
+	if count == 0 {
+		return nvidiaRampSpec{}, fmt.Errorf("staggered NVIDIA burn requires explicit GPU selection")
+	}
+	if count == 1 { // a single GPU has nothing to stagger against
+		return plan, nil
+	}
+
+	switch profile {
+	case "acceptance":
+		plan.StaggerSeconds = 10 * 60 // 10 minutes per GPU
+		plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1) // ramp time is added on top of the preset duration
+	case "overnight":
+		plan.StaggerSeconds = 60 * 60 // 1 hour per GPU
+		plan.TotalDurationSec = 8 * 60 * 60 // start from a fixed 8 h total
+		minTotal := count * 60 * 60 // at least one hour of total runtime per selected GPU
+		if plan.TotalDurationSec < minTotal {
+			plan.TotalDurationSec = minTotal
+		}
+		if plan.TotalDurationSec > 10*60*60 { // only reached with more than 10 GPUs, since the total is max(8 h, count h)
+			return nvidiaRampSpec{}, fmt.Errorf("overnight staggered NVIDIA burn supports at most 10 GPUs")
+		}
+		plan.DurationSec = plan.TotalDurationSec - plan.StaggerSeconds*(count-1) // what remains of the total once the ramp is subtracted
+	default:
+		plan.StaggerSeconds = 2 * 60 // 2 minutes per GPU for any other profile
+		plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
+	}
+	return plan, nil
+}
+
 func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
 	acceptanceCycles := []platform.PlatformStressCycle{
 		{LoadSec: 85, IdleSec: 5},
@@ -591,6 +640,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
 			RunNCCL:           t.params.RunNCCL,
 			ParallelGPUs:      t.params.ParallelGPUs,
+			RampStep:          t.params.RampStep,
+			RampTotal:         t.params.RampTotal,
+			RampRunID:         t.params.RampRunID,
 		}, j.append)
 	case "nvidia-compute":
 		if a == nil {
@@ -601,7 +653,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
-		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
+		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
+		if planErr != nil {
+			err = planErr
+			break
+		}
+		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
+			dur = rampPlan.DurationSec
+		}
+		if rampPlan.StaggerSeconds > 0 {
+			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
+		}
+		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
 	case "nvidia-targeted-power":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -651,11 +714,23 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
+		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
+		if planErr != nil {
+			err = planErr
+			break
+		}
+		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
+			dur = rampPlan.DurationSec
+		}
+		if rampPlan.StaggerSeconds > 0 {
+			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
+		}
 		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
 			DurationSec:       dur,
 			Loader:            t.params.Loader,
 			GPUIndices:        t.params.GPUIndices,
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			StaggerSeconds:    rampPlan.StaggerSeconds,
 		}, j.append)
 	case "memory":
 		if a == nil {
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -491,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) {
 	}
 }

+// TestResolveNvidiaRampPlan checks the ramp plan for every profile branch,
+// including the boundaries where staggering is skipped (one GPU) and where the
+// overnight profile reaches its largest accepted GPU count (ten).
+func TestResolveNvidiaRampPlan(t *testing.T) {
+	tests := []struct {
+		name     string
+		profile  string
+		enabled  bool
+		selected []int
+		want     nvidiaRampSpec
+		wantErr  string
+	}{
+		{
+			name:     "disabled uses base preset",
+			profile:  "acceptance",
+			selected: []int{0, 1},
+			want:     nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60},
+		},
+		{
+			name:     "single gpu needs no stagger",
+			profile:  "smoke",
+			enabled:  true,
+			selected: []int{3},
+			want:     nvidiaRampSpec{DurationSec: 5 * 60, TotalDurationSec: 5 * 60},
+		},
+		{
+			name:     "smoke ramp uses two minute steps",
+			profile:  "smoke",
+			enabled:  true,
+			selected: []int{0, 1, 2},
+			want:     nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60},
+		},
+		{
+			name:     "acceptance ramp uses ten minute steps",
+			profile:  "acceptance",
+			enabled:  true,
+			selected: []int{0, 1, 2},
+			want:     nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60},
+		},
+		{
+			name:     "overnight stays at eight hours when possible",
+			profile:  "overnight",
+			enabled:  true,
+			selected: []int{0, 1, 2},
+			want:     nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60},
+		},
+		{
+			name:     "overnight extends to keep one hour after final gpu",
+			profile:  "overnight",
+			enabled:  true,
+			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
+			want:     nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60},
+		},
+		{
+			name:     "overnight accepts ten gpus",
+			profile:  "overnight",
+			enabled:  true,
+			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+			want:     nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 10 * 60 * 60},
+		},
+		{
+			name:     "overnight rejects impossible gpu count",
+			profile:  "overnight",
+			enabled:  true,
+			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+			wantErr:  "at most 10 GPUs",
+		},
+		{
+			name:    "enabled requires explicit selection",
+			profile: "smoke",
+			enabled: true,
+			wantErr: "requires explicit GPU selection",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
+			if tc.wantErr != "" {
+				if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
+					t.Fatalf("err=%v want substring %q", err, tc.wantErr)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("resolveNvidiaRampPlan error: %v", err)
+			}
+			if got != tc.want {
+				t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
+			}
+		})
+	}
+}
+
 func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
 	tests := []struct {
 		loader string
--- a/2
+++ b/2
--- a/bible-local/docs/gpu-model-propagation.md
+++ b/bible-local/docs/gpu-model-propagation.md
@@ -0,0 +1,117 @@
+# GPU Model Name Propagation
+
+How GPU model names are detected, stored, and displayed throughout the project.
+
+---
+
+## Detection Sources
+
+There are **three separate pipelines** for GPU model names — they use different structs and don't share state.
+
+### Pipeline A — Live / SAT (nvidia-smi query at runtime)
+
+**File:** `audit/internal/platform/sat.go`
+
+- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
+- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
+- Used by: GPU selection UI, live metrics labels, burn/stress test logic
+
+### Pipeline B — Benchmark results
+
+**File:** `audit/internal/platform/benchmark.go`, line 124
+
+- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
+- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
+- Used by: benchmark history table, benchmark report
+
+### Pipeline C — Hardware audit JSON (PCIe schema)
+
+**File:** `audit/internal/schema/hardware.go`
+
+- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
+- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
+- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
+- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
+
+---
+
+## Key Inconsistency: NVIDIA PCIe Model is Never Set
+
+`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
+
+This means:
+- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
+- AMD GPUs do have their model populated
+
+The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
+
+---
+
+## Benchmark History "Unknown GPU" Issue
+
+**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
+
+**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
+
+This happens for:
+- Older result files saved before the `Name` field was added
+- Runs where nvidia-smi query failed before the benchmark started
+
+---
+
+## Fallback Strings — Current State
+
+| Location | File | Fallback string |
+|---|---|---|
+| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
+| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
+| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
+| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
+| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
+| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
+| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
+| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
+
+**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
+
+---
+
+## GPU Selection UI
+
+**File:** `audit/internal/webui/pages.go`
+
+- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
+- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
+- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
+
+This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
+
+---
+
+## Data Flow Summary
+
+```
+nvidia-smi (live)
+  └─ ListNvidiaGPUs() → NvidiaGPU.Name
+       ├─ GPU selection UI (always correct)
+       ├─ Live metrics labels (charts_svg.go)
+       └─ SAT/burn status file (sat.go)
+
+nvidia-smi (at benchmark start)
+  └─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
+       └─ BenchmarkGPUResult.Name (json:"name,omitempty")
+            ├─ Benchmark report
+            └─ Benchmark history table columns
+
+nvidia-smi / lspci (audit collection)
+  └─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
+       └─ Hardware summary page hwDescribeGPU()
+```
+
+---
+
+## What Needs Fixing
+
+1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
+2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
+3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
--- a/iso/builder/config/bootloaders/grub-pc/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg
@@ -11,18 +11,18 @@ echo "  Hardware Audit LiveCD"
 echo ""

 menuentry "EASY-BEE" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }

 submenu "EASY-BEE (advanced options) -->" {
    menuentry "EASY-BEE — GSP=off" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
    }

    menuentry "EASY-BEE — KMS (no nomodeset)" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
    }

--- a/iso/builder/config/bootloaders/grub-pc/theme.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/theme.cfg
@@ -1,9 +1,9 @@
 set color_normal=light-gray/black
-set color_highlight=white/dark-gray
+set color_highlight=yellow/black

 if [ -e /boot/grub/splash.png ]; then
    set theme=/boot/grub/live-theme/theme.txt
 else
-    set menu_color_normal=cyan/black
-    set menu_color_highlight=white/dark-gray
+    set menu_color_normal=yellow/black
+    set menu_color_highlight=white/brown
 fi
--- a/iso/builder/config/bootloaders/isolinux/live.cfg.in
+++ b/iso/builder/config/bootloaders/isolinux/live.cfg.in
@@ -3,31 +3,31 @@ label live-@FLAVOUR@-normal
    menu default
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.nvidia.mode=normal
+    append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1

 label live-@FLAVOUR@-kms
    menu label EASY-BEE (^graphics/KMS)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
+    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1

 label live-@FLAVOUR@-toram
    menu label EASY-BEE (^load to RAM)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ toram bee.nvidia.mode=normal
+    append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1

 label live-@FLAVOUR@-gsp-off
    menu label EASY-BEE (^NVIDIA GSP=off)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1

 label live-@FLAVOUR@-kms-gsp-off
    menu label EASY-BEE (g^raphics/KMS, GSP=off)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
+    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1

 label live-@FLAVOUR@-failsafe
    menu label EASY-BEE (^fail-safe)
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -25,6 +25,7 @@ ensure_bee_console_user() {
 ensure_bee_console_user

 # Enable common bee services
+systemctl enable bee-hpc-tuning.service
 systemctl enable bee-network.service
 systemctl enable bee-preflight.service
 systemctl enable bee-audit.service
@@ -55,6 +56,7 @@ fi
 # nogpu: no GPU services needed

 # Ensure scripts are executable
+chmod +x /usr/local/bin/bee-hpc-tuning  2>/dev/null || true
 chmod +x /usr/local/bin/bee-network.sh  2>/dev/null || true
 chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -10,20 +10,15 @@ import os

 W, H = 1920, 1080

-GLYPHS = {
-    'E': ["11111", "10000", "11110", "10000", "10000", "10000", "11111"],
-    'A': ["01110", "10001", "10001", "11111", "10001", "10001", "10001"],
-    'S': ["01111", "10000", "10000", "01110", "00001", "00001", "11110"],
-    'Y': ["10001", "10001", "01010", "00100", "00100", "00100", "00100"],
-    'B': ["11110", "10001", "10001", "11110", "10001", "10001", "11110"],
-    '-': ["00000", "00000", "11111", "00000", "00000", "00000", "00000"],
-}
-
-TITLE = "EASY-BEE"
-SUBTITLE = "Hardware Audit LiveCD"
-CELL = 30
-GLYPH_GAP = 18
-ROW_GAP = 6
+ASCII_ART = [
+    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
+    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
+    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
+    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
+    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
+    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
+]
+SUBTITLE = "  Hardware Audit LiveCD"

 FG = (0xF6, 0xD0, 0x47)
 FG_DIM = (0xD4, 0xA9, 0x1C)
@@ -31,6 +26,12 @@ SHADOW = (0x5E, 0x47, 0x05)
 SUB = (0x96, 0x7A, 0x17)
 BG = (0x05, 0x05, 0x05)

+MONO_FONT_CANDIDATES = [
+    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
+    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
+]
 SUB_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
@@ -39,43 +40,34 @@ SUB_FONT_CANDIDATES = [
 ]


-def load_font(size):
-    for path in SUB_FONT_CANDIDATES:
+def load_font(candidates, size):
+    for path in candidates:
        if os.path.exists(path):
            return ImageFont.truetype(path, size)
    return ImageFont.load_default()


-def glyph_width(ch):
-    return len(GLYPHS[ch][0])
+def mono_metrics(font):
+    probe = Image.new('L', (W, H), 0)
+    draw = ImageDraw.Draw(probe)
+    char_w = int(round(draw.textlength("M", font=font)))
+    bb = draw.textbbox((0, 0), "Mg", font=font)
+    char_h = bb[3] - bb[1]
+    return char_w, char_h


-def render_logo_mask():
-    width_cells = 0
-    for idx, ch in enumerate(TITLE):
-        width_cells += glyph_width(ch)
-        if idx != len(TITLE) - 1:
-            width_cells += 1
-    mask_w = width_cells * CELL + (len(TITLE) - 1) * GLYPH_GAP
-    mask_h = 7 * CELL + 6 * ROW_GAP
-    mask = Image.new('L', (mask_w, mask_h), 0)
+def render_ascii_mask(font, lines, char_w, char_h, line_gap):
+    width = max(len(line) for line in lines) * char_w
+    height = len(lines) * char_h + line_gap * (len(lines) - 1)
+    mask = Image.new('L', (width, height), 0)
    draw = ImageDraw.Draw(mask)
-
-    cx = 0
-    for idx, ch in enumerate(TITLE):
-        glyph = GLYPHS[ch]
-        for row_idx, row in enumerate(glyph):
-            for col_idx, cell in enumerate(row):
-                if cell != '1':
-                    continue
-                x0 = cx + col_idx * CELL
-                y0 = row_idx * (CELL + ROW_GAP)
-                x1 = x0 + CELL - 4
-                y1 = y0 + CELL - 4
-                draw.rounded_rectangle((x0, y0, x1, y1), radius=4, fill=255)
-        cx += glyph_width(ch) * CELL
-        if idx != len(TITLE) - 1:
-            cx += CELL + GLYPH_GAP
+    for row, line in enumerate(lines):
+        y = row * (char_h + line_gap)
+        for col, ch in enumerate(line):
+            if ch == ' ':
+                continue
+            x = col * char_w
+            draw.text((x, y), ch, font=font, fill=255)
    return mask


@@ -90,20 +82,28 @@ glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
 glow = glow.filter(ImageFilter.GaussianBlur(60))
 img = Image.alpha_composite(img.convert('RGBA'), glow)

-logo_mask = render_logo_mask()
+TARGET_LOGO_W = 400
+max_chars = max(len(line) for line in ASCII_ART)
+_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
+_probe_cw, _ = mono_metrics(_probe_font)
+font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
+font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
+char_w, char_h = mono_metrics(font_logo)
+logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
 logo_w, logo_h = logo_mask.size
 logo_x = (W - logo_w) // 2
-logo_y = 290
+logo_y = 380

-shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
-img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
-img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
+sh_off = max(1, font_size_logo // 6)
+shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
+img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
+img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
 img.paste(FG, (logo_x, logo_y), logo_mask)

-font_sub = load_font(30)
+font_sub = load_font(SUB_FONT_CANDIDATES, 30)
 sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
 sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
-sub_y = logo_y + logo_h + 54
+sub_y = logo_y + logo_h + 48
 draw = ImageDraw.Draw(img)
 draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
 draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
--- a/iso/overlay/etc/systemd/system/bee-hpc-tuning.service
+++ b/iso/overlay/etc/systemd/system/bee-hpc-tuning.service
@@ -0,0 +1,14 @@
+[Unit]
+Description=Bee: HPC tuning (CPU governor, C-states)
+After=local-fs.target
+Before=bee-nvidia.service bee-audit.service
+
+[Service]
+Type=oneshot
+ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-hpc-tuning.log /usr/local/bin/bee-hpc-tuning
+StandardOutput=journal
+StandardError=journal
+RemainAfterExit=yes
+
+[Install]
+WantedBy=multi-user.target
--- a/iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
+++ b/iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
@@ -0,0 +1,110 @@
+#!/bin/sh
+set -eu
+
+SECONDS=300
+STAGGER_SECONDS=180
+DEVICES=""
+EXCLUDE=""
+
+usage() {
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
+    exit 2
+}
+
+normalize_list() {
+    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
+}
+
+contains_csv() {
+    needle="$1"
+    haystack="${2:-}"
+    echo ",${haystack}," | grep -q ",${needle},"
+}
+
+resolve_dcgmproftester() {
+    for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
+        if command -v "${candidate}" >/dev/null 2>&1; then
+            command -v "${candidate}"
+            return 0
+        fi
+    done
+    return 1
+}
+
+while [ "$#" -gt 0 ]; do
+    case "$1" in
+        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
+        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
+        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
+        *) usage ;;
+    esac
+done
+
+PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
+ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
+[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
+
+DEVICES=$(normalize_list "${DEVICES}")
+EXCLUDE=$(normalize_list "${EXCLUDE}")
+SELECTED="${DEVICES}"
+if [ -z "${SELECTED}" ]; then
+    SELECTED="${ALL_DEVICES}"
+fi
+
+FINAL=""
+for id in $(echo "${SELECTED}" | tr ',' ' '); do
+    [ -n "${id}" ] || continue
+    if contains_csv "${id}" "${EXCLUDE}"; then
+        continue
+    fi
+    if [ -z "${FINAL}" ]; then
+        FINAL="${id}"
+    else
+        FINAL="${FINAL},${id}"
+    fi
+done
+
+[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
+
+echo "loader=dcgmproftester-staggered"
+echo "selected_gpus=${FINAL}"
+echo "stagger_seconds=${STAGGER_SECONDS}"
+
+TMP_DIR=$(mktemp -d)
+trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
+
+GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
+gpu_pos=0
+WORKERS=""
+for id in $(echo "${FINAL}" | tr ',' ' '); do
+    gpu_pos=$((gpu_pos + 1))
+    log="${TMP_DIR}/gpu-${id}.log"
+    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
+    gpu_seconds=$(( SECONDS + extra_sec ))
+    echo "starting gpu ${id} seconds=${gpu_seconds}"
+    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
+    pid=$!
+    WORKERS="${WORKERS} ${pid}:${id}:${log}"
+    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
+        sleep "${STAGGER_SECONDS}"
+    fi
+done
+
+status=0
+for spec in ${WORKERS}; do
+    pid=${spec%%:*}
+    rest=${spec#*:}
+    id=${rest%%:*}
+    log=${rest#*:}
+    if wait "${pid}"; then
+        echo "gpu ${id} finished: OK"
+    else
+        rc=$?
+        echo "gpu ${id} finished: FAILED rc=${rc}"
+        status=1
+    fi
+    sed "s/^/[gpu ${id}] /" "${log}" || true
+done
+
+exit "${status}"
--- a/iso/overlay/usr/local/bin/bee-gpu-burn
+++ b/iso/overlay/usr/local/bin/bee-gpu-burn
@@ -2,13 +2,14 @@
 set -eu

 SECONDS=5
+STAGGER_SECONDS=0
 SIZE_MB=0
 DEVICES=""
 EXCLUDE=""
 WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"

 usage() {
-    echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
 }

@@ -25,6 +26,7 @@ contains_csv() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
@@ -61,14 +63,18 @@ done

 echo "loader=bee-gpu-burn"
 echo "selected_gpus=${FINAL}"
+echo "stagger_seconds=${STAGGER_SECONDS}"

 export CUDA_DEVICE_ORDER="PCI_BUS_ID"

 TMP_DIR=$(mktemp -d)
 trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM

+GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
+gpu_pos=0
 WORKERS=""
 for id in $(echo "${FINAL}" | tr ',' ' '); do
+    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    gpu_size_mb="${SIZE_MB}"
    if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
            gpu_size_mb=512
        fi
    fi
-    echo "starting gpu ${id} size=${gpu_size_mb}MB"
+    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
+    gpu_seconds=$(( SECONDS + extra_sec ))
+    echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
    CUDA_VISIBLE_DEVICES="${id}" \
-        "${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
+        "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
+    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
+        sleep "${STAGGER_SECONDS}"
+    fi
 done

 status=0
--- a/iso/overlay/usr/local/bin/bee-hpc-tuning
+++ b/iso/overlay/usr/local/bin/bee-hpc-tuning
@@ -0,0 +1,41 @@
+#!/bin/sh
+# bee-hpc-tuning — apply HPC tuning for deterministic benchmarking
+# Called by bee-hpc-tuning.service at boot.
+
+log() { echo "[bee-hpc-tuning] $*"; }
+
+# ── CPU governor ────────────────────────────────────────────────────────────
+# Set all CPU cores to performance governor via sysfs.
+# cpupower is not available; write directly to scaling_governor.
+governor_ok=0
+governor_fail=0
+for gov_path in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
+    [ -f "$gov_path" ] || continue
+    if echo performance > "$gov_path" 2>/dev/null; then
+        governor_ok=$((governor_ok + 1))
+    else
+        governor_fail=$((governor_fail + 1))
+    fi
+done
+
+if [ "$governor_ok" -gt 0 ] && [ "$governor_fail" -eq 0 ]; then
+    log "CPU governor set to performance on ${governor_ok} core(s)"
+elif [ "$governor_ok" -gt 0 ]; then
+    log "WARN: CPU governor: ${governor_ok} OK, ${governor_fail} failed"
+elif [ "$governor_fail" -gt 0 ]; then
+    log "WARN: failed to set CPU governor on ${governor_fail} core(s)"
+else
+    log "WARN: no cpufreq scaling_governor paths found (C-state governor or HW-controlled)"
+fi
+
+# ── Transparent Huge Pages ───────────────────────────────────────────────────
+# Kernel cmdline sets transparent_hugepage=always at boot, but confirm and log.
+thp_path=/sys/kernel/mm/transparent_hugepage/enabled
+if [ -f "$thp_path" ]; then
+    current=$(cat "$thp_path" 2>/dev/null)
+    log "transparent_hugepage: ${current}"
+else
+    log "WARN: transparent_hugepage sysfs path not found"
+fi
+
+log "done"
--- a/iso/overlay/usr/local/bin/bee-john-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-john-gpu-stress
@@ -2,6 +2,7 @@
 set -eu

 DURATION_SEC=300
+STAGGER_SECONDS=0
 DEVICES=""
 EXCLUDE=""
 FORMAT=""
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
 export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

 usage() {
-    echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
    exit 2
 }

@@ -118,6 +119,7 @@ ensure_opencl_ready() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
+        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -170,6 +172,7 @@ done
 echo "loader=john"
 echo "selected_gpus=${FINAL}"
 echo "john_devices=${JOHN_DEVICES}"
+echo "stagger_seconds=${STAGGER_SECONDS}"

 cd "${JOHN_DIR}"

@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
 echo "format=${CHOSEN_FORMAT}"
 echo "target_seconds=${DURATION_SEC}"
 echo "slice_seconds=${TEST_SLICE_SECONDS}"
-DEADLINE=$(( $(date +%s) + DURATION_SEC ))
+TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
 _first=1
+pos=0
 for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
+    pos=$((pos + 1))
    [ "${_first}" = "1" ] || sleep 3
    _first=0
-    run_john_loop "${opencl_id}" "${DEADLINE}" &
+    extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
+    deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
+    run_john_loop "${opencl_id}" "${deadline}" &
    pid=$!
    PIDS="${PIDS} ${pid}"
+    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
+        sleep "${STAGGER_SECONDS}"
+    fi
 done
 FAIL=0
 for pid in ${PIDS}; do
--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {

 log "kernel: $(uname -r)"

-# Skip if no NVIDIA GPU present (PCI vendor 10de)
-if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
+# Skip if no NVIDIA display/compute GPU is present.
+# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
+have_nvidia_gpu() {
+    # lspci -Dn prints one "<domain:bus:dev.fn> <class>: <vendor>:<device>" line
+    # per PCI function, so awk field $2 is the class code and $3 the
+    # vendor:device pair. The function succeeds on the first line whose class is
+    # 0300 or 0302 and whose vendor is 10de, and fails when no line matches.
+    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
+}
+
+if ! have_nvidia_gpu; then
    log "no NVIDIA GPU detected — skipping module load"
    exit 0
 fi
--- a/iso/overlay/usr/local/bin/bee-selfheal
+++ b/iso/overlay/usr/local/bin/bee-selfheal
@@ -14,7 +14,7 @@ log() {
 }

 have_nvidia_gpu() {
-    lspci -nn 2>/dev/null | grep -qi '10de:'
+    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
 }

 service_active() {
Author	SHA1	Message	Date
Michael Chus	02e44b1172	Fix USB/RAM status checks; add server model+S/N to dashboard; remove cycles USB Export Drive: lsblk reports TRAN only for whole disks, not partitions (/dev/sdc1). Strip trailing partition digits to get parent disk before transport check. LiveCD in RAM: When RunInstallToRAM copies squashfs to /dev/shm/bee-live/ but bind-mount of /run/live/medium fails (CD-ROM boots), /run/live/medium still shows the CD-ROM fstype. Add fallback: if /dev/shm/bee-live/*.squashfs exists, the data is in RAM — report status OK. Dashboard Hardware Summary: Show server Manufacturer + ProductName as heading and S/N as subline above the component table, sourced from hw.Board (dmidecode system-type data). Validate: Remove Cycles input — always run once. cycles=1 hardcoded in runAllSAT(). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:46:42 +03:00
Michael Chus	2ceaa0d0ca	Include profile and mode in benchmark task names for task list clarity Task names now follow the pattern: NVIDIA Benchmark · <profile> · <mode> [· GPU <indices>] Examples: NVIDIA Benchmark · standard · sequential (GPU 0, RTX 6000 Pro) NVIDIA Benchmark · stability · parallel NVIDIA Benchmark · standard · ramp 1/4 · GPU 0 NVIDIA Benchmark · standard · ramp 2/4 · GPU 0,1 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:36:51 +03:00
Michael Chus	9482ba20a2	Remove NCCL checkbox — auto-enable interconnect step when >1 GPU selected NCCL all_reduce is always attempted when 2+ GPUs are selected; a failure leaves InterconnectScore=0 (no bonus, no penalty) and OverallStatus unaffected. Exposing the checkbox implied NCCL is optional and made a failed run look like a deliberate skip. - Remove benchmark-run-nccl checkbox and its change listener from pages.go - Client sends run_nccl: selected.length > 1 (automatic) - api.go default runNCCL=true is unchanged - Selection note now mentions NCCL automatically for multi-GPU runs Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:33:17 +03:00
Michael Chus	813e2f86a9	Add scalability/ramp-up labeling, ServerPower penalty in scoring, and report improvements - Add RampStep/RampTotal/RampRunID to NvidiaBenchmarkOptions, taskParams, and NvidiaBenchmarkResult so ramp-up steps can be correlated across result.json files - Add ScalabilityScore field to NvidiaBenchmarkResult (placeholder; computed externally by comparing ramp-up step results sharing the same ramp_run_id) - Propagate ramp fields through api.go (generates shared ramp_run_id at spawn time), tasks.go handler, and benchmark.go result population - Apply ServerPower penalty to CompositeScore when IPMI reporting_ratio < 0.75: factor = ratio/0.75, applied per-GPU with a note explaining the reduction - Add finding when server power delta exceeds GPU-reported sum by >25% (non-GPU draw) - Report header now shows ramp step N/M and run ID instead of "parallel" when in ramp mode; shows scalability_score when non-zero Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:30:47 +03:00
Michael Chus	58a6da9b44	Recover power limits and SM count from nvidia-smi -q in enrichGPUInfo When --query-gpu CSV fields fail (exit status 2 on some Blackwell + driver combos), enrichGPUInfoWithMaxClocks now also parses from the verbose nvidia-smi -q output already collected at benchmark start: - Default Power Limit → DefaultPowerLimitW - Current Power Limit → PowerLimitW (fallback) - Multiprocessor Count → MultiprocessorCount Fixes PowerSustainScore=0 on systems where all three CSV query variants fail but nvidia-smi -q succeeds (confirmed on RTX PRO 6000 Blackwell + driver 590.48.01). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:17:56 +03:00
Michael Chus	f4a19c0a00	Add power calibration step to benchmark; fix PowerSustainScore reference Before the per-GPU compute phases, run `dcgmi diag -r targeted_power` for 45 s while collecting nvidia-smi power metrics in parallel. The p95 power per GPU is stored as calibrated_peak_power_w and used as the denominator for PowerSustainScore instead of the hardware default limit, which bee-gpu-burn cannot reach because it is compute-only. Fallback chain: calibrated peak → default limit → enforced limit. If dcgmi is absent or the run fails, calibration is skipped silently. Adjust composite score weights to match the new honest power reference: base 0.35, thermal 0.25, stability 0.25, power 0.15, NCCL bonus 0.10. Power weight reduced (0.20→0.15) because even with a calibrated reference bee-gpu-burn reaches ~60-75% of TDP by design (no concurrent mem stress). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 22:06:46 +03:00
Michael Chus	9e3dcf9b4d	Record host CPU/RAM config in benchmark results; check CPU load - BenchmarkHostConfig captures CPU model, sockets, cores, threads, and total RAM from /proc/cpuinfo and /proc/meminfo at benchmark start. - BenchmarkCPULoad samples host CPU utilisation every 10 s throughout the GPU steady-state phase (sequential and parallel paths). - Summarises avg/max/p95 and classifies status as ok / high / unstable. - Adds a finding when CPU load is elevated (avg >20% or max >40%) or erratic (stddev >12%), with a plain-English description in the report. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 20:02:04 +03:00
Michael Chus	098e19f760	Add ramp-up mode to NVIDIA GPU benchmark Adds a new checkbox (enabled by default) in the benchmark section. In ramp-up mode N tasks are spawned simultaneously: 1 GPU, then 2, then 3, up to all selected GPUs — each step runs its GPUs in parallel. NCCL runs only on the final step. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 18:34:19 +03:00
Michael Chus	e16d0f34b5	Adjust burn GPU ramp timing by profile	2026-04-12 15:58:30 +03:00
Mikhail Chusavitin	525ed8b8fc	Fix GPU clock lock normalization for Blackwell (clocks.max.* unsupported) clocks.max.graphics / clocks.max.memory CSV fields return exit status 2 on RTX PRO 6000 Blackwell (driver 98.x), causing the entire gpu inventory query to fail and clock lock to be skipped → normalization: partial. Fix: - Add minimal fallback query (index,uuid,name,pci.bus_id,vbios_version, power.limit) that succeeds even without clock fields - Add enrichGPUInfoWithMaxClocks: parses "Max Clocks" section of nvidia-smi -q verbose output to fill MaxGraphicsClockMHz / MaxMemoryClockMHz when CSV fields fail - Move nvidia-smi -q execution before queryBenchmarkGPUInfo so its output is available for clock enrichment immediately after - Tests: cover enrichment and skip-if-populated cases Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 13:33:54 +03:00
Mikhail Chusavitin	4f94ebcb2c	Add HPC tuning: PCIe ASPM off, C-states, performance CPU governor - grub.cfg + isolinux/live.cfg.in: add pcie_aspm=off, intel_idle.max_cstate=1 and processor.max_cstate=1 to all non-failsafe boot entries - bee-hpc-tuning: new script that sets all CPU cores to performance governor via sysfs and logs THP state at boot - bee-hpc-tuning.service: runs before bee-nvidia and bee-audit - 9000-bee-setup.hook.chroot: enable service and mark script executable Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 13:07:32 +03:00
Mikhail Chusavitin	05c1fde233	Warn on PCIe link speed degradation and collect lspci -vvv in techdump - collector/pcie: add applyPCIeLinkSpeedWarning that sets status=Warning and ErrorDescription when current link speed is below maximum negotiated speed (e.g. Gen1 running on a Gen5 slot) - collector/pcie: add pcieLinkSpeedRank helper for Gen string comparison - collector/pcie_filter_test: cover degraded and healthy link speed cases - platform/techdump: collect lspci -vvv → lspci-vvv.txt for LnkCap/LnkSta Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-12 12:42:17 +03:00
Michael Chus	825ef6b98a	Add USB export drive and LiveCD-in-RAM checks to Runtime Health - schema: add ToRAMStatus and USBExportPath fields to RuntimeHealth - platform/runtime.go: collectToRAMHealth (ok/warning/failed based on IsLiveMediaInRAM + toramActive) and collectUSBExportHealth (scans /proc/mounts + lsblk for writable USB-backed filesystems) - pages.go: add USB Export Drive and LiveCD in RAM rows to the health table Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-11 10:05:27 +03:00
Michael Chus	ba16021cdb	Fix GPU model propagation, export filenames, PSU/service status, and chart perf - nvidia.go: add Name field to nvidiaGPUInfo, include model name in nvidia-smi query, set dev.Model in enrichPCIeWithNVIDIAData - pages.go: fix duplicate GPU count in validate card summary (4 GPU: 4 x … → 4 x … GPU); fix PSU UNKNOWN fallback from hw.PowerSupplies; treat activating/deactivating/reloading service states as OK in Runtime Health - support_bundle.go: use "150405" time format (no colons) for exFAT compat - sat.go / benchmark.go / platform_stress.go / sat_fan_stress.go: remove .tar.gz archive creation from export dirs — export packs everything itself - charts_svg.go: add min-max downsampling (1400 pt cap) for SVG chart perf - benchmark_report.go / sat.go: normalize GPU fallback to "Unknown GPU" Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-11 10:05:27 +03:00
Mikhail Chusavitin	bb1218ddd4	Fix GPU inventory: exclude BMC virtual VGA, show real NVIDIA model names Two issues: 1. BMC/management VGA chips (e.g. Huawei iBMC Hi171x, ASPEED) were included in GPU inventory because shouldIncludePCIeDevice only checked the PCI class, not the device name. Added a name-based filter for known BMC/management patterns when the class is VGA/display/3d. 2. New NVIDIA GPUs (e.g. RTX PRO 6000 Blackwell, device ID 2bb5) showed as "Device 2bb5" because lspci's database lags behind. Added "name" to the nvidia-smi query and use it to override dev.Model during enrichment. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-10 13:57:26 +03:00
Mikhail Chusavitin	65faae8ede	Remove hpl from SAT run-all targets — no backend route exists hpl was listed in baseTargets and stressOnlyTargets but /api/sat/hpl/run was never registered, causing a 405 Method Not Allowed (not valid JSON) error when Validate one by one was triggered in stress mode. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-10 13:30:32 +03:00
Michael Chus	05241f2e0e	Redesign dashboard: split Runtime Health and Hardware Summary - Runtime Health now shows only LiveCD system status (services, tools, drivers, network, CUDA/ROCm) — hardware component rows removed - Hardware Summary now shows server components with readable descriptions (model, count×size) and component-status.json health badges - Add Network Adapters row to Hardware Summary - SFP module static info (vendor, PN, SN, connector, type, wavelength) now collected via ethtool -m regardless of carrier state - PSU statuses from IPMI audit written to component-status.json so PSU badge shows actual status after first audit instead of UNKNOWN Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-09 23:41:23 +03:00
Mikhail Chusavitin	c1690a084b	Fix app tests that mutate global defaults	2026-04-09 15:28:25 +03:00
Mikhail Chusavitin	9481ca2805	Add staged NVIDIA burn ramp-up mode	2026-04-09 15:21:14 +03:00
Mikhail Chusavitin	a78fdadd88	Refine validate and burn profile layout	2026-04-09 15:14:48 +03:00
Mikhail Chusavitin	4ef403898f	Tighten NVIDIA GPU PCI detection	2026-04-09 15:14:48 +03:00
Michael Chus	025548ab3c	UI: amber accents, smaller wallpaper logo, new support bundle name, drop display resolution - Bootloader: GRUB fallback text colors → yellow/brown (amber tone) - CLI charts: all GPU metric series use single amber color (xterm-256 #214) - Wallpaper: logo width scaled to 400 px dynamically, shadow scales with font size - Support bundle: renamed to YYYY-MM-DD (BEE-SP vX.X) SRV_MODEL SRV_SN ToD.tar.gz using dmidecode for server model (spaces→underscores) and serial number - Remove display resolution feature (UI card, API routes, handlers, tests) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 21:37:01 +03:00
Mikhail Chusavitin	e0d94d7f47	Remove HPL from build and audit flows	2026-04-08 10:00:23 +03:00
Mikhail Chusavitin	13899aa864	Drop incompatible HPL git fallback	2026-04-08 09:50:58 +03:00
Mikhail Chusavitin	f345d8a89d	Build HPL serially to avoid upstream make races	2026-04-08 09:47:35 +03:00
Mikhail Chusavitin	4715059ac0	Fix HPL MPI stub header and keep full build logs	2026-04-08 09:45:14 +03:00
Mikhail Chusavitin	0660a40287	Harden HPL builder cache and runtime libs	2026-04-08 09:40:18 +03:00
Mikhail Chusavitin	67369d9b7b	Fix OpenBLAS package lookup in HPL build	2026-04-08 09:32:49 +03:00
Mikhail Chusavitin	3f41a026ca	Add resilient HPL source fallbacks	2026-04-08 09:25:31 +03:00
Mikhail Chusavitin	0ee4f46537	Restore MOTD-style ASCII wallpaper	2026-04-08 09:14:27 +03:00
Michael Chus	8db40b098a	Update bible submodule Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 07:14:31 +03:00
Michael Chus	16e7ae00e7	Add HPL (LINPACK) benchmark as validate/stress task HPL 2.3 from netlib compiled against OpenBLAS with a minimal single-process MPI stub — no MPI package required in the ISO. Matrix size is auto-sized to 80% of total RAM at runtime. Build: - VERSIONS: HPL_VERSION=2.3, HPL_SHA256=32c5c17d… - build-hpl.sh: downloads HPL + OpenBLAS from Debian 12 repo, compiles xhpl with a self-contained mpi_stub.c - build.sh: step 80-hpl, injects xhpl + libopenblas into overlay Runtime: - bee-hpl: generates HPL.dat (N auto from /proc/meminfo, NB=256, P=1 Q=1), runs xhpl, prints standard WR... Gflops output - platform/hpl.go: RunHPL(), parses WR line → GFlops + PASSED/FAILED - tasks.go: target "hpl" - pages.go: LINPACK (HPL) card in validate/stress grid (stress-only) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 07:08:18 +03:00