Fix app tests that mutate global defaults

Add staged NVIDIA burn ramp-up mode
Refine validate and burn profile layout
2026-04-09 15:28:25 +03:00 · 2026-04-09 15:21:14 +03:00 · 2026-04-09 15:14:48 +03:00 · 2026-04-09 15:14:48 +03:00 · 2026-04-08 21:37:01 +03:00 · 2026-04-08 10:00:23 +03:00
29 changed files with 970 additions and 658 deletions
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 			archive, err = application.RunNvidiaAcceptancePack("", logLine)
 		}
 	case "memory":
-		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
+		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
 	case "storage":
-		archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
+		archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
 	case "cpu":
 		dur := *duration
 		if dur <= 0 {
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -117,15 +117,15 @@ type satRunner interface {
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
-	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
 	ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
 	ResetNvidiaGPU(index int) (string, error)
-	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
-	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
+	RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
 	RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
 	DetectGPUVendor() string
@@ -566,11 +566,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }

-func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
 }

 func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -602,14 +602,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
 }

 func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
-	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
+	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
 }

-func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
+	return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
 }

 func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
@@ -634,14 +634,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
 }

 func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
-	return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
+	return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
 }

-func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
-	return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
+	return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
 }

 func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
 	return f.runNvidiaFn(baseDir)
 }

-func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
 	if f.runNvidiaComputeFn != nil {
 		return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
 	}
@@ -217,11 +217,11 @@ func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
 	return "", nil
 }

-func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
+func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
 	return f.runMemoryFn(baseDir)
 }

-func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
+func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
 	return f.runStorageFn(baseDir)
 }

@@ -542,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
 }

 func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
-	t.Parallel()
-
 	tmp := t.TempDir()
 	oldExportDir := DefaultExportDir
 	DefaultExportDir = tmp
@@ -580,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
 }

 func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
-	t.Parallel()
-
 	tmp := t.TempDir()
 	oldExportDir := DefaultExportDir
 	DefaultExportDir = tmp
@@ -643,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
 }

 func TestRunSATDefaultsToExportDir(t *testing.T) {
-	t.Parallel()
-
 	oldSATBaseDir := DefaultSATBaseDir
 	DefaultSATBaseDir = "/tmp/export/bee-sat"
 	t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -54,7 +54,7 @@ if ! command -v lspci >/dev/null 2>&1; then
  exit 0
 fi
 found=0
-for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
+	for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
  found=1
  echo "=== GPU $gpu ==="
  lspci -s "$gpu" -vv 2>&1 || true
@@ -73,8 +73,13 @@ fi
 	{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
 for d in /sys/bus/pci/devices/*/; do
  vendor=$(cat "$d/vendor" 2>/dev/null)
-  [ "$vendor" = "0x10de" ] || continue
-  dev=$(basename "$d")
+	  [ "$vendor" = "0x10de" ] || continue
+	  class=$(cat "$d/class" 2>/dev/null)
+	  case "$class" in
+	    0x030000|0x030200) ;;
+	    *) continue ;;
+	  esac
+	  dev=$(basename "$d")
  echo "=== $dev ==="
  for f in current_link_speed current_link_width max_link_speed max_link_width; do
    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
@@ -192,7 +197,7 @@ var supportBundleOptionalFiles = []struct {
 	{name: "system/syslog.txt", src: "/var/log/syslog"},
 }

-const supportBundleGlob = "bee-support-*.tar.gz"
+const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"

 func BuildSupportBundle(exportDir string) (string, error) {
 	exportDir = strings.TrimSpace(exportDir)
@@ -206,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}

-	host := sanitizeFilename(hostnameOr("unknown"))
-	ts := time.Now().UTC().Format("20060102-150405")
-	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
+	now := time.Now().UTC()
+	date := now.Format("2006-01-02")
+	tod := now.Format("15:04:05")
+	ver := bundleVersion()
+	model := serverModelForBundle()
+	sn := serverSerialForBundle()
+
+	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
 	if err := os.MkdirAll(stageRoot, 0755); err != nil {
 		return "", err
 	}
@@ -240,7 +250,8 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}

-	archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
+	archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
+	archivePath := filepath.Join(os.TempDir(), archiveName)
 	if err := createSupportTarGz(archivePath, stageRoot); err != nil {
 		return "", err
 	}
@@ -397,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
 	return os.WriteFile(dst, []byte(body.String()), 0644)
 }

+func bundleVersion() string {
+	v := buildVersion()
+	v = strings.TrimPrefix(v, "v")
+	v = strings.TrimPrefix(v, "V")
+	if v == "" || v == "unknown" {
+		return "0.0"
+	}
+	return v
+}
+
+func serverModelForBundle() string {
+	raw, err := exec.Command("dmidecode", "-t", "1").Output()
+	if err != nil {
+		return "unknown"
+	}
+	for _, line := range strings.Split(string(raw), "\n") {
+		line = strings.TrimSpace(line)
+		key, val, ok := strings.Cut(line, ": ")
+		if !ok {
+			continue
+		}
+		if strings.TrimSpace(key) == "Product Name" {
+			val = strings.TrimSpace(val)
+			if val == "" {
+				return "unknown"
+			}
+			return strings.ReplaceAll(val, " ", "_")
+		}
+	}
+	return "unknown"
+}
+
+func serverSerialForBundle() string {
+	raw, err := exec.Command("dmidecode", "-t", "1").Output()
+	if err != nil {
+		return "unknown"
+	}
+	for _, line := range strings.Split(string(raw), "\n") {
+		line = strings.TrimSpace(line)
+		key, val, ok := strings.Cut(line, ": ")
+		if !ok {
+			continue
+		}
+		if strings.TrimSpace(key) == "Serial Number" {
+			val = strings.TrimSpace(val)
+			if val == "" {
+				return "unknown"
+			}
+			return val
+		}
+	}
+	return "unknown"
+}
+
 func buildVersion() string {
 	raw, err := exec.Command("bee", "version").CombinedOutput()
 	if err != nil {
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -326,8 +326,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 	}

 	report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
-	if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
-		return "", fmt.Errorf("write report.txt: %w", err)
+	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
+		return "", fmt.Errorf("write report.md: %w", err)
 	}

 	summary := renderBenchmarkSummary(result)
@@ -1183,18 +1183,8 @@ func queryIPMIServerPowerW() (float64, error) {
 	if err != nil {
 		return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
 	}
-	for _, line := range strings.Split(string(out), "\n") {
-		if strings.Contains(line, "Current Power") {
-			parts := strings.SplitN(line, ":", 2)
-			if len(parts) == 2 {
-				val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
-				val = strings.TrimSpace(val)
-				w, err := strconv.ParseFloat(val, 64)
-				if err == nil && w > 0 {
-					return w, nil
-				}
-			}
-		}
+	if w := parseDCMIPowerReading(string(out)); w > 0 {
+		return w, nil
 	}
 	return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
 }
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -22,18 +22,53 @@ var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)

 func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
 	var b strings.Builder
-	fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
-	fmt.Fprintf(&b, "===========================\n\n")
-	fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
-	fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
-	fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
-	fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
-	fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
-	fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)

+	// ── Header ────────────────────────────────────────────────────────────────
+	b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
+
+	// System identity block
+	if result.ServerModel != "" {
+		fmt.Fprintf(&b, "**Server:** %s  \n", result.ServerModel)
+	}
+	if result.Hostname != "" {
+		fmt.Fprintf(&b, "**Host:** %s  \n", result.Hostname)
+	}
+	// GPU models summary
+	if len(result.GPUs) > 0 {
+		modelCount := make(map[string]int)
+		var modelOrder []string
+		for _, g := range result.GPUs {
+			m := strings.TrimSpace(g.Name)
+			if m == "" {
+				m = "Unknown GPU"
+			}
+			if modelCount[m] == 0 {
+				modelOrder = append(modelOrder, m)
+			}
+			modelCount[m]++
+		}
+		var parts []string
+		for _, m := range modelOrder {
+			if modelCount[m] == 1 {
+				parts = append(parts, m)
+			} else {
+				parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
+			}
+		}
+		fmt.Fprintf(&b, "**GPU(s):** %s  \n", strings.Join(parts, ", "))
+	}
+	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "**App version:** %s  \n", result.BenchmarkVersion)
+	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
+	if result.ParallelGPUs {
+		fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously)  \n")
+	}
+	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
+	b.WriteString("\n")
+
+	// ── Executive Summary ─────────────────────────────────────────────────────
 	if len(result.Findings) > 0 {
-		fmt.Fprintf(&b, "Executive Summary\n")
-		fmt.Fprintf(&b, "-----------------\n")
+		b.WriteString("## Executive Summary\n\n")
 		for _, finding := range result.Findings {
 			fmt.Fprintf(&b, "- %s\n", finding)
 		}
@@ -41,149 +76,206 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 	}

 	if len(result.Warnings) > 0 {
-		fmt.Fprintf(&b, "Warnings\n")
-		fmt.Fprintf(&b, "--------\n")
+		b.WriteString("## Warnings\n\n")
 		for _, warning := range result.Warnings {
 			fmt.Fprintf(&b, "- %s\n", warning)
 		}
 		b.WriteString("\n")
 	}

-	fmt.Fprintf(&b, "Per GPU Scorecard\n")
-	fmt.Fprintf(&b, "-----------------\n")
+	// ── Scorecard table ───────────────────────────────────────────────────────
+	b.WriteString("## Scorecard\n\n")
+	b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
+	b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
 	for _, gpu := range result.GPUs {
-		fmt.Fprintf(&b, "GPU %d  %s\n", gpu.Index, gpu.Name)
-		fmt.Fprintf(&b, "  Status: %s\n", gpu.Status)
-		fmt.Fprintf(&b, "  Composite score: %.2f\n", gpu.Scores.CompositeScore)
-		fmt.Fprintf(&b, "  Compute score: %.2f\n", gpu.Scores.ComputeScore)
-		if gpu.Scores.TOPSPerSMPerGHz > 0 {
-			fmt.Fprintf(&b, "  Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
+		name := strings.TrimSpace(gpu.Name)
+		if name == "" {
+			name = "Unknown"
 		}
-		fmt.Fprintf(&b, "  Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
-		fmt.Fprintf(&b, "  Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
-		fmt.Fprintf(&b, "  Stability: %.1f\n", gpu.Scores.StabilityScore)
+		interconnect := "-"
 		if gpu.Scores.InterconnectScore > 0 {
-			fmt.Fprintf(&b, "  Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
+			interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
 		}
-		if len(gpu.DegradationReasons) > 0 {
-			fmt.Fprintf(&b, "  Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
+		topsPerSM := "-"
+		if gpu.Scores.TOPSPerSMPerGHz > 0 {
+			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
 		}
-		fmt.Fprintf(&b, "  Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
-		fmt.Fprintf(&b, "  P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
-		if len(gpu.PrecisionResults) > 0 {
-			fmt.Fprintf(&b, "  Precision results:\n")
-			for _, precision := range gpu.PrecisionResults {
-				if precision.Supported {
-					fmt.Fprintf(&b, "    - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
-				} else {
-					fmt.Fprintf(&b, "    - %s: unsupported (%s)\n", precision.Name, precision.Notes)
-				}
-			}
+		fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
+			gpu.Index, name,
+			gpu.Status,
+			gpu.Scores.CompositeScore,
+			gpu.Scores.ComputeScore,
+			topsPerSM,
+			gpu.Scores.PowerSustainScore,
+			gpu.Scores.ThermalSustainScore,
+			gpu.Scores.StabilityScore,
+			interconnect,
+		)
+	}
+	b.WriteString("\n")
+
+	// ── Per GPU detail ────────────────────────────────────────────────────────
+	b.WriteString("## Per-GPU Details\n\n")
+	for _, gpu := range result.GPUs {
+		name := strings.TrimSpace(gpu.Name)
+		if name == "" {
+			name = "Unknown GPU"
 		}
-		fmt.Fprintf(&b, "  Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
-		if len(gpu.Notes) > 0 {
-			fmt.Fprintf(&b, "  Notes:\n")
-			for _, note := range gpu.Notes {
-				fmt.Fprintf(&b, "    - %s\n", note)
-			}
+		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
+
+		// Identity
+		if gpu.BusID != "" {
+			fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
+		}
+		if gpu.VBIOS != "" {
+			fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
+		}
+		if gpu.ComputeCapability != "" {
+			fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
+		}
+		if gpu.MultiprocessorCount > 0 {
+			fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
+		}
+		if gpu.PowerLimitW > 0 {
+			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
+		}
+		if gpu.LockedGraphicsClockMHz > 0 {
+			fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
 		}
 		b.WriteString("\n")
+
+		// Steady-state telemetry
+		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
+		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
+		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
+		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
+		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
+		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
+		b.WriteString("\n")
+
+		// Throttle
+		throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
+		if throttle != "none" {
+			fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
+		}
+
+		// Precision results
+		if len(gpu.PrecisionResults) > 0 {
+			b.WriteString("**Precision results:**\n\n")
+			b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
+			for _, p := range gpu.PrecisionResults {
+				if p.Supported {
+					fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
+				} else {
+					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
+				}
+			}
+			b.WriteString("\n")
+		}
+
+		// Degradation / Notes
+		if len(gpu.DegradationReasons) > 0 {
+			fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
+		}
+		if len(gpu.Notes) > 0 {
+			b.WriteString("**Notes:**\n\n")
+			for _, note := range gpu.Notes {
+				fmt.Fprintf(&b, "- %s\n", note)
+			}
+			b.WriteString("\n")
+		}
 	}

+	// ── Interconnect ──────────────────────────────────────────────────────────
 	if result.Interconnect != nil {
-		fmt.Fprintf(&b, "Interconnect\n")
-		fmt.Fprintf(&b, "------------\n")
-		fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
+		b.WriteString("## Interconnect (NCCL)\n\n")
+		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
 		if result.Interconnect.Supported {
-			fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
-			fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
+			fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
+			fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString("\n")
 		}
 		for _, note := range result.Interconnect.Notes {
 			fmt.Fprintf(&b, "- %s\n", note)
 		}
-		b.WriteString("\n")
+		if len(result.Interconnect.Notes) > 0 {
+			b.WriteString("\n")
+		}
 	}

+	// ── Server Power (IPMI) ───────────────────────────────────────────────────
+	if sp := result.ServerPower; sp != nil {
+		b.WriteString("## Server Power (IPMI)\n\n")
+		if !sp.Available {
+			b.WriteString("IPMI power measurement unavailable.\n\n")
+		} else {
+			b.WriteString("| | Value |\n|---|---|\n")
+			fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
+			fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
+			fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
+			fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
+			if sp.ReportingRatio > 0 {
+				fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
+			}
+			b.WriteString("\n")
+		}
+		for _, note := range sp.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		if len(sp.Notes) > 0 {
+			b.WriteString("\n")
+		}
+	}
+
+	// ── Terminal charts (steady-state only) ───────────────────────────────────
 	if len(charts) > 0 {
-		fmt.Fprintf(&b, "Terminal Charts\n")
-		fmt.Fprintf(&b, "---------------\n")
+		b.WriteString("## Steady-State Charts\n\n")
 		for _, chart := range charts {
 			content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
 			if content == "" {
 				continue
 			}
-			fmt.Fprintf(&b, "%s\n", chart.Title)
-			fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
-			fmt.Fprintf(&b, "%s\n\n", content)
+			fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
 		}
 	}

-	if sp := result.ServerPower; sp != nil {
-		fmt.Fprintf(&b, "Server Power (IPMI)\n")
-		fmt.Fprintf(&b, "-------------------\n")
-		if !sp.Available {
-			fmt.Fprintf(&b, "Unavailable\n")
-		} else {
-			fmt.Fprintf(&b, "  Server idle:         %.0f W\n", sp.IdleW)
-			fmt.Fprintf(&b, "  Server under load:   %.0f W\n", sp.LoadedW)
-			fmt.Fprintf(&b, "  Server delta:        %.0f W\n", sp.DeltaW)
-			fmt.Fprintf(&b, "  GPU reported (sum):  %.0f W\n", sp.GPUReportedSumW)
-			if sp.ReportingRatio > 0 {
-				fmt.Fprintf(&b, "  Reporting ratio:     %.2f  (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
-			}
-		}
-		for _, note := range sp.Notes {
-			fmt.Fprintf(&b, "  Note: %s\n", note)
-		}
-		b.WriteString("\n")
-	}
+	// ── Methodology ───────────────────────────────────────────────────────────
+	b.WriteString("## Methodology\n\n")
+	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
+	b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
+	b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
+	b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")

-	fmt.Fprintf(&b, "Methodology\n")
-	fmt.Fprintf(&b, "-----------\n")
-	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
-	fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
-	fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
-	fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
-
-	fmt.Fprintf(&b, "Raw Files\n")
-	fmt.Fprintf(&b, "---------\n")
-	fmt.Fprintf(&b, "- result.json\n")
-	fmt.Fprintf(&b, "- report.txt\n")
-	fmt.Fprintf(&b, "- summary.txt\n")
-	fmt.Fprintf(&b, "- verbose.log\n")
-	fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
-	fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
-	fmt.Fprintf(&b, "- gpu-*-steady.log\n")
-	fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
-	fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
+	// ── Raw files ─────────────────────────────────────────────────────────────
+	b.WriteString("## Raw Files\n\n")
+	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
+	b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
+	b.WriteString("- `gpu-*-warmup.log`\n")
+	b.WriteString("- `gpu-*-steady.log`\n")
+	b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
+	b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
 	if result.Interconnect != nil {
-		fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
+		b.WriteString("- `nccl-all-reduce.log`\n")
 	}
 	return b.String()
 }

+// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
+// cooldown charts are not useful for human review).
 func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
-	phases := []struct {
-		name  string
-		label string
-	}{
-		{name: "baseline", label: "Baseline"},
-		{name: "steady", label: "Steady State"},
-		{name: "cooldown", label: "Cooldown"},
-	}
 	var charts []benchmarkReportChart
 	for _, idx := range gpuIndices {
-		for _, phase := range phases {
-			path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name))
-			raw, err := os.ReadFile(path)
-			if err != nil || len(raw) == 0 {
-				continue
-			}
-			charts = append(charts, benchmarkReportChart{
-				Title:   fmt.Sprintf("GPU %d %s", idx, phase.label),
-				Content: string(raw),
-			})
+		path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
+		raw, err := os.ReadFile(path)
+		if err != nil || len(raw) == 0 {
+			continue
 		}
+		charts = append(charts, benchmarkReportChart{
+			Title:   fmt.Sprintf("GPU %d — Steady State", idx),
+			Content: string(raw),
+		})
 	}
 	return charts
 }
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -137,8 +137,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
 	for _, needle := range []string{
 		"Executive Summary",
 		"GPU 0 spent measurable time under SW power cap.",
-		"Composite score: 1176.00",
-		"fp16_tensor: 700.00 TOPS",
+		"1176.00",
+		"fp16_tensor",
+		"700.00",
 	} {
 		if !strings.Contains(report, needle) {
 			t.Fatalf("report missing %q\n%s", needle, report)
@@ -164,7 +165,7 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
 	})

 	for _, needle := range []string{
-		"Terminal Charts",
+		"Steady-State Charts",
 		"GPU 0 Steady State",
 		"GPU 0 chart",
 		"42┤───",
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -383,10 +383,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 }

 const (
-	ansiRed    = "\033[31m"
-	ansiBlue   = "\033[34m"
-	ansiGreen  = "\033[32m"
-	ansiYellow = "\033[33m"
+	ansiAmber  = "\033[38;5;214m"
 	ansiReset  = "\033[0m"
 )

@@ -415,10 +412,10 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
 		fn      func(GPUMetricRow) float64
 	}
 	defs := []seriesDef{
-		{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
-		{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
-		{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
-		{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
+		{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
+		{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
+		{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
+		{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
 	}

 	var b strings.Builder
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 			"--seconds", strconv.Itoa(opts.DurationSec),
 			"--size-mb", strconv.Itoa(opts.SizeMB),
 		}
+		if opts.StaggerSeconds > 0 && len(selected) > 1 {
+			cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
+		}
 		if len(selected) > 0 {
 			cmd = append(cmd, "--devices", joinIndexList(selected))
 		}
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 			"bee-john-gpu-stress",
 			"--seconds", strconv.Itoa(opts.DurationSec),
 		}
+		if opts.StaggerSeconds > 0 && len(selected) > 1 {
+			cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
+		}
 		if len(selected) > 0 {
 			cmd = append(cmd, "--devices", joinIndexList(selected))
 		}
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -384,25 +384,39 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 	), logFunc)
 }

-func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	selected, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
-	profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
-	if err != nil {
-		return "", err
+	var (
+		profCmd []string
+		profEnv []string
+	)
+	if staggerSec > 0 && len(selected) > 1 {
+		profCmd = []string{
+			"bee-dcgmproftester-staggered",
+			"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
+			"--stagger-seconds", strconv.Itoa(staggerSec),
+			"--devices", joinIndexList(selected),
+		}
+	} else {
+		profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
+		if err != nil {
+			return "", err
+		}
+		profEnv = nvidiaVisibleDevicesEnv(selected)
 	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
-		satJob{
-			name:       "03-dcgmproftester.log",
-			cmd:        profCmd,
-			env:        nvidiaVisibleDevicesEnv(selected),
-			collectGPU: true,
-			gpuIndices: selected,
-		},
+		satJob{
+			name:       "03-dcgmproftester.log",
+			cmd:        profCmd,
+			env:        profEnv,
+			collectGPU: true,
+			gpuIndices: selected,
+		},
 		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
 	), logFunc)
 }
@@ -531,9 +545,13 @@ func memoryStressSizeArg() string {
 	return fmt.Sprintf("%dM", targetMB)
 }

-func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-	sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
-	passes := envInt("BEE_MEMTESTER_PASSES", 1)
+func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
+	if sizeMB <= 0 {
+		sizeMB = 256
+	}
+	if passes <= 0 {
+		passes = 1
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
@@ -590,7 +608,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
 	}, logFunc)
 }

-func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
 	if baseDir == "" {
 		baseDir = "/var/log/bee-sat"
 	}
@@ -622,7 +640,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
 			break
 		}
 		prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
-		commands := storageSATCommands(devPath)
+		commands := storageSATCommands(devPath, extended)
 		for cmdIndex, job := range commands {
 			if ctx.Err() != nil {
 				break
@@ -1086,17 +1104,25 @@ func listStorageDevices() ([]string, error) {
 	return parseStorageDevices(string(out)), nil
 }

-func storageSATCommands(devPath string) []satJob {
+func storageSATCommands(devPath string, extended bool) []satJob {
 	if strings.Contains(filepath.Base(devPath), "nvme") {
+		selfTestLevel := "1"
+		if extended {
+			selfTestLevel = "2"
+		}
 		return []satJob{
 			{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
 			{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
-			{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
+			{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
 		}
 	}
+	smartTestType := "short"
+	if extended {
+		smartTestType = "long"
+	}
 	return []satJob{
 		{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
-		{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
+		{name: "smartctl-self-test-" + smartTestType, cmd: []string{"smartctl", "-t", smartTestType, devPath}},
 	}
 }

--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -20,7 +20,7 @@ type FanStressOptions struct {
 	Phase1DurSec int   // first load phase duration in seconds (default 300)
 	PauseSec     int   // pause between the two load phases (default 60)
 	Phase2DurSec int   // second load phase duration in seconds (default 300)
-	SizeMB       int   // GPU memory to allocate per GPU during stress (default 64)
+	SizeMB       int   // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
 	GPUIndices   []int // which GPU indices to stress (empty = all detected)
 }

@@ -243,9 +243,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
 	if opts.Phase2DurSec <= 0 {
 		opts.Phase2DurSec = 300
 	}
-	if opts.SizeMB <= 0 {
-		opts.SizeMB = 64
-	}
+	// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
+	// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
 }

 // sampleFanStressRow collects all metrics for one telemetry sample.
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -14,12 +14,12 @@ import (
 func TestStorageSATCommands(t *testing.T) {
 	t.Parallel()

-	nvme := storageSATCommands("/dev/nvme0n1")
+	nvme := storageSATCommands("/dev/nvme0n1", false)
 	if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
 		t.Fatalf("unexpected nvme commands: %#v", nvme)
 	}

-	sata := storageSATCommands("/dev/sda")
+	sata := storageSATCommands("/dev/sda", false)
 	if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
 		t.Fatalf("unexpected sata commands: %#v", sata)
 	}
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
 	Loader            string
 	GPUIndices        []int
 	ExcludeGPUIndices []int
+	StaggerSeconds    int
 }

 func New() *System {
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -222,7 +222,21 @@ func formatSplitTaskName(baseName, selectionLabel string) string {
 }

 func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
-	if !shouldSplitHomogeneousNvidiaTarget(target) {
+	if !shouldSplitHomogeneousNvidiaTarget(target) || params.ParallelGPUs {
+		// Parallel mode (or non-splittable target): one task for all selected GPUs.
+		if params.ParallelGPUs && shouldSplitHomogeneousNvidiaTarget(target) {
+			// Resolve the selected GPU indices so ExcludeGPUIndices is applied.
+			gpus, err := apiListNvidiaGPUs(appRef)
+			if err != nil {
+				return nil, err
+			}
+			resolved, err := expandSelectedGPUIndices(gpus, params.GPUIndices, params.ExcludeGPUIndices)
+			if err != nil {
+				return nil, err
+			}
+			params.GPUIndices = resolved
+			params.ExcludeGPUIndices = nil
+		}
 		t := &Task{
 			ID:        newJobID(idPrefix),
 			Name:      baseName,
@@ -262,6 +276,53 @@ func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params
 	return tasks, nil
 }

+// expandSelectedGPUIndices returns the sorted GPU indices left after the include/exclude filters,
+// without splitting by model; unknown or repeated include entries are dropped, an empty result is an error.
+func expandSelectedGPUIndices(gpus []platform.NvidiaGPU, include, exclude []int) ([]int, error) {
+	indexed := make(map[int]struct{}, len(gpus))
+	allIndices := make([]int, 0, len(gpus))
+	for _, gpu := range gpus {
+		indexed[gpu.Index] = struct{}{}
+		allIndices = append(allIndices, gpu.Index)
+	}
+	sort.Ints(allIndices)
+
+	selected := allIndices
+	if len(include) > 0 {
+		selected = make([]int, 0, len(include))
+		seen := make(map[int]struct{}, len(include))
+		for _, idx := range include {
+			if _, ok := indexed[idx]; !ok {
+				continue
+			}
+			if _, dup := seen[idx]; dup {
+				continue
+			}
+			seen[idx] = struct{}{}
+			selected = append(selected, idx)
+		}
+		sort.Ints(selected)
+	}
+	if len(exclude) > 0 {
+		skip := make(map[int]struct{}, len(exclude))
+		for _, idx := range exclude {
+			skip[idx] = struct{}{}
+		}
+		filtered := selected[:0]
+		for _, idx := range selected {
+			if _, ok := skip[idx]; ok {
+				continue
+			}
+			filtered = append(filtered, idx)
+		}
+		selected = filtered
+	}
+	if len(selected) == 0 {
+		return nil, fmt.Errorf("no NVIDIA GPUs selected")
+	}
+	return selected, nil
+}
+
 // ── SSE helpers ───────────────────────────────────────────────────────────────

 func sseWrite(w http.ResponseWriter, event, data string) bool {
@@ -421,12 +482,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 			return
 		}

-		var body struct {
-			Duration           int      `json:"duration"`
-			DiagLevel          int      `json:"diag_level"`
-			GPUIndices         []int    `json:"gpu_indices"`
-			ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
-			Loader             string   `json:"loader"`
+		var body struct {
+			Duration           int      `json:"duration"`
+			StressMode         bool     `json:"stress_mode"`
+			GPUIndices         []int    `json:"gpu_indices"`
+			ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
+			StaggerGPUStart    bool     `json:"stagger_gpu_start"`
+			Loader             string   `json:"loader"`
 			Profile            string   `json:"profile"`
 			DisplayName        string   `json:"display_name"`
 			PlatformComponents []string `json:"platform_components"`
@@ -442,12 +504,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 		if strings.TrimSpace(body.DisplayName) != "" {
 			name = body.DisplayName
 		}
-		params := taskParams{
-			Duration:           body.Duration,
-			DiagLevel:          body.DiagLevel,
-			GPUIndices:         body.GPUIndices,
-			ExcludeGPUIndices:  body.ExcludeGPUIndices,
-			Loader:             body.Loader,
+		params := taskParams{
+			Duration:           body.Duration,
+			StressMode:         body.StressMode,
+			GPUIndices:         body.GPUIndices,
+			ExcludeGPUIndices:  body.ExcludeGPUIndices,
+			StaggerGPUStart:    body.StaggerGPUStart,
+			Loader:             body.Loader,
 			BurnProfile:        body.Profile,
 			DisplayName:        body.DisplayName,
 			PlatformComponents: body.PlatformComponents,
@@ -1315,107 +1378,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
 	return nil
 }

-// ── Display / Screen Resolution ───────────────────────────────────────────────
-
-type displayMode struct {
-	Output  string `json:"output"`
-	Mode    string `json:"mode"`
-	Current bool   `json:"current"`
-}
-
-type displayInfo struct {
-	Output  string        `json:"output"`
-	Modes   []displayMode `json:"modes"`
-	Current string        `json:"current"`
-}
-
-var xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
-var xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
-var xrandrCurrentRE = regexp.MustCompile(`\*`)
-
-func parseXrandrOutput(out string) []displayInfo {
-	var infos []displayInfo
-	var cur *displayInfo
-	for _, line := range strings.Split(out, "\n") {
-		if m := xrandrOutputRE.FindStringSubmatch(line); m != nil {
-			if cur != nil {
-				infos = append(infos, *cur)
-			}
-			cur = &displayInfo{Output: m[1]}
-			continue
-		}
-		if cur == nil {
-			continue
-		}
-		if m := xrandrModeRE.FindStringSubmatch(line); m != nil {
-			isCurrent := xrandrCurrentRE.MatchString(line)
-			mode := displayMode{Output: cur.Output, Mode: m[1], Current: isCurrent}
-			cur.Modes = append(cur.Modes, mode)
-			if isCurrent {
-				cur.Current = m[1]
-			}
-		}
-	}
-	if cur != nil {
-		infos = append(infos, *cur)
-	}
-	return infos
-}
-
-func xrandrCommand(args ...string) *exec.Cmd {
-	cmd := exec.Command("xrandr", args...)
-	env := append([]string{}, os.Environ()...)
-	hasDisplay := false
-	hasXAuthority := false
-	for _, kv := range env {
-		if strings.HasPrefix(kv, "DISPLAY=") && strings.TrimPrefix(kv, "DISPLAY=") != "" {
-			hasDisplay = true
-		}
-		if strings.HasPrefix(kv, "XAUTHORITY=") && strings.TrimPrefix(kv, "XAUTHORITY=") != "" {
-			hasXAuthority = true
-		}
-	}
-	if !hasDisplay {
-		env = append(env, "DISPLAY=:0")
-	}
-	if !hasXAuthority {
-		env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
-	}
-	cmd.Env = env
-	return cmd
-}
-
-func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
-	out, err := xrandrCommand().Output()
-	if err != nil {
-		writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
-		return
-	}
-	writeJSON(w, parseXrandrOutput(string(out)))
-}
-
-func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
-	var req struct {
-		Output string `json:"output"`
-		Mode   string `json:"mode"`
-	}
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
-		writeError(w, http.StatusBadRequest, "output and mode are required")
-		return
-	}
-	// Validate mode looks like WxH to prevent injection
-	if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
-		writeError(w, http.StatusBadRequest, "invalid mode format")
-		return
-	}
-	// Validate output name (no special chars)
-	if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
-		writeError(w, http.StatusBadRequest, "invalid output name")
-		return
-	}
-	if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
-		writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
-		return
-	}
-	writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
-}
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -10,30 +10,6 @@ import (
 	"bee/audit/internal/platform"
 )

-func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
-	t.Setenv("DISPLAY", "")
-	t.Setenv("XAUTHORITY", "")
-
-	cmd := xrandrCommand("--query")
-
-	var hasDisplay bool
-	var hasXAuthority bool
-	for _, kv := range cmd.Env {
-		if kv == "DISPLAY=:0" {
-			hasDisplay = true
-		}
-		if kv == "XAUTHORITY=/home/bee/.Xauthority" {
-			hasXAuthority = true
-		}
-	}
-	if !hasDisplay {
-		t.Fatalf("DISPLAY not injected: %v", cmd.Env)
-	}
-	if !hasXAuthority {
-		t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
-	}
-}
-
 func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1031,42 +1031,45 @@ func renderValidate(opts HandlerOptions) string {
 	return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

-<div class="card" style="margin-bottom:16px">
-  <div class="card-head">Validate Profile</div>
-  <div class="card-body validate-profile-body">
-    <div class="validate-profile-col">
-      <div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
-      <div class="form-row" style="margin:12px 0 0"><label>Diag level</label><select id="sat-profile-nvidia-level" style="width:100%"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>
-    </div>
-    <div class="validate-profile-col validate-profile-action">
-      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count. NVIDIA <code>dcgmi diag</code> uses the selected diag level from this profile.</p>
-      <button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
-    </div>
-    <div class="validate-profile-col"></div>
-  </div>
-  <div class="card-body" style="padding-top:0;display:flex;justify-content:center">
-    <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
-  </div>
-</div>
+	<div class="card" style="margin-bottom:16px">
+	  <div class="card-head">Validate Profile</div>
+	  <div class="card-body validate-profile-body">
+	    <div class="validate-profile-col">
+	      <div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
+	    </div>
+	    <div class="validate-profile-col">
+	      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
+	    </div>
+	    <div class="validate-profile-col validate-profile-action">
+	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
+	      <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
+	      <div style="margin-top:12px">
+	        <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
+	      </div>
+	    </div>
+	  </div>
+	</div>

 <div class="grid3">
 ` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
 		inv.CPU,
 		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
 		`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
-		`Duration is taken from Validate Profile diag level: Level 1 = 60s, Level 2 = 5m, Level 3 = 1h, Level 4 = 1h.`,
+		`60s in Validate, 30 min in Stress.`,
 	)) +
 		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
 			inv.Memory,
-			`Runs a short RAM validation pass and records memory state around the test.`,
+			`Runs a RAM validation pass and records memory state around the test.`,
 			`<code>free</code>, <code>memtester</code>`,
-			`No extra settings.`,
+			`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
 		)) +
 		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
 			inv.Storage,
 			`Scans all storage devices and runs the matching health or self-test path for each device type.`,
 			`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
-			`No extra settings.`,
+			`Short self-test in Validate, extended self-test in Stress.`,
 		)) +
 		`</div>
 <div style="height:1px;background:var(--border);margin:16px 0"></div>
@@ -1083,6 +1086,12 @@ func renderValidate(opts HandlerOptions) string {
      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
    </div>
    <p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
+    <div style="margin-top:10px;padding-top:10px;border-top:1px solid var(--border)">
+      <label class="sat-gpu-row" title="When checked, multi-GPU tests (PSU Pulse, NCCL, NVBandwidth) run on ALL GPUs in the system regardless of the selection above.">
+        <input type="checkbox" id="sat-multi-gpu-all" checked onchange="satUpdateGPUSelectionNote()">
+        <span><strong>Multi-GPU tests</strong> — use all GPUs <span style="font-size:11px;color:var(--muted)">(PSU Pulse, NCCL, NVBandwidth)</span></span>
+      </label>
+    </div>
  </div>
 </div>

@@ -1091,14 +1100,48 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Runs NVIDIA diagnostics and board inventory checks.`,
 			`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
-			`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
+			`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
 		)) +
+		`<div id="sat-card-nvidia-targeted-stress">` +
 		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
 			inv.NVIDIA,
-			`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
+			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
 			`<code>dcgmi diag targeted_stress</code>`,
-			`Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
+			`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-targeted-power">` +
+		renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
+			`<code>dcgmi diag targeted_power</code>`,
+			`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-pulse">` +
+		renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
+			`<code>dcgmi diag pulse_test</code>`,
+			`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-interconnect">` +
+		renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
+			`<code>all_reduce_perf</code> (NCCL tests)`,
+			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-bandwidth">` +
+		renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
+			`<code>nvbandwidth</code>`,
+			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
 		`</div>
 <div class="grid3" style="margin-top:16px">
 ` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
@@ -1125,17 +1168,28 @@ func renderValidate(opts HandlerOptions) string {
 </style>
 <script>
 let satES = null;
-function satDiagLevel() {
-  return parseInt(document.getElementById('sat-profile-nvidia-level').value) || 1;
+function satStressMode() {
+  return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
 }
-function satCPUDurationFromDiagLevel() {
-  const level = satDiagLevel();
-  if (level === 1) return 60;
-  if (level === 2) return 5 * 60;
-  return 60 * 60;
+function satModeChanged() {
+  const stress = satStressMode();
+  [
+    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
+    {card: 'sat-card-nvidia-targeted-power',  hint: 'sat-tp-mode-hint'},
+    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
+    {card: 'sat-card-nvidia-interconnect',    hint: 'sat-ni-mode-hint'},
+    {card: 'sat-card-nvidia-bandwidth',       hint: 'sat-nb-mode-hint'},
+  ].forEach(function(item) {
+    const card = document.getElementById(item.card);
+    if (card) {
+      card.style.opacity = stress ? '1' : '0.5';
+      const hint = document.getElementById(item.hint);
+      if (hint) hint.style.display = stress ? 'none' : '';
+    }
+  });
 }
 function satLabels() {
-  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
 }
 let satNvidiaGPUsPromise = null;
 function loadSatNvidiaGPUs() {
@@ -1156,6 +1210,10 @@ function satSelectedGPUIndices() {
    .filter(function(v) { return !Number.isNaN(v); })
    .sort(function(a, b) { return a - b; });
 }
+function satMultiGPUAll() {
+  const cb = document.getElementById('sat-multi-gpu-all');
+  return cb ? cb.checked : true;
+}
 function satUpdateGPUSelectionNote() {
  const note = document.getElementById('sat-gpu-selection-note');
  if (!note) return;
@@ -1164,7 +1222,8 @@ function satUpdateGPUSelectionNote() {
    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
    return;
  }
-  note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '.';
+  const multiAll = satMultiGPUAll();
+  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests: ' + (multiAll ? 'all GPUs in system' : 'selected GPUs only') + '.';
 }
 function satRenderGPUList(gpus) {
  const root = document.getElementById('sat-gpu-list');
@@ -1211,9 +1270,8 @@ function satRequestBody(target, overrides) {
  const body = {};
  const labels = satLabels();
  body.display_name = labels[target] || ('Validate ' + target);
-  if (target === 'nvidia') body.diag_level = satDiagLevel();
-  if (target === 'nvidia-targeted-stress') body.duration = 300;
-  if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
+  body.stress_mode = satStressMode();
+  if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
  if (overrides) {
    Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
  }
@@ -1275,8 +1333,28 @@ function runSATWithOverrides(target, overrides) {
  return enqueueSATTarget(target, overrides)
    .then(d => streamSATTask(d.task_id, title, false));
 }
+const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
+// pulse_test and fabric tests run on all selected GPUs simultaneously
+const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+function satAllGPUIndicesForMulti() {
+  // If "Multi-GPU tests — all GPUs" is checked, return all detected GPUs.
+  // Otherwise fall back to the per-GPU selection.
+  if (satMultiGPUAll()) {
+    return loadSatNvidiaGPUs().then(function(gpus) {
+      return gpus.map(function(g) { return Number(g.index); });
+    });
+  }
+  const sel = satSelectedGPUIndices();
+  return Promise.resolve(sel);
+}
 function expandSATTarget(target) {
-  if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
+  if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
+    return satAllGPUIndicesForMulti().then(function(indices) {
+      if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
+      return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
+    });
+  }
+  if (nvidiaPerGPUTargets.indexOf(target) < 0) {
    return Promise.resolve([{target: target}]);
  }
  const selected = satSelectedGPUIndices();
@@ -1292,6 +1370,12 @@ function expandSATTarget(target) {
    label: satGPUDisplayName(gpu)
  })));
 }
+function runNvidiaFabricValidate(target) {
+  satAllGPUIndicesForMulti().then(function(indices) {
+    if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
+    runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
+  });
+}
 function runNvidiaValidateSet(target) {
  return loadSatNvidiaGPUs().then(gpus => {
    const selected = satSelectedGPUIndices();
@@ -1354,8 +1438,10 @@ function runAllSAT() {
  const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const baseTargets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth', 'hpl'];
+  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','hpl','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
+    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
    const btn = document.getElementById('sat-btn-' + target);
    return !(btn && btn.disabled);
  });
@@ -1390,6 +1476,10 @@ function runAllSAT() {
 fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
    if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
    if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
    if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
    if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
 });
@@ -1583,10 +1673,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
 // ── Benchmark ─────────────────────────────────────────────────────────────────

 type benchmarkHistoryColumn struct {
-	key   string
-	label string
-	name  string
-	index int
+	key      string
+	label    string
+	name     string
+	index    int
+	parallel bool
 }

 type benchmarkHistoryCell struct {
@@ -1894,29 +1985,43 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 			cells:       make(map[string]benchmarkHistoryCell),
 		}

-		// Count how many GPUs of each model appear in this run (for the label).
-		gpuModelCount := make(map[string]int)
-		for _, gpu := range result.GPUs {
-			gpuModelCount[strings.TrimSpace(gpu.Name)]++
-		}
-
-		// Track best composite score per column key within this run.
-		runBest := make(map[string]float64)
-		for _, gpu := range result.GPUs {
-			key := benchmarkHistoryColumnKey(result.ServerModel, gpu.Name)
-			count := gpuModelCount[strings.TrimSpace(gpu.Name)]
-			columnByKey[key] = benchmarkHistoryColumn{
-				key:   key,
-				label: benchmarkHistoryColumnLabel(result.ServerModel, gpu.Name, count),
-				name:  strings.TrimSpace(gpu.Name),
-				index: gpu.Index,
+		if result.ParallelGPUs {
+			// All GPUs ran simultaneously — one column per server, score = avg composite.
+			gpuModelCount := make(map[string]int)
+			for _, gpu := range result.GPUs {
+				gpuModelCount[strings.TrimSpace(gpu.Name)]++
 			}
-			if gpu.Scores.CompositeScore > runBest[key] {
-				runBest[key] = gpu.Scores.CompositeScore
+			scoreSum := make(map[string]float64)
+			scoreCnt := make(map[string]int)
+			for _, gpu := range result.GPUs {
+				key := "parallel|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name)
+				scoreSum[key] += gpu.Scores.CompositeScore
+				scoreCnt[key]++
+				count := gpuModelCount[strings.TrimSpace(gpu.Name)]
+				columnByKey[key] = benchmarkHistoryColumn{
+					key:      key,
+					label:    benchmarkHistoryParallelLabel(result.ServerModel, gpu.Name, count),
+					name:     strings.TrimSpace(gpu.Name),
+					index:    -1,
+					parallel: true,
+				}
+			}
+			for key, sum := range scoreSum {
+				run.cells[key] = benchmarkHistoryCell{score: sum / float64(scoreCnt[key]), present: true}
+			}
+		} else {
+			// Each GPU ran independently — one column per GPU index.
+			for _, gpu := range result.GPUs {
+				key := "gpu|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name) + "|" + strconv.Itoa(gpu.Index)
+				columnByKey[key] = benchmarkHistoryColumn{
+					key:      key,
+					label:    benchmarkHistoryPerGPULabel(gpu.Name, gpu.Index),
+					name:     strings.TrimSpace(gpu.Name),
+					index:    gpu.Index,
+					parallel: false,
+				}
+				run.cells[key] = benchmarkHistoryCell{score: gpu.Scores.CompositeScore, present: true}
 			}
-		}
-		for key, score := range runBest {
-			run.cells[key] = benchmarkHistoryCell{score: score, present: true}
 		}
 		runs = append(runs, run)
 	}
@@ -1925,13 +2030,24 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 	for _, col := range columnByKey {
 		columns = append(columns, col)
 	}
+	// Sequential GPU columns first (sorted by GPU index), then parallel server columns.
 	sort.Slice(columns, func(i, j int) bool {
-		li := strings.ToLower(columns[i].label)
-		lj := strings.ToLower(columns[j].label)
-		if li != lj {
-			return li < lj
+		if columns[i].parallel != columns[j].parallel {
+			return !columns[i].parallel // sequential first
 		}
-		return columns[i].key < columns[j].key
+		if columns[i].parallel {
+			li := strings.ToLower(columns[i].label)
+			lj := strings.ToLower(columns[j].label)
+			if li != lj {
+				return li < lj
+			}
+			return columns[i].key < columns[j].key
+		}
+		// Sequential: sort by GPU index, then name.
+		if columns[i].index != columns[j].index {
+			return columns[i].index < columns[j].index
+		}
+		return strings.ToLower(columns[i].name) < strings.ToLower(columns[j].name)
 	})
 	sort.Slice(runs, func(i, j int) bool {
 		return runs[i].generatedAt.After(runs[j].generatedAt)
@@ -1939,32 +2055,35 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 	return columns, runs
 }

-// benchmarkHistoryColumnKey groups results by server model + GPU model so that
-// runs on the same hardware produce one column regardless of individual GPU index.
-func benchmarkHistoryColumnKey(serverModel, gpuName string) string {
-	return strings.TrimSpace(serverModel) + "|" + strings.TrimSpace(gpuName)
+// benchmarkHistoryPerGPULabel formats a label for a single-GPU column: "GPU #N — ModelName".
+func benchmarkHistoryPerGPULabel(gpuName string, index int) string {
+	gpuName = strings.TrimSpace(gpuName)
+	if gpuName == "" {
+		gpuName = "Unknown GPU"
+	}
+	return fmt.Sprintf("GPU #%d — %s", index, gpuName)
 }

-// benchmarkHistoryColumnLabel formats the column header as
-// "Server Model (N× GPU Model)" or "GPU Model" when server info is missing.
-func benchmarkHistoryColumnLabel(serverModel, gpuName string, count int) string {
+// benchmarkHistoryParallelLabel formats a label for an all-GPU parallel column:
+// "ServerModel — N× ModelName (All GPUs)" or "N× ModelName (All GPUs)" if no server.
+func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) string {
 	serverModel = strings.TrimSpace(serverModel)
 	gpuName = strings.TrimSpace(gpuName)
 	if gpuName == "" {
 		gpuName = "Unknown GPU"
 	}
-	gpuPart := fmt.Sprintf("%d× %s", count, gpuName)
+	gpuPart := fmt.Sprintf("%d× %s (All GPUs)", count, gpuName)
 	if serverModel == "" {
 		return gpuPart
 	}
-	return fmt.Sprintf("%s (%s)", serverModel, gpuPart)
+	return fmt.Sprintf("%s — %s", serverModel, gpuPart)
 }

 // ── Burn ──────────────────────────────────────────────────────────────────────

 func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
-<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics and ` + "targeted_stress" + ` remain in <a href="/validate">Validate</a>. Burn exposes official NVIDIA load recipes by test goal plus separate custom stress tools.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

 <div class="card" style="margin-bottom:16px">
@@ -1977,11 +2096,11 @@ func renderBurn() string {
      <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
    </div>
    <div class="burn-profile-col burn-profile-action">
-      <button class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
+      <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
      <p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
    </div>
    <div class="burn-profile-col burn-profile-action">
-      <button class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
+      <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
      <p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
    </div>
  </div>
@@ -1998,12 +2117,16 @@ func renderBurn() string {
      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
    </div>
-    <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
-      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
-    </div>
-    <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
-  </div>
-</div>
+	    <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+	      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+	    </div>
+	    <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
+	    <label class="cb-row" style="margin-top:10px">
+	      <input type="checkbox" id="burn-stagger-nvidia">
+	      <span>Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.</span>
+	    </label>
+	  </div>
+	</div>

 <div class="burn-section">Core Burn Paths</div>
 <div class="grid2 burn-grid" style="margin-bottom:16px">
@@ -2029,27 +2152,6 @@ func renderBurn() string {
 </div>
 </div>

-<div class="burn-section">GPU-Specific Tests</div>
-<div class="grid2 burn-grid" style="margin-bottom:16px">
-<div class="card burn-card">
-  <div class="card-head card-head-actions"><span>Power Delivery / Power Budget</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true}])">Run</button></div>
-  <div class="card-body burn-card-body">
-    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA power-oriented recipes. ` + "targeted_power" + ` checks sustained delivery; ` + "pulse_test" + ` checks transient behavior.</p>
-    <label class="cb-row"><input type="checkbox" id="burn-nvidia-power" disabled><span>NVIDIA Targeted Power (dcgmi diag targeted_power) <span class="cb-note" id="note-nvidia-power"></span></span></label>
-    <label class="cb-row"><input type="checkbox" id="burn-nvidia-pulse" disabled><span>NVIDIA Pulse Test (dcgmi diag pulse_test) <span class="cb-note" id="note-nvidia-pulse"></span></span></label>
-  </div>
-</div>
-
-<div class="card burn-card">
-  <div class="card-head card-head-actions"><span>Interconnect / Bandwidth</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true}])">Run</button></div>
-  <div class="card-body burn-card-body">
-    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA fabric paths. NCCL is interconnect-only and is not a compute burn. NVBandwidth validates copy and bandwidth paths.</p>
-    <label class="cb-row"><input type="checkbox" id="burn-nvidia-interconnect" disabled><span>NVIDIA Interconnect Test (NCCL all_reduce_perf) <span class="cb-note" id="note-nvidia-interconnect"></span></span></label>
-    <label class="cb-row"><input type="checkbox" id="burn-nvidia-bandwidth" disabled><span>NVIDIA Bandwidth Test (NVBandwidth) <span class="cb-note" id="note-nvidia-bandwidth"></span></span></label>
-  </div>
-</div>
-</div>
-
 <div id="bi-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Output <span id="bi-title"></span></div>
  <div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
@@ -2098,6 +2200,11 @@ function burnSelectedGPUIndices() {
    .sort(function(a, b) { return a - b; });
 }

+function burnUseNvidiaRampUp() {
+  const el = document.getElementById('burn-stagger-nvidia');
+  return !!(el && el.checked);
+}
+
 function burnUpdateSelectionNote() {
  const note = document.getElementById('burn-selection-note');
  const selected = burnSelectedGPUIndices();
@@ -2157,6 +2264,9 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
    }
    body.gpu_indices = selected;
+    if (burnUseNvidiaRampUp() && selected.length > 1) {
+      body.stagger_gpu_start = true;
+    }
  }
  return fetch('/api/sat/' + target + '/run', {
    method: 'POST',
@@ -2299,10 +2409,6 @@ function runAllBurnTasks() {
  const status = document.getElementById('burn-all-status');
  const all = [
    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
-    {id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},
-    {id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true},
-    {id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},
-    {id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true},
    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
    {id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
    {id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
@@ -2317,10 +2423,6 @@ function runAllBurnTasks() {
 fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
  const map = {
    'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
-    'nvidia-targeted-power': {cb:'burn-nvidia-power', note:'note-nvidia-power', reason:'dcgmi not available or NVIDIA driver not running'},
-    'nvidia-pulse': {cb:'burn-nvidia-pulse', note:'note-nvidia-pulse', reason:'dcgmi not available or NVIDIA driver not running'},
-    'nvidia-interconnect': {cb:'burn-nvidia-interconnect', note:'note-nvidia-interconnect', reason:'NCCL interconnect tool not available or NVIDIA driver not running'},
-    'nvidia-bandwidth': {cb:'burn-nvidia-bandwidth', note:'note-nvidia-bandwidth', reason:'nvbandwidth or dcgmi not available or NVIDIA driver not running'},
    'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
    'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
    'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
@@ -2756,55 +2858,6 @@ usbRefresh();
 </script>`
 }

-// ── Display Resolution ────────────────────────────────────────────────────────
-
-func renderDisplayInline() string {
-	return `<div id="display-status" style="color:var(--muted);font-size:13px;margin-bottom:12px">Loading displays...</div>
-<div id="display-controls"></div>
-<script>
-(function(){
-function loadDisplays() {
-  fetch('/api/display/resolutions').then(r=>r.json()).then(displays => {
-    const status = document.getElementById('display-status');
-    const ctrl = document.getElementById('display-controls');
-    if (!displays || displays.length === 0) {
-      status.textContent = 'No connected displays found or xrandr not available.';
-      return;
-    }
-    status.textContent = '';
-    ctrl.innerHTML = displays.map(d => {
-      const opts = (d.modes||[]).map(m =>
-        '<option value="'+m.mode+'"'+(m.current?' selected':'')+'>'+m.mode+(m.current?' (current)':'')+'</option>'
-      ).join('');
-      return '<div style="margin-bottom:12px">'
-        +'<span style="font-weight:600;margin-right:8px">'+d.output+'</span>'
-        +'<span style="color:var(--muted);font-size:12px;margin-right:12px">Current: '+d.current+'</span>'
-        +'<select id="res-sel-'+d.output+'" style="margin-right:8px">'+opts+'</select>'
-        +'<button class="btn btn-sm btn-primary" onclick="applyResolution(\''+d.output+'\')">Apply</button>'
-        +'</div>';
-    }).join('');
-  }).catch(()=>{
-    document.getElementById('display-status').textContent = 'xrandr not available on this system.';
-  });
-}
-window.applyResolution = function(output) {
-  const sel = document.getElementById('res-sel-'+output);
-  if (!sel) return;
-  const mode = sel.value;
-  const btn = sel.nextElementSibling;
-  btn.disabled = true;
-  btn.textContent = 'Applying...';
-  fetch('/api/display/set', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({output:output,mode:mode})})
-    .then(r=>r.json()).then(d=>{
-      if (d.error) { alert('Error: '+d.error); }
-      loadDisplays();
-    }).catch(e=>{ alert('Error: '+e); })
-    .finally(()=>{ btn.disabled=false; btn.textContent='Apply'; });
-};
-loadDisplays();
-})();
-</script>`
-}

 func renderNvidiaSelfHealInline() string {
 	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
@@ -2993,8 +3046,6 @@ function installToRAM() {
 <div class="card"><div class="card-head">Services</div><div class="card-body">` +
 		renderServicesInline() + `</div></div>

-<div class="card"><div class="card-head">Display Resolution</div><div class="card-body">` +
-		renderDisplayInline() + `</div></div>

 <script>
 function checkTools() {
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -295,10 +295,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// Tools
 	mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)

-	// Display
-	mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
-	mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
-
 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
 	mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -693,8 +693,8 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
 	for _, needle := range []string{
 		`Benchmark Results`,
 		`Composite score by saved benchmark run and GPU.`,
-		`NVIDIA H100 PCIe / GPU 0`,
-		`NVIDIA H100 PCIe / GPU 1`,
+		`GPU #0 — NVIDIA H100 PCIe`,
+		`GPU #1 — NVIDIA H100 PCIe`,
 		`#1`,
 		wantTime,
 		`1176.25`,
@@ -741,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	for _, needle := range []string{
 		`NVIDIA Max Compute Load`,
 		`dcgmproftester`,
-		`targeted_stress remain in <a href="/validate">Validate</a>`,
-		`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
+		`NCCL`,
+		`Validate → Stress mode`,
 		`id="burn-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -115,10 +115,12 @@ type Task struct {
 // taskParams holds optional parameters parsed from the run request.
 type taskParams struct {
 	Duration           int      `json:"duration,omitempty"`
-	DiagLevel          int      `json:"diag_level,omitempty"`
+	StressMode         bool     `json:"stress_mode,omitempty"`
 	GPUIndices         []int    `json:"gpu_indices,omitempty"`
 	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
+	StaggerGPUStart    bool     `json:"stagger_gpu_start,omitempty"`
 	SizeMB             int      `json:"size_mb,omitempty"`
+	Passes             int      `json:"passes,omitempty"`
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
@@ -161,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
 	}
 }

+func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
+	if enabled && len(selected) > 1 {
+		return 180
+	}
+	return 0
+}
+
 func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
 	acceptanceCycles := []platform.PlatformStressCycle{
 		{LoadSec: 85, IdleSec: 5},
@@ -215,11 +224,11 @@ var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
 const maxTaskHistory = 50

 var (
-	runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-		return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
+	runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
+		return a.RunMemoryAcceptancePackCtx(ctx, baseDir, sizeMB, passes, logFunc)
 	}
-	runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-		return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
+	runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
+		return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
 	}
 	runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
 		return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
@@ -552,7 +561,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		diagLevel := t.params.DiagLevel
+		diagLevel := 2
+		if t.params.StressMode {
+			diagLevel = 3
+		}
 		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
 			result, e := a.RunNvidiaAcceptancePackWithOptions(
 				ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -588,7 +600,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			RunNCCL:           t.params.RunNCCL,
 			ParallelGPUs:      t.params.ParallelGPUs,
 		}, j.append)
-	case "nvidia-compute":
+		case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
@@ -597,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
-		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
+			staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
+			if staggerSec > 0 {
+				j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
+			}
+			archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
 	case "nvidia-targeted-power":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -647,24 +663,29 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
-		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
-			DurationSec:       dur,
-			Loader:            t.params.Loader,
-			GPUIndices:        t.params.GPUIndices,
-			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
-		}, j.append)
+			archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
+				DurationSec:       dur,
+				Loader:            t.params.Loader,
+				GPUIndices:        t.params.GPUIndices,
+				ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+				StaggerSeconds:    boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
+			}, j.append)
 	case "memory":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
+		sizeMB, passes := 256, 1
+		if t.params.StressMode {
+			sizeMB, passes = 1024, 3
+		}
+		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
 	case "storage":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
+		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
 	case "cpu":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -675,7 +696,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		if dur <= 0 {
-			dur = 60
+			if t.params.StressMode {
+				dur = 1800
+			} else {
+				dur = 60
+			}
 		}
 		j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
 		archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -422,7 +422,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	for _, needle := range []string{
 		`Benchmark Results`,
 		`Composite score for this benchmark task.`,
-		`NVIDIA H100 PCIe / GPU 0`,
+		`GPU #0 — NVIDIA H100 PCIe`,
 		`1176.25`,
 	} {
 		if !strings.Contains(html, needle) {
--- a/2
+++ b/2
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -36,7 +36,6 @@ typedef void *CUstream;
 #define MAX_CUBLAS_PROFILES 5
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
-#define STRESS_LAUNCH_DEPTH 8

 static const char *ptx_source =
    ".version 6.0\n"
@@ -344,7 +343,6 @@ static int run_ptx_fallback(struct cuda_api *api,
    unsigned long iterations = 0;
    int mp_count = 0;
    int stream_count = 1;
-    int launches_per_wave = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "driver-ptx");
@@ -419,44 +417,42 @@ static int run_ptx_fallback(struct cuda_api *api,

    unsigned int threads = 256;

-    double start = now_seconds();
-    double deadline = start + (double)seconds;
+    double deadline = now_seconds() + (double)seconds;
+    double next_sync = now_seconds() + 1.0;
    while (now_seconds() < deadline) {
-        launches_per_wave = 0;
-        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
-            int launched_this_batch = 0;
-            for (int lane = 0; lane < stream_count; lane++) {
-                unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
-                if (!check_rc(api,
-                              "cuLaunchKernel",
-                              api->cuLaunchKernel(kernel,
-                                                  blocks,
-                                                  1,
-                                                  1,
-                                                  threads,
-                                                  1,
-                                                  1,
-                                                  0,
-                                                  streams[lane],
-                                                  params[lane],
-                                                  NULL))) {
-                    goto fail;
-                }
-                launches_per_wave++;
-                launched_this_batch++;
-            }
-            if (launched_this_batch <= 0) {
-                break;
+        int launched = 0;
+        for (int lane = 0; lane < stream_count; lane++) {
+            unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
+            if (!check_rc(api,
+                          "cuLaunchKernel",
+                          api->cuLaunchKernel(kernel,
+                                              blocks,
+                                              1,
+                                              1,
+                                              threads,
+                                              1,
+                                              1,
+                                              0,
+                                              streams[lane],
+                                              params[lane],
+                                              NULL))) {
+                goto fail;
            }
+            launched++;
+            iterations++;
        }
-        if (launches_per_wave <= 0) {
+        if (launched <= 0) {
            goto fail;
        }
-        if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
-            goto fail;
+        double now = now_seconds();
+        if (now >= next_sync || now >= deadline) {
+            if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
+                goto fail;
+            }
+            next_sync = now + 1.0;
        }
-        iterations += (unsigned long)launches_per_wave;
    }
+    api->cuCtxSynchronize();

    if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
        goto fail;
@@ -468,11 +464,10 @@ static int run_ptx_fallback(struct cuda_api *api,
    report->iterations = iterations;
    snprintf(report->details,
             sizeof(report->details),
-             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
+             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
             size_mb,
             report->buffer_mb,
             report->stream_count,
-             STRESS_LAUNCH_DEPTH,
             bytes_per_stream[0] / (1024u * 1024u),
             iterations);

@@ -1140,7 +1135,6 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    int stream_count = 1;
    int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
    int prepared_count = 0;
-    int wave_launches = 0;
    size_t requested_budget = 0;
    size_t total_budget = 0;
    size_t per_profile_budget = 0;
@@ -1207,11 +1201,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details,
                  sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
                  size_mb,
                  report->buffer_mb,
                  report->stream_count,
-                  STRESS_LAUNCH_DEPTH,
                  mp_count,
                  per_profile_budget / (1024u * 1024u));

@@ -1260,50 +1253,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        return 0;
    }

+    /* Keep the GPU queue continuously full by submitting kernels without
+     * synchronizing after every wave.  A sync barrier after each small batch
+     * creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
+     * especially when individual kernels are short.  Instead we sync at most
+     * once per second (for error detection) and once at the very end. */
    double deadline = now_seconds() + (double)seconds;
+    double next_sync = now_seconds() + 1.0;
    while (now_seconds() < deadline) {
-        wave_launches = 0;
-        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
-            int launched_this_batch = 0;
-            for (int i = 0; i < prepared_count; i++) {
-                if (!prepared[i].ready) {
-                    continue;
-                }
-                if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
-                    append_detail(report->details,
-                                  sizeof(report->details),
-                                  "%s=FAILED runtime\n",
-                                  prepared[i].desc.name);
-                    for (int j = 0; j < prepared_count; j++) {
-                        destroy_profile(&cublas, cuda, &prepared[j]);
-                    }
-                    cublas.cublasLtDestroy(handle);
-                    destroy_streams(cuda, streams, stream_count);
-                    cuda->cuCtxDestroy(ctx);
-                    return 0;
-                }
-                prepared[i].iterations++;
-                report->iterations++;
-                wave_launches++;
-                launched_this_batch++;
+        int launched = 0;
+        for (int i = 0; i < prepared_count; i++) {
+            if (!prepared[i].ready) {
+                continue;
            }
-            if (launched_this_batch <= 0) {
-                break;
+            if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
+                append_detail(report->details,
+                              sizeof(report->details),
+                              "%s=FAILED runtime\n",
+                              prepared[i].desc.name);
+                for (int j = 0; j < prepared_count; j++) {
+                    destroy_profile(&cublas, cuda, &prepared[j]);
+                }
+                cublas.cublasLtDestroy(handle);
+                destroy_streams(cuda, streams, stream_count);
+                cuda->cuCtxDestroy(ctx);
+                return 0;
            }
+            prepared[i].iterations++;
+            report->iterations++;
+            launched++;
        }
-        if (wave_launches <= 0) {
+        if (launched <= 0) {
            break;
        }
-        if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
-            for (int i = 0; i < prepared_count; i++) {
-                destroy_profile(&cublas, cuda, &prepared[i]);
+        double now = now_seconds();
+        if (now >= next_sync || now >= deadline) {
+            if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
+                for (int i = 0; i < prepared_count; i++) {
+                    destroy_profile(&cublas, cuda, &prepared[i]);
+                }
+                cublas.cublasLtDestroy(handle);
+                destroy_streams(cuda, streams, stream_count);
+                cuda->cuCtxDestroy(ctx);
+                return 0;
            }
-            cublas.cublasLtDestroy(handle);
-            destroy_streams(cuda, streams, stream_count);
-            cuda->cuCtxDestroy(ctx);
-            return 0;
+            next_sync = now + 1.0;
        }
    }
+    /* Final drain — ensure all queued work finishes before we read results. */
+    cuda->cuCtxSynchronize();

    for (int i = 0; i < prepared_count; i++) {
        if (!prepared[i].ready) {
--- a/iso/builder/config/bootloaders/grub-pc/theme.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/theme.cfg
@@ -1,9 +1,9 @@
 set color_normal=light-gray/black
-set color_highlight=white/dark-gray
+set color_highlight=yellow/black

 if [ -e /boot/grub/splash.png ]; then
    set theme=/boot/grub/live-theme/theme.txt
 else
-    set menu_color_normal=cyan/black
-    set menu_color_highlight=white/dark-gray
+    set menu_color_normal=yellow/black
+    set menu_color_highlight=white/brown
 fi
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -10,20 +10,15 @@ import os

 W, H = 1920, 1080

-GLYPHS = {
-    'E': ["11111", "10000", "11110", "10000", "10000", "10000", "11111"],
-    'A': ["01110", "10001", "10001", "11111", "10001", "10001", "10001"],
-    'S': ["01111", "10000", "10000", "01110", "00001", "00001", "11110"],
-    'Y': ["10001", "10001", "01010", "00100", "00100", "00100", "00100"],
-    'B': ["11110", "10001", "10001", "11110", "10001", "10001", "11110"],
-    '-': ["00000", "00000", "11111", "00000", "00000", "00000", "00000"],
-}
-
-TITLE = "EASY-BEE"
-SUBTITLE = "Hardware Audit LiveCD"
-CELL = 30
-GLYPH_GAP = 18
-ROW_GAP = 6
+ASCII_ART = [
+    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
+    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
+    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
+    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
+    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
+    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
+]
+SUBTITLE = "  Hardware Audit LiveCD"

 FG = (0xF6, 0xD0, 0x47)
 FG_DIM = (0xD4, 0xA9, 0x1C)
@@ -31,6 +26,12 @@ SHADOW = (0x5E, 0x47, 0x05)
 SUB = (0x96, 0x7A, 0x17)
 BG = (0x05, 0x05, 0x05)

+MONO_FONT_CANDIDATES = [
+    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
+    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
+    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
+]
 SUB_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
@@ -39,43 +40,34 @@ SUB_FONT_CANDIDATES = [
 ]


-def load_font(size):
-    for path in SUB_FONT_CANDIDATES:
+def load_font(candidates, size):
+    for path in candidates:
        if os.path.exists(path):
            return ImageFont.truetype(path, size)
    return ImageFont.load_default()


-def glyph_width(ch):
-    return len(GLYPHS[ch][0])
+def mono_metrics(font):
+    probe = Image.new('L', (W, H), 0)
+    draw = ImageDraw.Draw(probe)
+    char_w = int(round(draw.textlength("M", font=font)))
+    bb = draw.textbbox((0, 0), "Mg", font=font)
+    char_h = bb[3] - bb[1]
+    return char_w, char_h


-def render_logo_mask():
-    width_cells = 0
-    for idx, ch in enumerate(TITLE):
-        width_cells += glyph_width(ch)
-        if idx != len(TITLE) - 1:
-            width_cells += 1
-    mask_w = width_cells * CELL + (len(TITLE) - 1) * GLYPH_GAP
-    mask_h = 7 * CELL + 6 * ROW_GAP
-    mask = Image.new('L', (mask_w, mask_h), 0)
+def render_ascii_mask(font, lines, char_w, char_h, line_gap):
+    width = max(len(line) for line in lines) * char_w
+    height = len(lines) * char_h + line_gap * (len(lines) - 1)
+    mask = Image.new('L', (width, height), 0)
    draw = ImageDraw.Draw(mask)
-
-    cx = 0
-    for idx, ch in enumerate(TITLE):
-        glyph = GLYPHS[ch]
-        for row_idx, row in enumerate(glyph):
-            for col_idx, cell in enumerate(row):
-                if cell != '1':
-                    continue
-                x0 = cx + col_idx * CELL
-                y0 = row_idx * (CELL + ROW_GAP)
-                x1 = x0 + CELL - 4
-                y1 = y0 + CELL - 4
-                draw.rounded_rectangle((x0, y0, x1, y1), radius=4, fill=255)
-        cx += glyph_width(ch) * CELL
-        if idx != len(TITLE) - 1:
-            cx += CELL + GLYPH_GAP
+    for row, line in enumerate(lines):
+        y = row * (char_h + line_gap)
+        for col, ch in enumerate(line):
+            if ch == ' ':
+                continue
+            x = col * char_w
+            draw.text((x, y), ch, font=font, fill=255)
    return mask


@@ -90,20 +82,28 @@ glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
 glow = glow.filter(ImageFilter.GaussianBlur(60))
 img = Image.alpha_composite(img.convert('RGBA'), glow)

-logo_mask = render_logo_mask()
+TARGET_LOGO_W = 400
+max_chars = max(len(line) for line in ASCII_ART)
+_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
+_probe_cw, _ = mono_metrics(_probe_font)
+font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
+font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
+char_w, char_h = mono_metrics(font_logo)
+logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
 logo_w, logo_h = logo_mask.size
 logo_x = (W - logo_w) // 2
-logo_y = 290
+logo_y = 380

-shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
-img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
-img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
+sh_off = max(1, font_size_logo // 6)
+shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
+img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
+img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
 img.paste(FG, (logo_x, logo_y), logo_mask)

-font_sub = load_font(30)
+font_sub = load_font(SUB_FONT_CANDIDATES, 30)
 sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
 sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
-sub_y = logo_y + logo_h + 54
+sub_y = logo_y + logo_h + 48
 draw = ImageDraw.Draw(img)
 draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
 draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
--- a/iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
+++ b/iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
@@ -0,0 +1,110 @@
+#!/bin/sh
+set -eu
+
+SECONDS=300
+STAGGER_SECONDS=180
+DEVICES=""
+EXCLUDE=""
+
+usage() {
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
+    exit 2
+}
+
+normalize_list() {
+    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
+}
+
+contains_csv() {
+    needle="$1"
+    haystack="${2:-}"
+    echo ",${haystack}," | grep -q ",${needle},"
+}
+
+resolve_dcgmproftester() {
+    for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
+        if command -v "${candidate}" >/dev/null 2>&1; then
+            command -v "${candidate}"
+            return 0
+        fi
+    done
+    return 1
+}
+
+while [ "$#" -gt 0 ]; do
+    case "$1" in
+        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
+        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
+        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
+        *) usage ;;
+    esac
+done
+
+PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
+ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
+[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
+
+DEVICES=$(normalize_list "${DEVICES}")
+EXCLUDE=$(normalize_list "${EXCLUDE}")
+SELECTED="${DEVICES}"
+if [ -z "${SELECTED}" ]; then
+    SELECTED="${ALL_DEVICES}"
+fi
+
+FINAL=""
+for id in $(echo "${SELECTED}" | tr ',' ' '); do
+    [ -n "${id}" ] || continue
+    if contains_csv "${id}" "${EXCLUDE}"; then
+        continue
+    fi
+    if [ -z "${FINAL}" ]; then
+        FINAL="${id}"
+    else
+        FINAL="${FINAL},${id}"
+    fi
+done
+
+[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
+
+echo "loader=dcgmproftester-staggered"
+echo "selected_gpus=${FINAL}"
+echo "stagger_seconds=${STAGGER_SECONDS}"
+
+TMP_DIR=$(mktemp -d)
+trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
+
+GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
+gpu_pos=0
+WORKERS=""
+for id in $(echo "${FINAL}" | tr ',' ' '); do
+    gpu_pos=$((gpu_pos + 1))
+    log="${TMP_DIR}/gpu-${id}.log"
+    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
+    gpu_seconds=$(( SECONDS + extra_sec ))
+    echo "starting gpu ${id} seconds=${gpu_seconds}"
+    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
+    pid=$!
+    WORKERS="${WORKERS} ${pid}:${id}:${log}"
+    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
+        sleep "${STAGGER_SECONDS}"
+    fi
+done
+
+status=0
+for spec in ${WORKERS}; do
+    pid=${spec%%:*}
+    rest=${spec#*:}
+    id=${rest%%:*}
+    log=${rest#*:}
+    if wait "${pid}"; then
+        echo "gpu ${id} finished: OK"
+    else
+        rc=$?
+        echo "gpu ${id} finished: FAILED rc=${rc}"
+        status=1
+    fi
+    sed "s/^/[gpu ${id}] /" "${log}" || true
+done
+
+exit "${status}"
--- a/iso/overlay/usr/local/bin/bee-gpu-burn
+++ b/iso/overlay/usr/local/bin/bee-gpu-burn
@@ -2,13 +2,14 @@
 set -eu

 SECONDS=5
+STAGGER_SECONDS=0
 SIZE_MB=0
 DEVICES=""
 EXCLUDE=""
 WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"

 usage() {
-    echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
 }

@@ -25,6 +26,7 @@ contains_csv() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
@@ -61,14 +63,18 @@ done

 echo "loader=bee-gpu-burn"
 echo "selected_gpus=${FINAL}"
+echo "stagger_seconds=${STAGGER_SECONDS}"

 export CUDA_DEVICE_ORDER="PCI_BUS_ID"

 TMP_DIR=$(mktemp -d)
 trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM

+GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
+gpu_pos=0
 WORKERS=""
 for id in $(echo "${FINAL}" | tr ',' ' '); do
+    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    gpu_size_mb="${SIZE_MB}"
    if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
            gpu_size_mb=512
        fi
    fi
-    echo "starting gpu ${id} size=${gpu_size_mb}MB"
+    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
+    gpu_seconds=$(( SECONDS + extra_sec ))
+    echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
    CUDA_VISIBLE_DEVICES="${id}" \
-        "${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
+        "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
+    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
+        sleep "${STAGGER_SECONDS}"
+    fi
 done

 status=0
--- a/iso/overlay/usr/local/bin/bee-john-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-john-gpu-stress
@@ -2,6 +2,7 @@
 set -eu

 DURATION_SEC=300
+STAGGER_SECONDS=0
 DEVICES=""
 EXCLUDE=""
 FORMAT=""
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
 export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

 usage() {
-    echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
    exit 2
 }

@@ -118,6 +119,7 @@ ensure_opencl_ready() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
+        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -170,6 +172,7 @@ done
 echo "loader=john"
 echo "selected_gpus=${FINAL}"
 echo "john_devices=${JOHN_DEVICES}"
+echo "stagger_seconds=${STAGGER_SECONDS}"

 cd "${JOHN_DIR}"

@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
 echo "format=${CHOSEN_FORMAT}"
 echo "target_seconds=${DURATION_SEC}"
 echo "slice_seconds=${TEST_SLICE_SECONDS}"
-DEADLINE=$(( $(date +%s) + DURATION_SEC ))
+TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
 _first=1
+pos=0
 for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
+    pos=$((pos + 1))
    [ "${_first}" = "1" ] || sleep 3
    _first=0
-    run_john_loop "${opencl_id}" "${DEADLINE}" &
+    extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
+    deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
+    run_john_loop "${opencl_id}" "${deadline}" &
    pid=$!
    PIDS="${PIDS} ${pid}"
+    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
+        sleep "${STAGGER_SECONDS}"
+    fi
 done
 FAIL=0
 for pid in ${PIDS}; do
--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {

 log "kernel: $(uname -r)"

-# Skip if no NVIDIA GPU present (PCI vendor 10de)
-if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
+# Skip if no NVIDIA display/compute GPU is present.
+# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
+have_nvidia_gpu() {
+    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
+}
+
+if ! have_nvidia_gpu; then
    log "no NVIDIA GPU detected — skipping module load"
    exit 0
 fi
--- a/iso/overlay/usr/local/bin/bee-selfheal
+++ b/iso/overlay/usr/local/bin/bee-selfheal
@@ -14,7 +14,7 @@ log() {
 }

 have_nvidia_gpu() {
-    lspci -nn 2>/dev/null | grep -qi '10de:'
+    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
 }

 service_active() {
Author	SHA1	Message	Date
Mikhail Chusavitin	c1690a084b	Fix app tests that mutate global defaults	2026-04-09 15:28:25 +03:00
Mikhail Chusavitin	9481ca2805	Add staged NVIDIA burn ramp-up mode	2026-04-09 15:21:14 +03:00
Mikhail Chusavitin	a78fdadd88	Refine validate and burn profile layout	2026-04-09 15:14:48 +03:00
Mikhail Chusavitin	4ef403898f	Tighten NVIDIA GPU PCI detection	2026-04-09 15:14:48 +03:00
Michael Chus	025548ab3c	UI: amber accents, smaller wallpaper logo, new support bundle name, drop display resolution - Bootloader: GRUB fallback text colors → yellow/brown (amber tone) - CLI charts: all GPU metric series use single amber color (xterm-256 #214) - Wallpaper: logo width scaled to 400 px dynamically, shadow scales with font size - Support bundle: renamed to YYYY-MM-DD (BEE-SP vX.X) SRV_MODEL SRV_SN ToD.tar.gz using dmidecode for server model (spaces→underscores) and serial number - Remove display resolution feature (UI card, API routes, handlers, tests) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 21:37:01 +03:00
Mikhail Chusavitin	e0d94d7f47	Remove HPL from build and audit flows	2026-04-08 10:00:23 +03:00
Mikhail Chusavitin	13899aa864	Drop incompatible HPL git fallback	2026-04-08 09:50:58 +03:00
Mikhail Chusavitin	f345d8a89d	Build HPL serially to avoid upstream make races	2026-04-08 09:47:35 +03:00
Mikhail Chusavitin	4715059ac0	Fix HPL MPI stub header and keep full build logs	2026-04-08 09:45:14 +03:00
Mikhail Chusavitin	0660a40287	Harden HPL builder cache and runtime libs	2026-04-08 09:40:18 +03:00
Mikhail Chusavitin	67369d9b7b	Fix OpenBLAS package lookup in HPL build	2026-04-08 09:32:49 +03:00
Mikhail Chusavitin	3f41a026ca	Add resilient HPL source fallbacks	2026-04-08 09:25:31 +03:00
Mikhail Chusavitin	0ee4f46537	Restore MOTD-style ASCII wallpaper	2026-04-08 09:14:27 +03:00
Michael Chus	8db40b098a	Update bible submodule Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 07:14:31 +03:00
Michael Chus	16e7ae00e7	Add HPL (LINPACK) benchmark as validate/stress task HPL 2.3 from netlib compiled against OpenBLAS with a minimal single-process MPI stub — no MPI package required in the ISO. Matrix size is auto-sized to 80% of total RAM at runtime. Build: - VERSIONS: HPL_VERSION=2.3, HPL_SHA256=32c5c17d… - build-hpl.sh: downloads HPL + OpenBLAS from Debian 12 repo, compiles xhpl with a self-contained mpi_stub.c - build.sh: step 80-hpl, injects xhpl + libopenblas into overlay Runtime: - bee-hpl: generates HPL.dat (N auto from /proc/meminfo, NB=256, P=1 Q=1), runs xhpl, prints standard WR... Gflops output - platform/hpl.go: RunHPL(), parses WR line → GFlops + PASSED/FAILED - tasks.go: target "hpl" - pages.go: LINPACK (HPL) card in validate/stress grid (stress-only) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 07:08:18 +03:00
Michael Chus	b2f8626fee	Refactor validate modes, fix benchmark report and IPMI power - Replace diag level 1-4 dropdown with Validate/Stress radio buttons - Validate: dcgmi L2, 60s CPU, 256MB/1p memtester, SMART short - Stress: dcgmi L3 + targeted_stress in Run All, 30min CPU, 1GB/3p memtester, SMART long/NVMe extended - Parallel GPU mode: spawn single task for all GPUs instead of splitting per model - Benchmark table: per-GPU columns for sequential runs, server-wide column for parallel - Benchmark report converted to Markdown with server model, GPU model, version in header; only steady-state charts - Fix IPMI power parsing in benchmark (was looking for 'Current Power', correct field is 'Instantaneous power reading') Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:42:12 +03:00
Michael Chus	dd26e03b2d	Add multi-GPU selector option for system-level tests Adds a "Multi-GPU tests — use all GPUs" checkbox to the NVIDIA GPU selector (checked by default). When enabled, PSU Pulse, NCCL, and NVBandwidth tests run on every GPU in the system regardless of the per-GPU selection above — which is required for correct PSU stress testing (synchronous pulses across all GPUs create worst-case transients). When unchecked, only the manually selected GPUs are used. The same logic applies both to Run All (expandSATTarget) and to the individual Run button on each multi-GPU test card. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:25:12 +03:00
Michael Chus	6937a4c6ec	Fix pulse_test: run all GPUs simultaneously, not per-GPU pulse_test is a PSU/power-delivery test, not a per-GPU compute test. Its purpose is to synchronously pulse all GPUs between idle and full load to create worst-case transient spikes on the power supply. Running it one GPU at a time would produce a fraction of the PSU load and miss any PSU-level failures. - Move nvidia-pulse from nvidiaPerGPUTargets to nvidiaAllGPUTargets (same dispatch path as NCCL and NVBandwidth) - Change card onclick to runNvidiaFabricValidate (all selected GPUs at once) - Update card title to "NVIDIA PSU Pulse Test" and description to explain why synchronous multi-GPU execution is required Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:19:11 +03:00
Michael Chus	b9be93c213	Move NCCL interconnect and NVBandwidth tests to validate/stress nvidia-interconnect (NCCL all_reduce_perf) and nvidia-bandwidth (NVBandwidth) verify fabric connectivity and bandwidth — they are not sustained burn loads. Move both from the Burn section to the Validate section under the stress-mode toggle, alongside the other DCGM diagnostic tests moved in the previous commit. - Add sat-card-nvidia-interconnect and sat-card-nvidia-bandwidth validate cards (stress-only, all selected GPUs at once) - Add runNvidiaFabricValidate() for all-GPU-at-once dispatch - Add nvidiaAllGPUTargets handling in expandSATTarget/runAllSAT - Remove Interconnect / Bandwidth card from Burn section - Remove nvidia-interconnect and nvidia-bandwidth from runAllBurnTasks and the gpu/tools availability map Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:16:42 +03:00
Michael Chus	d1a22d782d	Move power diag tests to validate/stress; fix GPU burn power saturation - bee-gpu-stress.c: remove per-wave cuCtxSynchronize barrier in both cuBLASLt and PTX hot loops; sync at most once/sec so the GPU queue stays continuously full — eliminates the CPU↔GPU ping-pong that prevented reaching full TDP - sat_fan_stress.go: default SizeMB 0 (auto = 95% VRAM) instead of hardcoded 64 MB; tiny matrices caused <0.1 ms kernels where CPU re-queue overhead dominated - pages.go: move nvidia-targeted-power and nvidia-pulse from Burn → Validate stress section alongside nvidia-targeted-stress; these are DCGM pass/fail diagnostics, not sustained burn loads; remove the Power Delivery / Power Budget card from Burn entirely Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-08 00:13:52 +03:00