Capture per-run IPMI power and GPU telemetry in power benchmark

- Sample IPMI loaded_w per single-card calibration and per ramp step instead of averaging over the entire Phase 2; top-level ServerPower uses the final (all-GPU) ramp step value
- Add ServerLoadedW/ServerDeltaW to NvidiaPowerBenchGPU and NvidiaPowerBenchStep so external tooling can compare wall power per phase without re-parsing logs
- Write gpu-metrics.csv/.html inside each single-XX/ and step-XX/ subdir; aggregate all phases into a top-level gpu-metrics.csv/.html
- Write 00-nvidia-smi-q.log at the start of every power run
- Add Telemetry (p95 temp/power/fan/clock) to NvidiaPowerBenchGPU in result.json from the converged calibration attempt
- Power benchmark page: split "Achieved W" into Single-card W and Multi-GPU W (StablePowerLimitW); derate highlight and status color now reflect the final multi-GPU limit vs nominal
- Performance benchmark page: add Status column and per-GPU score color coding (green/yellow/red) based on gpu.Status and OverallStatus

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Seed power ramp with single-card TDP limits
2026-04-17 17:59:58 +03:00 · 2026-04-16 11:43:01 +03:00 · 2026-04-16 11:02:30 +03:00 · 2026-04-16 10:54:03 +03:00
10 changed files with 284 additions and 197 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -146,7 +146,7 @@ type satRunner interface {
 	RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
-	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 }

 type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
 	return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
 }

+func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
+}
+
 func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
-	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
+	path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
 	body := "Results: " + path
 	if err != nil && err != context.Canceled {
 		body += "\nERROR: " + err.Error()
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -128,6 +128,7 @@ type fakeSAT struct {
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
 	runNvidiaPulseFn          func(string, int, []int) (string, error)
 	runNvidiaBandwidthFn      func(string, []int) (string, error)
+	runNCCLFn                 func(string, []int) (string, error)
 	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
 	runMemoryFn               func(string) (string, error)
 	runStorageFn              func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 	return "", nil
 }

-func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
+func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNCCLFn != nil {
+		return f.runNCCLFn(baseDir, gpuIndices)
+	}
 	return "", nil
 }

+func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
+	t.Parallel()
+
+	var gotBaseDir string
+	var gotGPUIndices []int
+	a := &App{
+		sat: fakeSAT{
+			runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
+				gotBaseDir = baseDir
+				gotGPUIndices = append([]int(nil), gpuIndices...)
+				return "/tmp/nccl-tests.tar.gz", nil
+			},
+		},
+	}
+
+	path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
+	if err != nil {
+		t.Fatalf("RunNCCLTests error: %v", err)
+	}
+	if path != "/tmp/nccl-tests.tar.gz" {
+		t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
+	}
+	if gotBaseDir != "/tmp/sat" {
+		t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
+	}
+	if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
+		t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
+	}
+}
+
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -59,6 +59,9 @@ type benchmarkPowerCalibrationResult struct {
 	// ≥20% while server fans were below 100% duty cycle — a signal that the
 	// cooling system may not be correctly configured for full GPU load.
 	CoolingWarning string
+	// MetricRows holds the telemetry rows from the final (converged) attempt
+	// for this GPU. Used to build per-run gpu-metrics.csv.
+	MetricRows []GPUMetricRow
 }

 type benchmarkBurnProfile struct {
@@ -2781,7 +2784,7 @@ func runBenchmarkPowerCalibration(
 	infoByIndex map[int]benchmarkGPUInfo,
 	logFunc func(string),
 	seedLimits map[int]int,
-) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
+) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
 	const calibDurationSec = 120
 	const maxDerateW = 150
 	// calibSearchTolerance is the binary-search convergence threshold in watts.
@@ -2795,7 +2798,7 @@ func runBenchmarkPowerCalibration(

 	if _, err := exec.LookPath("dcgmi"); err != nil {
 		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
-		return map[int]benchmarkPowerCalibrationResult{}, nil
+		return map[int]benchmarkPowerCalibrationResult{}, nil, nil
 	}
 	if killed := KillTestWorkers(); len(killed) > 0 {
 		for _, p := range killed {
@@ -2829,6 +2832,8 @@ func runBenchmarkPowerCalibration(

 	results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
 	var restore []benchmarkRestoreAction
+	var allCalibRows []GPUMetricRow // accumulated telemetry across all attempts
+	var calibCursor float64

 	// Initialise per-GPU state.
 	states := make([]*gpuCalibState, 0, len(gpuIndices))
@@ -2981,6 +2986,8 @@ calibDone:
 		ticker.Stop()
 		cancelAttempt()
 		_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
+		// Accumulate telemetry rows with attempt stage label.
+		appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))

 		// Resource busy: retry with exponential back-off (shared — one DCGM session).
 		if ar.err != nil && isDCGMResourceBusy(ar.err) {
@@ -3065,6 +3072,7 @@ calibDone:
 						}
 					}
 				}
+				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
 				continue
 			}
@@ -3103,6 +3111,7 @@ calibDone:
 				} else {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
 				}
+				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
 				continue
 			}
@@ -3140,7 +3149,8 @@ calibDone:
 			results[s.idx] = s.calib
 		}
 	}
-	return results, restore
+	writeBenchmarkMetricsFiles(runDir, allCalibRows)
+	return results, restore, allCalibRows
 }

 // isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3230,21 +3240,25 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	}
 	if len(result.RampSteps) > 0 {
 		b.WriteString("## Ramp Sequence\n\n")
-		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
-		b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
+		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
+		b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
 		for _, step := range result.RampSteps {
 			derated := "-"
 			if step.Derated {
 				derated = "⚠ yes"
 			}
-			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n",
-				step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
+			serverDelta := "-"
+			if step.ServerDeltaW > 0 {
+				serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
+			}
+			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
+				step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
 		}
 		b.WriteString("\n")
 	}
 	b.WriteString("## Per-Slot Results\n\n")
-	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
-	b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
+	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
+	b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
 	for _, gpu := range result.GPUs {
 		stableLimit := "-"
 		if gpu.StablePowerLimitW > 0 {
@@ -3254,8 +3268,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
 			}
 		}
-		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n",
-			gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
+		serverDelta := "-"
+		if gpu.ServerDeltaW > 0 {
+			serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
+		}
+		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
+			gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
 	}
 	b.WriteString("\n")
 	for _, gpu := range result.GPUs {
@@ -3284,11 +3302,19 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
 		fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
 		fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
 		fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
+		if step.ServerLoadedW > 0 {
+			fmt.Fprintf(&b, "ramp_step_%d_server_loaded_w=%.0f\n", step.StepIndex, step.ServerLoadedW)
+			fmt.Fprintf(&b, "ramp_step_%d_server_delta_w=%.0f\n", step.StepIndex, step.ServerDeltaW)
+		}
 	}
 	for _, gpu := range result.GPUs {
 		if gpu.StablePowerLimitW > 0 {
 			fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
 		}
+		if gpu.ServerLoadedW > 0 {
+			fmt.Fprintf(&b, "gpu_%d_server_loaded_w=%.0f\n", gpu.Index, gpu.ServerLoadedW)
+			fmt.Fprintf(&b, "gpu_%d_server_delta_w=%.0f\n", gpu.Index, gpu.ServerDeltaW)
+		}
 	}
 	if sp := result.ServerPower; sp != nil && sp.Available {
 		fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
@@ -3327,6 +3353,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	if infoErr != nil {
 		return "", infoErr
 	}
+	// Capture full nvidia-smi -q snapshot at the start of the run.
+	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
+		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
+	}
 	hostname, _ := os.Hostname()
 	result := NvidiaPowerBenchResult{
 		BenchmarkVersion:   benchmarkVersion,
@@ -3352,13 +3382,31 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
+	singleIPMILoadedW := make(map[int]float64, len(selected))
 	var allRestoreActions []benchmarkRestoreAction
+	// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
+	var allPowerRows []GPUMetricRow
+	var powerCursor float64
 	for _, idx := range selected {
 		singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
 		_ = os.MkdirAll(singleDir, 0755)
 		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
-		c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
+		ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
+		ipmiSingleDone := make(chan float64, 1)
+		go func() {
+			defer close(ipmiSingleDone)
+			if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
+				ipmiSingleDone <- w
+			}
+		}()
+		c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
+		appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
+		ipmiSingleCancel()
+		if w, ok := <-ipmiSingleDone; ok {
+			singleIPMILoadedW[idx] = w
+			logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w))
+		}
 		allRestoreActions = append(allRestoreActions, restore...)
 		if r, ok := c[idx]; ok {
 			calibByIndex[idx] = r
@@ -3383,7 +3431,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 				result.OverallStatus = "PARTIAL"
 			}
 		}
-		gpus = append(gpus, NvidiaPowerBenchGPU{
+		gpu := NvidiaPowerBenchGPU{
 			Index:               idx,
 			Name:                info.Name,
 			BusID:               info.BusID,
@@ -3396,7 +3444,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			Status:              status,
 			Notes:               append([]string(nil), calib.Notes...),
 			CoolingWarning:      calib.CoolingWarning,
-		})
+		}
+		if w, ok := singleIPMILoadedW[idx]; ok && serverIdleOK && w > 0 {
+			gpu.ServerLoadedW = w
+			gpu.ServerDeltaW = w - serverIdleW
+		}
+		if len(calib.MetricRows) > 0 {
+			t := summarizeBenchmarkTelemetry(calib.MetricRows)
+			gpu.Telemetry = &t
+		}
+		gpus = append(gpus, gpu)
 	}
 	sort.Slice(gpus, func(i, j int) bool {
 		if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
@@ -3445,20 +3502,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
 	stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))

-	// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
-	// server-side loaded power while GPUs are under stress. The goroutine is
-	// cancelled as soon as Phase 2 finishes, and the average is used to compare
-	// against PlatformMaxTDPW (GPU-reported stable limits sum).
+	// serverLoadedW tracks the IPMI server power from the final ramp step
+	// (all GPUs simultaneously loaded). Earlier steps' values are stored
+	// per-step in NvidiaPowerBenchStep.ServerLoadedW.
 	var serverLoadedW float64
 	var serverLoadedOK bool
-	ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
-	ipmiPhase2Done := make(chan float64, 1)
-	go func() {
-		defer close(ipmiPhase2Done)
-		if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
-			ipmiPhase2Done <- w
-		}
-	}()

 	// Step 1: reuse single-card calibration result directly.
 	if len(result.RecommendedSlotOrder) > 0 {
@@ -3475,6 +3523,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			Derated:             firstCalib.Derated,
 			Status:              "OK",
 		}
+		if w, ok := singleIPMILoadedW[firstIdx]; ok && serverIdleOK && w > 0 {
+			ramp.ServerLoadedW = w
+			ramp.ServerDeltaW = w - serverIdleW
+		}
 		if !firstCalib.Completed {
 			ramp.Status = "FAILED"
 			ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
@@ -3502,17 +3554,45 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		_ = os.MkdirAll(stepDir, 0755)

 		// Reuse the latest stable limits as starting points, but re-check every
-		// active GPU in this hotter configuration.
-		seedForStep := make(map[int]int, len(stableLimits))
-		for k, v := range stableLimits {
-			seedForStep[k] = v
+		// active GPU in this hotter configuration. For the newly introduced GPU,
+		// seed from its single-card calibration so we do not restart from the
+		// default TDP when a prior derated limit is already known.
+		seedForStep := make(map[int]int, len(subset))
+		for _, idx := range subset {
+			if lim, ok := stableLimits[idx]; ok && lim > 0 {
+				seedForStep[idx] = lim
+				continue
+			}
+			if base, ok := calibByIndex[idx]; ok {
+				lim := int(math.Round(base.AppliedPowerLimitW))
+				if lim > 0 {
+					seedForStep[idx] = lim
+				}
+			}
 		}

 		logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
 			step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))

 		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
-		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
+		ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
+		ipmiStepDone := make(chan float64, 1)
+		go func() {
+			defer close(ipmiStepDone)
+			if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
+				ipmiStepDone <- w
+			}
+		}()
+		stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
+		appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
+		ipmiStepCancel()
+		var stepIPMILoadedW float64
+		var stepIPMIOK bool
+		if w, ok := <-ipmiStepDone; ok {
+			stepIPMILoadedW = w
+			stepIPMIOK = true
+			logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
+		}
 		// Accumulate restore actions; they all run in the outer defer.
 		allRestoreActions = append(allRestoreActions, stepRestore...)

@@ -3575,15 +3655,17 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
 		}

-		result.RampSteps = append(result.RampSteps, ramp)
-	}
+		if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
+			ramp.ServerLoadedW = stepIPMILoadedW
+			ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
+			// The last step has all GPUs loaded — use it as the top-level loaded_w.
+			if step == len(result.RecommendedSlotOrder) {
+				serverLoadedW = stepIPMILoadedW
+				serverLoadedOK = true
+			}
+		}

-	// Stop IPMI Phase 2 sampling and collect result.
-	ipmiPhase2Cancel()
-	if w, ok := <-ipmiPhase2Done; ok {
-		serverLoadedW = w
-		serverLoadedOK = true
-		logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
+		result.RampSteps = append(result.RampSteps, ramp)
 	}

 	// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
@@ -3613,6 +3695,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	//   ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
 	_ = serverIdleOK // used implicitly via characterizeServerPower
 	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
+	// Write top-level gpu-metrics.csv/.html aggregating all phases.
+	writeBenchmarkMetricsFiles(runDir, allPowerRows)
 	resultJSON, err := json.MarshalIndent(result, "", "  ")
 	if err != nil {
 		return "", fmt.Errorf("marshal power result: %w", err)
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -331,6 +331,13 @@ type NvidiaPowerBenchGPU struct {
 	Notes               []string `json:"notes,omitempty"`
 	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
 	CoolingWarning string `json:"cooling_warning,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
+	// Telemetry holds the aggregated stats from the final converged calibration
+	// attempt for this GPU (temperature, power, fan, clock percentiles).
+	Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
 }

 type NvidiaPowerBenchStep struct {
@@ -345,6 +352,10 @@ type NvidiaPowerBenchStep struct {
 	Derated             bool     `json:"derated,omitempty"`
 	Status              string   `json:"status"`
 	Notes               []string `json:"notes,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
 }

 // NvidiaPerformanceRampStep holds per-step performance data for the
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	return string(raw), err
 }

-// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
+// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
-func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-	// detect GPU count
-	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
-	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	gpuCount := len(selected)
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
-		}},
+		}, env: nvidiaVisibleDevicesEnv(selected)},
 	), logFunc)
 }

--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
 	}
 }

+func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
+	cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
+	want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
+	if len(cmd) != len(want) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
+	}
+	for i := range want {
+		if cmd[i] != want[i] {
+			t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
+		}
+	}
+}
+
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
 	if len(env) != 2 {
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
 			`<code>all_reduce_perf</code> (NCCL tests)`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
 			`<code>nvbandwidth</code>`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
 		)) +
 		`</div>` +
 		`</div>
@@ -1527,8 +1527,6 @@ function satModeChanged() {
    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
    {card: 'sat-card-nvidia-targeted-power',  hint: 'sat-tp-mode-hint'},
    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
-    {card: 'sat-card-nvidia-interconnect',    hint: 'sat-ni-mode-hint'},
-    {card: 'sat-card-nvidia-bandwidth',       hint: 'sat-nb-mode-hint'},
  ].forEach(function(item) {
    const card = document.getElementById(item.card);
    if (card) {
@@ -1776,7 +1774,7 @@ function runAllSAT() {
  const cycles = 1;
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -2016,9 +2014,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
 // ── Benchmark ─────────────────────────────────────────────────────────────────

 type benchmarkHistoryRun struct {
-	generatedAt time.Time
-	displayTime string
-	gpuScores   map[int]float64 // GPU index → composite score
+	generatedAt  time.Time
+	displayTime  string
+	gpuScores    map[int]float64 // GPU index → composite score
+	gpuStatuses  map[int]string  // GPU index → status ("OK", "WARNING", "FAILED", …)
+	overallStatus string
 }

 func renderBenchmark(opts HandlerOptions) string {
@@ -2082,7 +2082,7 @@ func renderBenchmark(opts HandlerOptions) string {
  </div>
 </div>

-`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
+` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `

 <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2326,7 +2326,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 		b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
 	}
 	b.WriteString(`<div style="overflow-x:auto">`)
-	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
+	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
 	for i := 0; i <= maxGPUIndex; i++ {
 		b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
 	}
@@ -2335,13 +2335,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 		b.WriteString(`<tr>`)
 		b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
 		b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
+		overallColor := "var(--ok)"
+		overallLabel := run.overallStatus
+		if overallLabel == "" {
+			overallLabel = "OK"
+		}
+		if overallLabel == "FAILED" {
+			overallColor = "var(--crit-fg,#9f3a38)"
+		} else if overallLabel != "OK" {
+			overallColor = "var(--warn)"
+		}
+		b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
 		for idx := 0; idx <= maxGPUIndex; idx++ {
 			score, ok := run.gpuScores[idx]
 			if !ok {
 				b.WriteString(`<td style="color:var(--muted)">-</td>`)
 				continue
 			}
-			b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
+			gpuStatus := run.gpuStatuses[idx]
+			scoreColor := ""
+			switch gpuStatus {
+			case "FAILED":
+				scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
+			case "WARNING", "PARTIAL":
+				scoreColor = ` style="color:var(--warn);font-weight:600"`
+			case "", "OK":
+				// no override
+			default:
+				scoreColor = ` style="color:var(--warn);font-weight:600"`
+			}
+			b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
 		}
 		b.WriteString(`</tr>`)
 	}
@@ -2375,12 +2398,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
 			continue
 		}
 		run := benchmarkHistoryRun{
-			generatedAt: result.GeneratedAt,
-			displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
-			gpuScores:   make(map[int]float64),
+			generatedAt:   result.GeneratedAt,
+			displayTime:   result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			gpuScores:     make(map[int]float64),
+			gpuStatuses:   make(map[int]string),
+			overallStatus: result.OverallStatus,
 		}
 		for _, gpu := range result.GPUs {
 			run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
+			run.gpuStatuses[gpu.Index] = gpu.Status
 			if gpu.Index > maxGPUIndex {
 				maxGPUIndex = gpu.Index
 			}
@@ -2449,31 +2475,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {

 	if len(latest.GPUs) > 0 {
 		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
-		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
+		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
 		b.WriteString(`</tr></thead><tbody>`)
 		for _, gpu := range latest.GPUs {
-			derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
+			// finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
+			// falling back to single-card applied limit if the ramp hasn't run.
+			finalLimitW := gpu.StablePowerLimitW
+			if finalLimitW <= 0 {
+				finalLimitW = gpu.AppliedPowerLimitW
+			}
+			// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
+			derated := gpu.Derated ||
+				(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
 			rowStyle := ""
-			achievedStyle := ""
+			finalStyle := ""
 			if derated {
 				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
-				achievedStyle = ` style="color:#e6a000;font-weight:600"`
+				finalStyle = ` style="color:#e6a000;font-weight:600"`
 			}
 			statusLabel := gpu.Status
 			if statusLabel == "" {
 				statusLabel = "OK"
 			}
 			statusColor := "var(--ok)"
-			if statusLabel != "OK" {
+			if statusLabel == "FAILED" {
+				statusColor = "var(--crit-fg,#9f3a38)"
+			} else if statusLabel != "OK" {
 				statusColor = "var(--warn)"
 			}
 			nominalStr := "-"
 			if gpu.DefaultPowerLimitW > 0 {
 				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
 			}
-			achievedStr := "-"
+			singleStr := "-"
 			if gpu.AppliedPowerLimitW > 0 {
-				achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+				singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+			}
+			multiStr := "-"
+			if gpu.StablePowerLimitW > 0 {
+				multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
 			}
 			p95Str := "-"
 			if gpu.MaxObservedPowerW > 0 {
@@ -2483,7 +2523,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
 			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
 			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
 			b.WriteString(`<td>` + nominalStr + `</td>`)
-			b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
+			b.WriteString(`<td>` + singleStr + `</td>`)
+			b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
 			b.WriteString(`<td>` + p95Str + `</td>`)
 			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
 			b.WriteString(`</tr>`)
@@ -2517,7 +2558,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {

 func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
-<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

 <div class="card" style="margin-bottom:16px">
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 	}
 }

+func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	for _, needle := range []string{
+		`NVIDIA Interconnect (NCCL)`,
+		`Runs in Validate and Stress.`,
+		`NVIDIA Bandwidth (NVBandwidth)`,
+		`Intended to stay short enough for Validate.`,
+	} {
+		if !strings.Contains(body, needle) {
+			t.Fatalf("validate page missing %q: %s", needle, body)
+		}
+	}
+}
+
 func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -736,15 +736,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		dur := t.params.Duration
-		if t.params.BurnProfile != "" && dur <= 0 {
-			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
-		}
-		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
-			DurationSec: dur,
-			Loader:      platform.NvidiaStressLoaderNCCL,
-			GPUIndices:  t.params.GPUIndices,
-		}, j.append)
+		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -1,117 +0,0 @@
-#!/bin/sh
-# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
-set -e
-echo "=== generating bee wallpaper ==="
-mkdir -p /usr/share/bee
-
-python3 - <<'PYEOF'
-from PIL import Image, ImageDraw, ImageFont, ImageFilter
-import os
-
-W, H = 1920, 1080
-
-ASCII_ART = [
-    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
-    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
-    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
-    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
-    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
-    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
-]
-SUBTITLE = "  Hardware Audit LiveCD"
-
-FG = (0xF6, 0xD0, 0x47)
-FG_DIM = (0xD4, 0xA9, 0x1C)
-SHADOW = (0x5E, 0x47, 0x05)
-SUB = (0x96, 0x7A, 0x17)
-BG = (0x05, 0x05, 0x05)
-
-MONO_FONT_CANDIDATES = [
-    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
-    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
-]
-SUB_FONT_CANDIDATES = [
-    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
-    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
-]
-
-
-def load_font(candidates, size):
-    for path in candidates:
-        if os.path.exists(path):
-            return ImageFont.truetype(path, size)
-    return ImageFont.load_default()
-
-
-def mono_metrics(font):
-    probe = Image.new('L', (W, H), 0)
-    draw = ImageDraw.Draw(probe)
-    char_w = int(round(draw.textlength("M", font=font)))
-    bb = draw.textbbox((0, 0), "Mg", font=font)
-    char_h = bb[3] - bb[1]
-    return char_w, char_h
-
-
-def render_ascii_mask(font, lines, char_w, char_h, line_gap):
-    width = max(len(line) for line in lines) * char_w
-    height = len(lines) * char_h + line_gap * (len(lines) - 1)
-    mask = Image.new('L', (width, height), 0)
-    draw = ImageDraw.Draw(mask)
-    for row, line in enumerate(lines):
-        y = row * (char_h + line_gap)
-        for col, ch in enumerate(line):
-            if ch == ' ':
-                continue
-            x = col * char_w
-            draw.text((x, y), ch, font=font, fill=255)
-    return mask
-
-
-img = Image.new('RGB', (W, H), BG)
-draw = ImageDraw.Draw(img)
-
-# Soft amber glow under the logo without depending on font rendering.
-glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
-glow_draw = ImageDraw.Draw(glow)
-glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
-glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
-glow = glow.filter(ImageFilter.GaussianBlur(60))
-img = Image.alpha_composite(img.convert('RGBA'), glow)
-
-TARGET_LOGO_W = 400
-max_chars = max(len(line) for line in ASCII_ART)
-_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
-_probe_cw, _ = mono_metrics(_probe_font)
-font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
-font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
-char_w, char_h = mono_metrics(font_logo)
-logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
-logo_w, logo_h = logo_mask.size
-logo_x = (W - logo_w) // 2
-logo_y = 380
-
-sh_off = max(1, font_size_logo // 6)
-shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
-img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
-img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
-img.paste(FG, (logo_x, logo_y), logo_mask)
-
-font_sub = load_font(SUB_FONT_CANDIDATES, 30)
-sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
-sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
-sub_y = logo_y + logo_h + 48
-draw = ImageDraw.Draw(img)
-draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
-draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
-
-img = img.convert('RGB')
-
-img.save('/usr/share/bee/wallpaper.png', optimize=True)
-print('wallpaper written: /usr/share/bee/wallpaper.png')
-PYEOF
-
-echo "=== wallpaper done ==="
Author	SHA1	Message	Date
Mikhail Chusavitin	5285c0d101	Capture per-run IPMI power and GPU telemetry in power benchmark - Sample IPMI loaded_w per single-card calibration and per ramp step instead of averaging over the entire Phase 2; top-level ServerPower uses the final (all-GPU) ramp step value - Add ServerLoadedW/ServerDeltaW to NvidiaPowerBenchGPU and NvidiaPowerBenchStep so external tooling can compare wall power per phase without re-parsing logs - Write gpu-metrics.csv/.html inside each single-XX/ and step-XX/ subdir; aggregate all phases into a top-level gpu-metrics.csv/.html - Write 00-nvidia-smi-q.log at the start of every power run - Add Telemetry (p95 temp/power/fan/clock) to NvidiaPowerBenchGPU in result.json from the converged calibration attempt - Power benchmark page: split "Achieved W" into Single-card W and Multi-GPU W (StablePowerLimitW); derate highlight and status color now reflect the final multi-GPU limit vs nominal - Performance benchmark page: add Status column and per-GPU score color coding (green/yellow/red) based on gpu.Status and OverallStatus Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-17 17:59:58 +03:00
Mikhail Chusavitin	dca4afb8d0	Seed power ramp with single-card TDP limits	2026-04-16 11:43:01 +03:00
Mikhail Chusavitin	b4280941f5	Move NCCL and NVBandwidth into validate mode	2026-04-16 11:02:30 +03:00
Mikhail Chusavitin	f74976ec4c	Use static overlay wallpaper in ISO build	2026-04-16 10:54:03 +03:00