Seed power ramp with single-card TDP limits

Move NCCL and NVBandwidth into validate mode
Use static overlay wallpaper in ISO build
2026-04-16 11:43:01 +03:00 · 2026-04-16 11:02:30 +03:00 · 2026-04-16 10:54:03 +03:00 · 2026-04-16 10:10:18 +03:00 · 2026-04-16 10:00:03 +03:00 · 2026-04-16 09:58:02 +03:00
16 changed files with 665 additions and 267 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -146,7 +146,7 @@ type satRunner interface {
 	RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
-	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 }
 type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
 	return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
 }
 // RunNCCLTests forwards an NCCL test run to the configured SAT runner.
 //
 // A baseDir that is empty or whitespace-only is replaced with
 // DefaultSATBaseDir; any other value is passed through unmodified (it is not
 // trimmed). gpuIndices and logFunc are handed to the runner as-is, and the
 // runner's result path and error are returned unchanged.
 func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
 	targetDir := baseDir
 	if blank := strings.TrimSpace(targetDir) == ""; blank {
 		targetDir = DefaultSATBaseDir
 	}
 	return a.sat.RunNCCLTests(ctx, targetDir, gpuIndices, logFunc)
 }
 func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
-	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
+	path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
 	body := "Results: " + path
 	if err != nil && err != context.Canceled {
 		body += "\nERROR: " + err.Error()
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -128,6 +128,7 @@ type fakeSAT struct {
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
 	runNvidiaPulseFn          func(string, int, []int) (string, error)
 	runNvidiaBandwidthFn      func(string, []int) (string, error)
 	runNCCLFn                 func(string, []int) (string, error)
 	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
 	runMemoryFn               func(string) (string, error)
 	runStorageFn              func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 	return "", nil
 }
-func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
+func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
 	if f.runNCCLFn != nil {
 		return f.runNCCLFn(baseDir, gpuIndices)
 	}
 	return "", nil
 }
 func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
 	t.Parallel()
 	var gotBaseDir string
 	var gotGPUIndices []int
 	a := &App{
 		sat: fakeSAT{
 			runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
 				gotBaseDir = baseDir
 				gotGPUIndices = append([]int(nil), gpuIndices...)
 				return "/tmp/nccl-tests.tar.gz", nil
 			},
 		},
 	}
 	path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
 	if err != nil {
 		t.Fatalf("RunNCCLTests error: %v", err)
 	}
 	if path != "/tmp/nccl-tests.tar.gz" {
 		t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
 	}
 	if gotBaseDir != "/tmp/sat" {
 		t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
 	}
 	if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
 		t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
 	}
 }
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -94,9 +94,13 @@ var (
 )
 // benchmarkPrecisionPhases lists the precision categories run as individual
-// steady-state windows before the combined steady pass.  Order is from lowest
+// steady-state windows before the combined steady pass. Order is from lowest
 // to highest power draw so thermal ramp-up is gradual.
-var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
+//
 // fp64 and fp4 are intentionally disabled for now: both are currently unstable
 // on the target fleet and can abort the mixed steady stage after the earlier
 // phases already collected useful telemetry.
 var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32"}
 func computeCapabilityCode(raw string) int {
 	raw = strings.TrimSpace(raw)
@@ -124,6 +128,15 @@ func benchmarkSupportedPrecisions(computeCapability string) []string {
 	return out
 }
 // benchmarkPrecisionEnabled reports whether category is one of the precision
 // categories on the fixed allow-list below. The match is exact and
 // case-sensitive; every other value, including "fp64", "fp4" and the empty
 // string, yields false.
 func benchmarkPrecisionEnabled(category string) bool {
 	for _, allowed := range [...]string{"int8", "fp8", "fp16", "fp16_bf16", "fp32", "fp32_tf32"} {
 		if category == allowed {
 			return true
 		}
 	}
 	return false
 }
 func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
 	if len(precisions) == 0 {
 		precisions = append([]string(nil), benchmarkPrecisionPhases...)
@@ -514,6 +527,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
 			}
 			applyBenchmarkSteadyFallback(&gpuResult)
 			gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
 			gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
 			if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" {
@@ -1108,6 +1122,7 @@ type benchmarkCoolingSample struct {
 	AvgFanRPM             float64
 	AvgFanDutyCyclePct    float64
 	FanDutyCycleAvailable bool
 	FanDutyCycleEstimated bool
 }
 func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
@@ -1120,6 +1135,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
 		samples[i].FanAvgRPM = fanSample.AvgFanRPM
 		samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
 		samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
 		samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
 	}
 	return samples, nil
 }
@@ -1127,11 +1143,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
 func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
 	fans, _ := sampleFanSpeeds()
 	avgRPM, _, _ := fanRPMStats(fans)
-	dutyPct, dutyAvailable := sampleFanDutyCyclePct()
+	dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
 	return benchmarkCoolingSample{
 		AvgFanRPM:             avgRPM,
 		AvgFanDutyCyclePct:    dutyPct,
 		FanDutyCycleAvailable: dutyAvailable,
 		FanDutyCycleEstimated: dutyEstimated,
 	}
 }
@@ -1373,44 +1390,91 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
 	}
 	var rpmValues []float64
 	var dutyValues []float64
 	var dutyEstimated bool
 	for _, row := range rows {
 		if row.FanAvgRPM > 0 {
 			rpmValues = append(rpmValues, row.FanAvgRPM)
 		}
 		if row.FanDutyCycleAvailable {
 			dutyValues = append(dutyValues, row.FanDutyCyclePct)
 			if row.FanDutyCycleEstimated {
 				dutyEstimated = true
 			}
 		}
 	}
 	if len(rpmValues) == 0 && len(dutyValues) == 0 {
 		return nil
 	}
 	summary := &BenchmarkCoolingSummary{
-		Available: true,
+		Available:             true,
-		AvgFanRPM: benchmarkMean(rpmValues),
+		AvgFanRPM:             benchmarkMean(rpmValues),
 		FanDutyCycleEstimated: dutyEstimated,
 	}
 	if len(dutyValues) > 0 {
 		summary.FanDutyCycleAvailable = true
 		summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
 		summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
 		if summary.FanDutyCycleEstimated {
 			summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
 		}
 	} else {
 		summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
 	}
 	return summary
 }
 // benchmarkTelemetryAvailable reports whether summary carries any telemetry:
 // true when it has a positive sample count or a positive duration.
 func benchmarkTelemetryAvailable(summary BenchmarkTelemetrySummary) bool {
 	if summary.Samples > 0 {
 		return true
 	}
 	return summary.DurationSec > 0
 }
 // benchmarkPrecisionSteadyFallback picks the per-precision steady summary that
 // should stand in for a missing combined steady summary.
 //
 // Phases without telemetry are ignored. Among the rest, the longest
 // DurationSec wins; ties go to the higher P95PowerW, and on a full tie the
 // earliest phase is kept. It returns the chosen summary, its precision label,
 // and true; when no phase has telemetry it returns zero values and false.
 func benchmarkPrecisionSteadyFallback(phases []BenchmarkPrecisionSteadyPhase) (BenchmarkTelemetrySummary, string, bool) {
 	chosen := -1
 	for i := range phases {
 		candidate := phases[i].Steady
 		if !benchmarkTelemetryAvailable(candidate) {
 			continue
 		}
 		if chosen < 0 {
 			chosen = i
 			continue
 		}
 		current := phases[chosen].Steady
 		switch {
 		case candidate.DurationSec > current.DurationSec:
 			chosen = i
 		case candidate.DurationSec == current.DurationSec && candidate.P95PowerW > current.P95PowerW:
 			chosen = i
 		}
 	}
 	if chosen < 0 {
 		return BenchmarkTelemetrySummary{}, "", false
 	}
 	return phases[chosen].Steady, phases[chosen].Precision, true
 }
 // applyBenchmarkSteadyFallback fills gpu.Steady from the best per-precision
 // steady phase when the GPU has no steady telemetry of its own, and appends a
 // note naming the precision phase used. It does nothing when gpu is nil, when
 // gpu.Steady already has telemetry, or when no precision phase has telemetry.
 func applyBenchmarkSteadyFallback(gpu *BenchmarkGPUResult) {
 	if gpu == nil {
 		return
 	}
 	if benchmarkTelemetryAvailable(gpu.Steady) {
 		return
 	}
 	substitute, precision, found := benchmarkPrecisionSteadyFallback(gpu.PrecisionSteady)
 	if !found {
 		return
 	}
 	gpu.Steady = substitute
 	note := fmt.Sprintf("mixed steady telemetry unavailable; reporting steady-state fallback from %s precision phase", precision)
 	gpu.Notes = append(gpu.Notes, note)
 }
 func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 	score := BenchmarkScorecard{}
 	// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
 	// Each precision ran alone with full GPU dedicated — peak capability.
 	for _, p := range gpu.PrecisionSteady {
 		if !benchmarkPrecisionEnabled(p.Precision) {
 			continue
 		}
 		score.SyntheticScore += p.WeightedTeraOpsPerSec
 	}
 	// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
 	// All precisions compete simultaneously — closer to real inference workloads.
 	for _, p := range gpu.PrecisionResults {
-		if p.Supported {
+		if p.Supported && benchmarkPrecisionEnabled(p.Category) {
 			score.MixedScore += p.WeightedTeraOpsPerSec
 		}
 	}
@@ -1441,10 +1505,17 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 	// so CV reflects genuine power regulation, not workload switching).
 	if len(gpu.PrecisionSteady) > 0 {
 		var sum float64
 		var count int
 		for _, p := range gpu.PrecisionSteady {
 			if !benchmarkPrecisionEnabled(p.Precision) {
 				continue
 			}
 			sum += clampScore(100 - p.Steady.PowerCVPct*3)
 			count++
 		}
 		if count > 0 {
 			score.PowerSustainScore = sum / float64(count)
 		}
-		score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady))
 	} else if gpu.Steady.PowerCVPct > 0 {
 		score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3)
 	}
@@ -2512,6 +2583,7 @@ func runNvidiaBenchmarkParallel(
 	// Score and finalize each GPU.
 	for _, idx := range selected {
 		r := gpuResults[idx]
 		applyBenchmarkSteadyFallback(r)
 		r.Scores = scoreBenchmarkGPUResult(*r)
 		r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
 		pr := parseResults[idx]
@@ -2694,18 +2766,21 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
 	return cl
 }
-// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
+// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
-// throttle counters. If a GPU starts throttling, the current targeted_power run
+// actively watches throttle counters. seedLimits, when provided, are treated as
-// is canceled immediately, the power limit is reduced, and a fresh full cycle
+// the starting point for this calibration pass rather than as immutable fixed
-// is started again from the beginning. The selected reduced power limit stays
+// limits. This matters during cumulative ramp-up: once an additional GPU is
-// active for the main benchmark and is restored by the caller afterwards.
+// introduced, every already-active GPU must be revalidated under the new
 // thermal state instead of assuming its previous single-step limit is still
 // valid. The selected reduced power limits stay active for the main benchmark
 // and are restored by the caller afterwards.
 func runBenchmarkPowerCalibration(
 	ctx context.Context,
 	verboseLog, runDir string,
 	gpuIndices []int,
 	infoByIndex map[int]benchmarkGPUInfo,
 	logFunc func(string),
-	fixedLimits map[int]int,
+	seedLimits map[int]int,
 ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
 	const calibDurationSec = 120
 	const maxDerateW = 150
@@ -2739,7 +2814,6 @@ func runBenchmarkPowerCalibration(
 		err  error
 	}
 	// gpuCalibState holds per-GPU binary search state during parallel calibration.
 	type gpuCalibState struct {
 		idx            int
@@ -2796,19 +2870,20 @@ func runBenchmarkPowerCalibration(
 			hi:             appliedLimitW + 1, // not yet tested, not yet confirmed unstable
 			calib:          benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
 		}
-		if fixedLimits != nil {
+		if seedLimits != nil {
-			if fixedW, ok := fixedLimits[idx]; ok {
+			if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
-				// This GPU's limit was established in a prior ramp step and must
+				// A previously validated limit is only a starting point. Re-run
-				// remain unchanged. Apply it immediately and skip the binary search.
+				// targeted_power under the current multi-GPU thermal load and derate
-				if canDerate && fixedW > 0 {
+				// again if this step shows new throttling.
-					_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW)
+				if canDerate {
 					_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
 				}
-				s.appliedLimitW = fixedW
+				s.appliedLimitW = seedW
-				s.calib.AppliedPowerLimitW = float64(fixedW)
+				s.hi = seedW + 1
-				s.calib.Completed = true
+				s.calib.AppliedPowerLimitW = float64(seedW)
-				s.converged = true
+				s.calib.Derated = seedW < s.originalLimitW
 				s.calib.Notes = append(s.calib.Notes,
-					fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW))
+					fmt.Sprintf("seed limit: %d W (revalidating under current thermal load)", seedW))
 			}
 		}
 		states = append(states, s)
@@ -3091,7 +3166,6 @@ func powerBenchDurationSec(profile string) int {
 	}
 }
 func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
 	out := make(map[int]benchmarkGPUInfo, len(src))
 	for k, v := range src {
@@ -3107,7 +3181,42 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
-	fmt.Fprintf(&b, "**Platform max TDP:** %.0f W  \n\n", result.PlatformMaxTDPW)
+	fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W  \n", result.PlatformMaxTDPW)
 	if sp := result.ServerPower; sp != nil && sp.Available {
 		fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W  \n", sp.DeltaW)
 		fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f  \n", sp.ReportingRatio)
 	}
 	b.WriteString("\n")
 	// Server power comparison table.
 	if sp := result.ServerPower; sp != nil {
 		b.WriteString("## Server vs GPU Power Comparison\n\n")
 		b.WriteString("| Metric | Value |\n")
 		b.WriteString("|--------|-------|\n")
 		fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
 		if sp.Available {
 			fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
 			fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
 			fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | %.0f W |\n", sp.DeltaW)
 			ratio := sp.ReportingRatio
 			ratioNote := ""
 			switch {
 			case ratio >= 0.9:
 				ratioNote = "✓ GPU telemetry matches server power"
 			case ratio >= 0.75:
 				ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP"
 			default:
 				ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
 			}
 			fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
 		} else {
 			b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
 		}
 		for _, note := range sp.Notes {
 			fmt.Fprintf(&b, "\n> %s\n", note)
 		}
 		b.WriteString("\n")
 	}
 	if len(result.Findings) > 0 {
 		b.WriteString("## Summary\n\n")
 		for _, finding := range result.Findings {
@@ -3181,6 +3290,12 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
 			fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
 		}
 	}
 	if sp := result.ServerPower; sp != nil && sp.Available {
 		fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
 		fmt.Fprintf(&b, "server_loaded_w=%.0f\n", sp.LoadedW)
 		fmt.Fprintf(&b, "server_delta_w=%.0f\n", sp.DeltaW)
 		fmt.Fprintf(&b, "server_reporting_ratio=%.2f\n", sp.ReportingRatio)
 	}
 	return b.String()
 }
@@ -3224,6 +3339,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	}
 	durationSec := powerBenchDurationSec(opts.Profile)
 	_ = durationSec
 	// Sample IPMI idle power before any GPU load.
 	var serverIdleW float64
 	var serverIdleOK bool
 	if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
 		serverIdleW = w
 		serverIdleOK = true
 		logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
 	}
 	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
@@ -3320,20 +3445,35 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
 	stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
 	// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
 	// server-side loaded power while GPUs are under stress. The goroutine is
 	// cancelled as soon as Phase 2 finishes, and the average is used to compare
 	// against PlatformMaxTDPW (GPU-reported stable limits sum).
 	var serverLoadedW float64
 	var serverLoadedOK bool
 	ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
 	ipmiPhase2Done := make(chan float64, 1)
 	go func() {
 		defer close(ipmiPhase2Done)
 		if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
 			ipmiPhase2Done <- w
 		}
 	}()
 	// Step 1: reuse single-card calibration result directly.
 	if len(result.RecommendedSlotOrder) > 0 {
 		firstIdx := result.RecommendedSlotOrder[0]
 		firstCalib := calibByIndex[firstIdx]
 		stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW))
 		ramp := NvidiaPowerBenchStep{
-			StepIndex:         1,
+			StepIndex:           1,
-			GPUIndices:        []int{firstIdx},
+			GPUIndices:          []int{firstIdx},
-			NewGPUIndex:       firstIdx,
+			NewGPUIndex:         firstIdx,
-			NewGPUStableLimitW: firstCalib.AppliedPowerLimitW,
+			NewGPUStableLimitW:  firstCalib.AppliedPowerLimitW,
 			TotalObservedPowerW: firstCalib.Summary.P95PowerW,
 			AvgObservedPowerW:   firstCalib.Summary.P95PowerW,
-			Derated:           firstCalib.Derated,
+			Derated:             firstCalib.Derated,
-			Status:            "OK",
+			Status:              "OK",
 		}
 		if !firstCalib.Completed {
 			ramp.Status = "FAILED"
@@ -3351,8 +3491,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW))
 	}
-	// Steps 2..N: each step fixes previously calibrated GPUs and searches only
+	// Steps 2..N: each step revalidates every already-active GPU under the new
-	// the new GPU's stable limit in the combined thermal environment.
+	// cumulative thermal environment and also calibrates the newly introduced
 	// GPU. Previously found limits are used only as seeds for the search.
 	for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ {
 		step := stepNum + 1
 		subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
@@ -3360,17 +3501,29 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
 		_ = os.MkdirAll(stepDir, 0755)
-		// All previously calibrated GPUs are fixed at their stable limits.
+		// Reuse the latest stable limits as starting points, but re-check every
-		fixedForStep := make(map[int]int, len(stableLimits))
+		// active GPU in this hotter configuration. For the newly introduced GPU,
-		for k, v := range stableLimits {
+		// seed from its single-card calibration so we do not restart from the
-			fixedForStep[k] = v
+		// default TDP when a prior derated limit is already known.
 		seedForStep := make(map[int]int, len(subset))
 		for _, idx := range subset {
 			if lim, ok := stableLimits[idx]; ok && lim > 0 {
 				seedForStep[idx] = lim
 				continue
 			}
 			if base, ok := calibByIndex[idx]; ok {
 				lim := int(math.Round(base.AppliedPowerLimitW))
 				if lim > 0 {
 					seedForStep[idx] = lim
 				}
 			}
 		}
-		logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)",
+		logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
-			step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep)))
+			step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
 		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
-		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep)
+		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
 		// Accumulate restore actions; they all run in the outer defer.
 		allRestoreActions = append(allRestoreActions, stepRestore...)
@@ -3391,36 +3544,72 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
 		}
-		// Determine stable limit for the new GPU.
+		for _, idx := range subset {
-		if c, ok := stepCalib[newGPUIdx]; ok && c.Completed {
+			c, ok := stepCalib[idx]
-			stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW))
+			if !ok || !c.Completed {
-			ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
+				fallback := 0
-			ramp.Derated = c.Derated
+				if lim, ok := stableLimits[idx]; ok && lim > 0 {
 					fallback = lim
 				} else if fb, ok := calibByIndex[idx]; ok {
 					fallback = int(math.Round(fb.AppliedPowerLimitW))
 				}
 				if fallback > 0 {
 					stableLimits[idx] = fallback
 				}
 				ramp.Status = "FAILED"
 				ramp.Notes = append(ramp.Notes,
 					fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
 				result.OverallStatus = "PARTIAL"
 				continue
 			}
 			prevLimit, hadPrev := stableLimits[idx]
 			newLimit := int(math.Round(c.AppliedPowerLimitW))
 			stableLimits[idx] = newLimit
 			if idx == newGPUIdx {
 				ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
 				ramp.Derated = c.Derated
 			}
 			if c.Derated {
 				ramp.Status = "PARTIAL"
 				if result.OverallStatus == "OK" {
 					result.OverallStatus = "PARTIAL"
 				}
 				result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
 			}
-		} else {
+			if hadPrev && newLimit < prevLimit {
-			// Calibration failed — fall back to single-card limit.
+				ramp.Notes = append(ramp.Notes,
-			fb := calibByIndex[newGPUIdx]
+					fmt.Sprintf("GPU %d was re-derated from %d W to %d W under combined thermal load.", idx, prevLimit, newLimit))
-			stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW))
+			}
-			ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW
+		}
-			ramp.Status = "FAILED"
+
-			ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW))
+		if c, ok := stepCalib[newGPUIdx]; ok && c.Completed && c.Derated {
-			result.OverallStatus = "PARTIAL"
+			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
 		}
 		result.RampSteps = append(result.RampSteps, ramp)
 	}
 	// Stop IPMI Phase 2 sampling and collect result.
 	ipmiPhase2Cancel()
 	if w, ok := <-ipmiPhase2Done; ok {
 		serverLoadedW = w
 		serverLoadedOK = true
 		logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
 	}
 	// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
 	for i := range result.GPUs {
 		if lim, ok := stableLimits[result.GPUs[i].Index]; ok {
 			result.GPUs[i].StablePowerLimitW = float64(lim)
 		}
 		if result.GPUs[i].StablePowerLimitW > 0 && result.GPUs[i].AppliedPowerLimitW > 0 &&
 			result.GPUs[i].StablePowerLimitW < result.GPUs[i].AppliedPowerLimitW {
 			result.GPUs[i].Derated = true
 			result.Findings = append(result.Findings, fmt.Sprintf(
 				"GPU %d required additional derating from %.0f W (single-card) to %.0f W under full-system thermal load.",
 				result.GPUs[i].Index, result.GPUs[i].AppliedPowerLimitW, result.GPUs[i].StablePowerLimitW,
 			))
 		}
 	}
 	// PlatformMaxTDPW = sum of all stable limits — the actual sustained power
@@ -3428,6 +3617,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	for _, lim := range stableLimits {
 		result.PlatformMaxTDPW += float64(lim)
 	}
 	// Characterize server power from IPMI idle/loaded samples.
 	// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
 	// ReportingRatio = IPMI_delta / GPU_reported_sum:
 	//   ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
 	_ = serverIdleOK // used implicitly via characterizeServerPower
 	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
 	resultJSON, err := json.MarshalIndent(result, "", "  ")
 	if err != nil {
 		return "", fmt.Errorf("marshal power result: %w", err)
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -261,14 +261,18 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")
 		// Steady-state telemetry
-		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+		if benchmarkTelemetryAvailable(gpu.Steady) {
-		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
+			fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
-		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
+			b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
-		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
+			fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
-		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
+			fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
-		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
+			fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
-		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
+			fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
-		b.WriteString("\n")
+			fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
 			b.WriteString("\n")
 		} else {
 			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
 		}
 		// Per-precision stability phases.
 		if len(gpu.PrecisionSteady) > 0 {
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -49,8 +49,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 		benchmarkPrecisionPhases,
 		func(label string) string { return label },
 	)
-	if len(labels) != 7 || len(phases) != 7 {
+	if len(labels) != 5 || len(phases) != 5 {
-		t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
+		t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
 	}
 	if basePhaseSec != 60 {
 		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
@@ -61,7 +61,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
 		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
 	}
-	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
+	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -80,7 +80,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
 	if mixedPhaseSec != 3600 {
 		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
+	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -99,7 +99,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
 	if mixedPhaseSec != 14400 {
 		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
+	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -133,10 +133,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
 func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
 	t.Parallel()
-	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
+	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
-	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
+	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
 }
@@ -314,6 +314,30 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
 	}
 }
 func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
 	t.Parallel()
 	score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
 		PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
 			{Precision: "fp16", WeightedTeraOpsPerSec: 100},
 			{Precision: "fp64", WeightedTeraOpsPerSec: 999},
 			{Precision: "fp4", WeightedTeraOpsPerSec: 999},
 		},
 		PrecisionResults: []BenchmarkPrecisionResult{
 			{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
 			{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
 			{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
 		},
 	})
 	if score.SyntheticScore != 100 {
 		t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
 	}
 	if score.MixedScore != 50 {
 		t.Fatalf("MixedScore=%f want 50", score.MixedScore)
 	}
 }
 func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
 	Available             bool     `json:"available"`
 	AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
 	FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
 	FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
 	AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
 	P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
 	Notes                 []string `json:"notes,omitempty"`
@@ -55,32 +56,32 @@ type NvidiaBenchmarkOptions struct {
 }
 type NvidiaBenchmarkResult struct {
-	BenchmarkVersion   string                       `json:"benchmark_version"`
+	BenchmarkVersion string    `json:"benchmark_version"`
-	GeneratedAt        time.Time                    `json:"generated_at"`
+	GeneratedAt      time.Time `json:"generated_at"`
-	Hostname           string                       `json:"hostname,omitempty"`
+	Hostname         string    `json:"hostname,omitempty"`
-	ServerModel        string                       `json:"server_model,omitempty"`
+	ServerModel      string    `json:"server_model,omitempty"`
-	BenchmarkProfile   string                       `json:"benchmark_profile"`
+	BenchmarkProfile string    `json:"benchmark_profile"`
-	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
+	ParallelGPUs     bool      `json:"parallel_gpus,omitempty"`
-	RampStep           int                          `json:"ramp_step,omitempty"`
+	RampStep         int       `json:"ramp_step,omitempty"`
-	RampTotal          int                          `json:"ramp_total,omitempty"`
+	RampTotal        int       `json:"ramp_total,omitempty"`
-	RampRunID          string                       `json:"ramp_run_id,omitempty"`
+	RampRunID        string    `json:"ramp_run_id,omitempty"`
-	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
+	ScalabilityScore float64   `json:"scalability_score,omitempty"`
 	// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
 	// 100% = each added GPU contributes exactly its single-card throughput.
 	// < 100% = throughput loss due to thermal throttle, power limits, or contention.
-	PlatformPowerScore   float64                    `json:"platform_power_score,omitempty"`
+	PlatformPowerScore   float64                      `json:"platform_power_score,omitempty"`
-	PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
+	PerformanceRampSteps []NvidiaPerformanceRampStep  `json:"performance_ramp_steps,omitempty"`
-	OverallStatus      string                       `json:"overall_status"`
+	OverallStatus        string                       `json:"overall_status"`
-	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
+	SelectedGPUIndices   []int                        `json:"selected_gpu_indices"`
-	Findings           []string                     `json:"findings,omitempty"`
+	Findings             []string                     `json:"findings,omitempty"`
-	Warnings           []string                     `json:"warnings,omitempty"`
+	Warnings             []string                     `json:"warnings,omitempty"`
-	Normalization      BenchmarkNormalization       `json:"normalization"`
+	Normalization        BenchmarkNormalization       `json:"normalization"`
-	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
+	HostConfig           *BenchmarkHostConfig         `json:"host_config,omitempty"`
-	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
+	CPULoad              *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
-	Cooling            *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
+	Cooling              *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
-	GPUs               []BenchmarkGPUResult         `json:"gpus"`
+	GPUs                 []BenchmarkGPUResult         `json:"gpus"`
-	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	Interconnect         *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
-	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
+	ServerPower          *BenchmarkServerPower        `json:"server_power,omitempty"`
 }
 type BenchmarkNormalization struct {
@@ -223,8 +224,8 @@ type BenchmarkScorecard struct {
 	// Throttle breakdown — percentage of steady-state time in each throttle type.
 	// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
-	ThermalThrottlePct  float64 `json:"thermal_throttle_pct"`  // HW+SW thermal slowdown
+	ThermalThrottlePct   float64 `json:"thermal_throttle_pct"`   // HW+SW thermal slowdown
-	PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
+	PowerCapThrottlePct  float64 `json:"power_cap_throttle_pct"` // SW power cap
 	SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
 	// Temperature headroom: distance to the 100°C destruction threshold.
@@ -300,18 +301,22 @@ type NvidiaPowerBenchResult struct {
 	// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
 	// cumulative thermal ramp. Represents the actual sustained power budget of
 	// this server under full GPU load. Use for rack power planning.
-	PlatformMaxTDPW float64  `json:"platform_max_tdp_w"`
+	PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
-	Findings        []string `json:"findings,omitempty"`
+	// ServerPower captures IPMI server power delta (idle→loaded) measured in
-	GPUs            []NvidiaPowerBenchGPU `json:"gpus"`
+	// parallel with the thermal ramp. Use to compare GPU-reported TDP against
 	// actual wall-power draw as seen by the server's power supply.
 	ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
 	Findings    []string              `json:"findings,omitempty"`
 	GPUs        []NvidiaPowerBenchGPU `json:"gpus"`
 }
 type NvidiaPowerBenchGPU struct {
-	Index               int      `json:"index"`
+	Index              int     `json:"index"`
-	Name                string   `json:"name,omitempty"`
+	Name               string  `json:"name,omitempty"`
-	BusID               string   `json:"bus_id,omitempty"`
+	BusID              string  `json:"bus_id,omitempty"`
-	DefaultPowerLimitW  float64  `json:"default_power_limit_w,omitempty"`
+	DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
 	// AppliedPowerLimitW is the stable limit found during single-card calibration.
-	AppliedPowerLimitW  float64  `json:"applied_power_limit_w,omitempty"`
+	AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
 	// StablePowerLimitW is the final fixed limit for this GPU after the
 	// cumulative thermal ramp. This is the limit at which the GPU operated
 	// stably with all other GPUs running simultaneously at their own limits.
@@ -329,10 +334,10 @@ type NvidiaPowerBenchGPU struct {
 }
 type NvidiaPowerBenchStep struct {
-	StepIndex           int      `json:"step_index"`
+	StepIndex  int   `json:"step_index"`
-	GPUIndices          []int    `json:"gpu_indices"`
+	GPUIndices []int `json:"gpu_indices"`
 	// NewGPUIndex is the GPU whose stable limit was searched in this step.
-	NewGPUIndex         int      `json:"new_gpu_index"`
+	NewGPUIndex int `json:"new_gpu_index"`
 	// NewGPUStableLimitW is the stable power limit found for the new GPU.
 	NewGPUStableLimitW  float64  `json:"new_gpu_stable_limit_w,omitempty"`
 	TotalObservedPowerW float64  `json:"total_observed_power_w,omitempty"`
@@ -345,15 +350,15 @@ type NvidiaPowerBenchStep struct {
 // NvidiaPerformanceRampStep holds per-step performance data for the
 // scalability ramp-up phase of the performance benchmark.
 type NvidiaPerformanceRampStep struct {
-	StepIndex          int      `json:"step_index"`
+	StepIndex  int   `json:"step_index"`
-	GPUIndices         []int    `json:"gpu_indices"`
+	GPUIndices []int `json:"gpu_indices"`
 	// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
 	// TOPS from dedicated single-precision phases) across all GPUs in this step.
-	TotalSyntheticTOPS float64  `json:"total_synthetic_tops"`
+	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
-	TotalMixedTOPS     float64  `json:"total_mixed_tops,omitempty"`
+	TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`
 	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
 	// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
-	ScalabilityPct     float64  `json:"scalability_pct"`
+	ScalabilityPct float64  `json:"scalability_pct"`
-	Status             string   `json:"status"`
+	Status         string   `json:"status"`
-	Notes              []string `json:"notes,omitempty"`
+	Notes          []string `json:"notes,omitempty"`
 }
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
 	FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
 	FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
 	FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
 	FanDutyCycleEstimated bool    `json:"fan_duty_cycle_estimated,omitempty"`
 }
 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
+	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
 	for _, r := range rows {
 		dutyAvail := 0
 		if r.FanDutyCycleAvailable {
 			dutyAvail = 1
 		}
-		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
+		dutyEstimated := 0
-			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
+		if r.FanDutyCycleEstimated {
 			dutyEstimated = 1
 		}
 		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
 			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	return string(raw), err
 }
-// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
+// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
-func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
-	// detect GPU count
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
-	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
+	if err != nil {
-	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
+		return "", err
 	}
 	gpuCount := len(selected)
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
-		}},
+		}, env: nvidiaVisibleDevicesEnv(selected)},
 	), logFunc)
 }
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"math"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
 	UpdatedAt time.Time
 }
 // fanObservationState is the JSON-serialised record of per-fan RPM maxima
 // that is read from and written to fanObservationStatePath.
 type fanObservationState struct {
 	// MaxRPM maps a fan name to the highest RPM accepted for that fan so far.
 	MaxRPM map[string]float64 `json:"max_rpm"`
 }
 // fanPeakCandidate tracks a reading above a fan's recorded maximum that has
 // not yet been accepted as the new maximum.
 type fanPeakCandidate struct {
 	// FirstSeen is when the fan first exceeded its recorded maximum.
 	FirstSeen time.Time
 	// RPM is the highest reading seen for the fan since FirstSeen.
 	RPM       float64
 }
 // Package-level caches. fanObservationMu is held by updateFanObservation and
 // estimateFanDutyCyclePctFromObservation while they touch fanObservation,
 // fanObservationInit and fanPeakCandidates.
 // NOTE(review): systemPowerCacheMu presumably guards systemPowerCache; the
 // code using them is not part of this hunk — confirm.
 var (
 	systemPowerCacheMu sync.Mutex
 	systemPowerCache   cachedPowerReading
 	fanObservationMu   sync.Mutex
 	fanObservation     fanObservationState
 	fanObservationInit bool
 	fanPeakCandidates  = make(map[string]fanPeakCandidate)
 )
 // NOTE(review): systemPowerHoldTTL is not referenced in the visible code;
 // presumably the lifetime of a cached power reading — confirm against its user.
 const systemPowerHoldTTL = 15 * time.Second
 // fanObservationStatePath is the file the observed fan maxima are persisted
 // to. It is a variable rather than a constant so tests can redirect it.
 var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
 // fanObservationMinPeakHold is how long a fan must stay above its recorded
 // maximum before the higher reading is accepted as the new maximum.
 const fanObservationMinPeakHold = time.Second
 // normalizeObservedFanMaxRPM rounds a positive RPM reading up to the next
 // multiple of 1000. Zero and negative readings yield 0.
 func normalizeObservedFanMaxRPM(rpm float64) float64 {
 	const step = 1000.0
 	if rpm <= 0 {
 		return 0
 	}
 	steps := math.Ceil(rpm / step)
 	return steps * step
 }
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
 	if err == nil {
 		if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
 			updateFanObservation(fans, time.Now())
 			return fans, nil
 		}
 	}
 	fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
 	if len(fans) > 0 {
 		updateFanObservation(fans, time.Now())
 		return fans, nil
 	}
 	if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	return nil, sensorsErr
 }
 // loadFanObservationLocked initialises fanObservation once per process,
 // seeding it from fanObservationStatePath when that file holds valid JSON.
 // A missing, empty or undecodable file leaves the map empty. The "Locked"
 // suffix reflects that the visible callers hold fanObservationMu.
 func loadFanObservationLocked() {
 	if fanObservationInit {
 		return
 	}
 	// Mark as initialised before reading so a bad file is not retried.
 	fanObservationInit = true
 	fanObservation.MaxRPM = make(map[string]float64)
 	data, readErr := os.ReadFile(fanObservationStatePath)
 	if readErr != nil || len(data) == 0 {
 		return
 	}
 	var onDisk fanObservationState
 	if decodeErr := json.Unmarshal(data, &onDisk); decodeErr != nil {
 		return
 	}
 	for rawName, maxRPM := range onDisk.MaxRPM {
 		key := strings.TrimSpace(rawName)
 		// Skip entries that could never be used as a divisor or lookup key.
 		if key == "" || maxRPM <= 0 {
 			continue
 		}
 		fanObservation.MaxRPM[key] = maxRPM
 	}
 }
 // saveFanObservationLocked writes the current fan maxima to
 // fanObservationStatePath as indented JSON. It is best-effort: nothing is
 // written when there are no maxima, and every failure is silently dropped.
 func saveFanObservationLocked() {
 	if len(fanObservation.MaxRPM) == 0 {
 		return
 	}
 	targetDir := filepath.Dir(fanObservationStatePath)
 	switch targetDir {
 	case "", ".":
 		// A bare file name has no usable parent; use the default log dir.
 		targetDir = "/var/log/bee-sat"
 	}
 	if mkErr := os.MkdirAll(targetDir, 0755); mkErr != nil {
 		return
 	}
 	encoded, encErr := json.MarshalIndent(fanObservation, "", "  ")
 	if encErr != nil {
 		return
 	}
 	// The write error is deliberately ignored, as are the failures above.
 	_ = os.WriteFile(fanObservationStatePath, encoded, 0644)
 }
 // updateFanObservation folds a batch of fan readings taken at time now into
 // the per-fan observed maxima. A reading above the recorded maximum first
 // becomes a candidate; it is only accepted (rounded up by
 // normalizeObservedFanMaxRPM) once a later above-maximum reading arrives at
 // least fanObservationMinPeakHold after the candidate was first seen. Any
 // reading at or below the maximum discards the candidate. The state file is
 // rewritten only when a maximum actually changed.
 func updateFanObservation(fans []FanReading, now time.Time) {
 	if len(fans) == 0 {
 		return
 	}
 	fanObservationMu.Lock()
 	defer fanObservationMu.Unlock()
 	loadFanObservationLocked()
 	dirty := false
 	for i := range fans {
 		key := strings.TrimSpace(fans[i].Name)
 		rpm := fans[i].RPM
 		if key == "" || rpm <= 0 {
 			continue
 		}
 		knownMax := fanObservation.MaxRPM[key]
 		if rpm <= knownMax {
 			// Back at or under the known maximum: the spike did not hold.
 			delete(fanPeakCandidates, key)
 			continue
 		}
 		pending, tracked := fanPeakCandidates[key]
 		switch {
 		case !tracked:
 			// First reading above the maximum starts the hold timer.
 			fanPeakCandidates[key] = fanPeakCandidate{FirstSeen: now, RPM: rpm}
 		case now.Sub(pending.FirstSeen) < fanObservationMinPeakHold:
 			// Still inside the hold window: remember the highest reading.
 			if rpm > pending.RPM {
 				fanPeakCandidates[key] = fanPeakCandidate{FirstSeen: pending.FirstSeen, RPM: rpm}
 			}
 		default:
 			// Held long enough: promote the peak to the new maximum.
 			if peak := math.Max(pending.RPM, rpm); peak > knownMax {
 				fanObservation.MaxRPM[key] = normalizeObservedFanMaxRPM(peak)
 				dirty = true
 			}
 			delete(fanPeakCandidates, key)
 		}
 	}
 	if dirty {
 		saveFanObservationLocked()
 	}
 }
 // estimateFanDutyCyclePctFromObservation estimates the fan duty cycle as the
 // mean, over all fans with a recorded maximum, of current RPM divided by that
 // maximum, expressed in percent and clamped to [0, 100]. The boolean is false
 // when no fan in the batch has a usable reading and a recorded maximum.
 func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
 	if len(fans) == 0 {
 		return 0, false
 	}
 	fanObservationMu.Lock()
 	defer fanObservationMu.Unlock()
 	loadFanObservationLocked()
 	ratios := make([]float64, 0, len(fans))
 	for i := range fans {
 		key := strings.TrimSpace(fans[i].Name)
 		if key == "" || fans[i].RPM <= 0 {
 			continue
 		}
 		ceiling := fanObservation.MaxRPM[key]
 		if ceiling <= 0 {
 			// No maximum recorded for this fan yet; it cannot be scaled.
 			continue
 		}
 		share := fans[i].RPM / ceiling * 100.0
 		switch {
 		case share > 100:
 			share = 100
 		case share < 0:
 			share = 0
 		}
 		ratios = append(ratios, share)
 	}
 	if len(ratios) == 0 {
 		return 0, false
 	}
 	return benchmarkMean(ratios), true
 }
 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
 // Handles two formats:
 //
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
 // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
 // Returns the average duty cycle across all exposed PWM controls.
-func sampleFanDutyCyclePct() (float64, bool) {
+func sampleFanDutyCyclePct() (float64, bool, bool) {
 	out, err := exec.Command("sensors", "-j").Output()
 	if err != nil || len(out) == 0 {
-		return 0, false
+		fans, fanErr := sampleFanSpeeds()
 		if fanErr != nil {
 			return 0, false, false
 		}
 		return sampleFanDutyCyclePctFromFans(fans)
 	}
-	return parseFanDutyCyclePctSensorsJSON(out)
+	pct, ok := parseFanDutyCyclePctSensorsJSON(out)
 	return pct, ok, false
 }
 // sampleFanDutyCyclePctFromFans derives a duty-cycle figure from RPM readings
 // alone. It returns (pct, available, estimated); a successful result is
 // always flagged as estimated because it comes from observed maxima.
 func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
 	if len(fans) > 0 {
 		if estimate, found := estimateFanDutyCyclePctFromObservation(fans); found {
 			return estimate, true, true
 		}
 	}
 	return 0, false, false
 }
 func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -1,6 +1,7 @@
 package platform
 import (
 	"path/filepath"
 	"testing"
 	"time"
 )
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
 	}
 }
 // TestEstimateFanDutyCyclePctFromObservation checks the observed-maximum
 // pipeline end to end: a single spike must not set a maximum, a sustained
 // peak must, and the accepted maximum must survive a reload from disk.
 //
 // The test swaps package-level state (fanObservationStatePath, fanObservation,
 // fanObservationInit, fanPeakCandidates), so it must NOT call t.Parallel():
 // any concurrently running test that samples fans would race on those
 // globals and could read or write this test's temporary state file.
 func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
 	oldPath := fanObservationStatePath
 	oldState := fanObservation
 	oldInit := fanObservationInit
 	oldCandidates := fanPeakCandidates
 	fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
 	fanObservation = fanObservationState{}
 	fanObservationInit = false
 	fanPeakCandidates = make(map[string]fanPeakCandidate)
 	t.Cleanup(func() {
 		fanObservationStatePath = oldPath
 		fanObservation = oldState
 		fanObservationInit = oldInit
 		fanPeakCandidates = oldCandidates
 	})
 	start := time.Unix(100, 0)
 	// A lone reading above the (empty) maximum only creates a candidate.
 	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
 	if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
 		t.Fatalf("single-sample spike should not establish observed max")
 	}
 	// Still inside the hold window at +0.5s; accepted at +1.5s. The peak of
 	// 5200 RPM is rounded up to 6000, so 2600 RPM is ~43.3%.
 	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
 	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
 	got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
 	if !ok {
 		t.Fatalf("expected estimated duty cycle from persisted observed max")
 	}
 	if got < 43 || got > 44 {
 		t.Fatalf("got=%v want ~43.3", got)
 	}
 	// Drop the in-memory state to force a reload from the state file.
 	fanObservation = fanObservationState{}
 	fanObservationInit = false
 	fanPeakCandidates = make(map[string]fanPeakCandidate)
 	got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
 	if !ok {
 		t.Fatalf("expected persisted observed max to be reloaded from disk")
 	}
 	if got < 43 || got > 44 {
 		t.Fatalf("reloaded got=%v want ~43.3", got)
 	}
 }
 func TestParseDCMIPowerReading(t *testing.T) {
 	raw := `
 Instantaneous power reading:                   512 Watts
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
 	}
 }
 // TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth verifies that a
 // zero duration yields a dcgmi command carrying only the test name and the
 // GPU selection, in the order the indices were given.
 func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
 	want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
 	cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
 	if len(cmd) != len(want) {
 		t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
 	}
 	for i, expected := range want {
 		if actual := cmd[i]; actual != expected {
 			t.Fatalf("cmd[%d]=%q want %q", i, actual, expected)
 		}
 	}
 }
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
 	if len(env) != 2 {
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
 			`<code>all_reduce_perf</code> (NCCL tests)`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
 			`<code>nvbandwidth</code>`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
 		)) +
 		`</div>` +
 		`</div>
@@ -1527,8 +1527,6 @@ function satModeChanged() {
    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
    {card: 'sat-card-nvidia-targeted-power',  hint: 'sat-tp-mode-hint'},
    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
    {card: 'sat-card-nvidia-interconnect',    hint: 'sat-ni-mode-hint'},
    {card: 'sat-card-nvidia-bandwidth',       hint: 'sat-nb-mode-hint'},
  ].forEach(function(item) {
    const card = document.getElementById(item.card);
    if (card) {
@@ -1776,7 +1774,7 @@ function runAllSAT() {
  const cycles = 1;
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -2082,7 +2080,7 @@ func renderBenchmark(opts HandlerOptions) string {
  </div>
 </div>
-`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
+` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
 <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2517,7 +2515,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
 func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
-<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
 <div class="card" style="margin-bottom:16px">
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 	}
 }
 // TestValidatePageRendersNvidiaFabricCardsInValidateMode requests /validate
 // and checks that the NCCL and NVBandwidth cards are rendered with their
 // Validate-mode descriptions.
 func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	req := httptest.NewRequest(http.MethodGet, "/validate", nil)
 	handler.ServeHTTP(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	needles := []string{
 		`NVIDIA Interconnect (NCCL)`,
 		`Runs in Validate and Stress.`,
 		`NVIDIA Bandwidth (NVBandwidth)`,
 		`Intended to stay short enough for Validate.`,
 	}
 	for _, needle := range needles {
 		if strings.Contains(body, needle) {
 			continue
 		}
 		t.Fatalf("validate page missing %q: %s", needle, body)
 	}
 }
 func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -736,15 +736,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		dur := t.params.Duration
+		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
 			DurationSec: dur,
 			Loader:      platform.NvidiaStressLoaderNCCL,
 			GPUIndices:  t.params.GPUIndices,
 		}, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -713,6 +713,19 @@ static const struct profile_desc k_profiles[] = {
 #define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
 /* Decide whether a profile takes part in the current run.
  * Returns 0 when the profile is disabled or needs a newer compute capability
  * than cc. With a precision_filter, only the profile whose block_label equals
  * the filter is allowed. Without one (mixed/all phases), fp64 and fp4 are
  * excluded: both paths are unstable on the current benchmark fleet and can
  * abort the whole mixed pass after earlier phases already collected useful
  * telemetry. */
 static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
    const char *label;
    if (!desc->enabled || cc < desc->min_cc) {
        return 0;
    }
    label = desc->block_label;
    if (precision_filter != NULL) {
        return strcmp(label, precision_filter) == 0;
    }
    if (strcmp(label, "fp64") == 0) {
        return 0;
    }
    if (strcmp(label, "fp4") == 0) {
        return 0;
    }
    return 1;
 }
 static int load_cublaslt(struct cublaslt_api *api) {
    memset(api, 0, sizeof(*api));
    api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -1222,8 +1235,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    /* Count profiles matching the filter (for deciding what to run). */
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            (precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
            planned++;
        }
    }
@@ -1240,7 +1252,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
     * profiles matching precision_filter. */
    int planned_total = 0;
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned_total++;
        }
    }
@@ -1310,10 +1322,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                          desc->min_cc);
            continue;
        }
-        if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
+        if (!profile_allowed_for_run(desc, cc, precision_filter)) {
            append_detail(report->details,
                          sizeof(report->details),
-                          "%s=SKIPPED precision_filter\n",
+                          "%s=SKIPPED benchmark_disabled\n",
                          desc->name);
            continue;
        }
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -1,117 +0,0 @@
 #!/bin/sh
 # 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
 # Stop the hook at the first failing command.
 set -e
 echo "=== generating bee wallpaper ==="
 # Make sure the output directory exists before the Python program writes into it.
 mkdir -p /usr/share/bee
 # The Python program is passed to python3 on stdin; the quoted delimiter keeps
 # the shell from expanding anything inside the heredoc.
 python3 - <<'PYEOF'
 from PIL import Image, ImageDraw, ImageFont, ImageFilter
 import os
 # Canvas dimensions; every full-size image in this script is created as (W, H).
 W, H = 1920, 1080
 ASCII_ART = [
    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
 ]
 # Caption drawn below the ASCII-art logo.
 SUBTITLE = "  Hardware Audit LiveCD"
 # RGB colours: FG is the top logo layer, FG_DIM the offset layer beneath it,
 # SHADOW the blurred layer at the bottom, SUB the caption fill and BG the
 # canvas background.
 FG = (0xF6, 0xD0, 0x47)
 FG_DIM = (0xD4, 0xA9, 0x1C)
 SHADOW = (0x5E, 0x47, 0x05)
 SUB = (0x96, 0x7A, 0x17)
 BG = (0x05, 0x05, 0x05)
 # Font files tried in list order by load_font() for the logo glyphs.
 MONO_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
 ]
 # Font files tried in list order by load_font() for the caption.
 SUB_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
 ]
 def load_font(candidates, size):
    """Load the first font file in ``candidates`` that exists on disk.

    The paths are probed in order and the first hit is opened with
    ``ImageFont.truetype`` at ``size``. If no path exists, Pillow's built-in
    default font is returned instead (``size`` is not applied to it).
    """
    chosen = next((p for p in candidates if os.path.exists(p)), None)
    if chosen is None:
        return ImageFont.load_default()
    return ImageFont.truetype(chosen, size)
 def mono_metrics(font):
    """Return ``(char_w, char_h)`` for one character cell of ``font``.

    ``char_w`` is the text length of "M" rounded to an int; ``char_h`` is the
    vertical extent of the bounding box of "Mg". Measurements are taken on a
    throwaway greyscale canvas of size (W, H).
    """
    scratch = ImageDraw.Draw(Image.new('L', (W, H), 0))
    advance = scratch.textlength("M", font=font)
    _, top, _, bottom = scratch.textbbox((0, 0), "Mg", font=font)
    return int(round(advance)), bottom - top
 def render_ascii_mask(font, lines, char_w, char_h, line_gap):
    """Draw ``lines`` onto a fixed character grid and return it as an 'L' mask.

    Every non-space character is drawn individually at
    ``(col * char_w, row * (char_h + line_gap))`` with fill 255 on a black
    canvas. The canvas is as wide as the longest line and tall enough for all
    rows plus the gaps between them.
    """
    n_rows = len(lines)
    n_cols = max(map(len, lines))
    canvas = Image.new('L', (n_cols * char_w, n_rows * char_h + line_gap * (n_rows - 1)), 0)
    pen = ImageDraw.Draw(canvas)
    row_pitch = char_h + line_gap
    for r, text in enumerate(lines):
        top = r * row_pitch
        for c, glyph in enumerate(text):
            if glyph != ' ':
                pen.text((c * char_w, top), glyph, font=font, fill=255)
    return canvas
 # Start from a solid background canvas.
 img = Image.new('RGB', (W, H), BG)
 draw = ImageDraw.Draw(img)
 # Soft amber glow under the logo without depending on font rendering.
 # Two translucent ellipses are drawn on a transparent layer, blurred, and
 # alpha-composited over the background; img is RGBA from here on.
 glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
 glow_draw = ImageDraw.Draw(glow)
 glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
 glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
 glow = glow.filter(ImageFilter.GaussianBlur(60))
 img = Image.alpha_composite(img.convert('RGBA'), glow)
 # Pick the logo font size: measure a character cell at size 64, then scale
 # so the longest ASCII_ART row comes out about TARGET_LOGO_W wide (minimum
 # size 6).
 # NOTE(review): this assumes cell width scales linearly with font size, which
 # does not hold if load_font() fell back to the default font — confirm.
 TARGET_LOGO_W = 400
 max_chars = max(len(line) for line in ASCII_ART)
 _probe_font = load_font(MONO_FONT_CANDIDATES, 64)
 _probe_cw, _ = mono_metrics(_probe_font)
 font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
 font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
 char_w, char_h = mono_metrics(font_logo)
 # Rasterise the logo once as a mask (row gap 2), centred horizontally at a
 # fixed vertical position.
 logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
 logo_w, logo_h = logo_mask.size
 logo_x = (W - logo_w) // 2
 logo_y = 380
 # Stack three coloured copies of the mask, bottom to top: blurred shadow at
 # twice the offset, dim layer at one offset, bright layer at the origin.
 sh_off = max(1, font_size_logo // 6)
 shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
 img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
 img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
 img.paste(FG, (logo_x, logo_y), logo_mask)
 # Caption: centred horizontally by its bounding-box width, 48 below the logo.
 font_sub = load_font(SUB_FONT_CANDIDATES, 30)
 # draw is still bound to the pre-composite canvas here; it is used only to
 # measure the text, not to paint.
 sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
 sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
 sub_y = logo_y + logo_h + 48
 # Rebind draw to the current (composited) image before painting the caption.
 draw = ImageDraw.Draw(img)
 # Dark copy offset by (2, 2) first, then the caption itself on top.
 draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
 draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
 # Drop the alpha channel and write the PNG.
 img = img.convert('RGB')
 img.save('/usr/share/bee/wallpaper.png', optimize=True)
 print('wallpaper written: /usr/share/bee/wallpaper.png')
 PYEOF
 echo "=== wallpaper done ==="
Author	SHA1	Message	Date
Mikhail Chusavitin	dca4afb8d0	Seed power ramp with single-card TDP limits	2026-04-16 11:43:01 +03:00
Mikhail Chusavitin	b4280941f5	Move NCCL and NVBandwidth into validate mode	2026-04-16 11:02:30 +03:00
Mikhail Chusavitin	f74976ec4c	Use static overlay wallpaper in ISO build	2026-04-16 10:54:03 +03:00
Mikhail Chusavitin	18e24a9aa5	Estimate fan duty from observed RPM maxima	2026-04-16 10:10:18 +03:00
Mikhail Chusavitin	e306250da7	Disable fp64/fp4 in mixed gpu burn	2026-04-16 10:00:03 +03:00
Mikhail Chusavitin	c5b2081ac9	Disable unstable fp4/fp64 benchmark phases	2026-04-16 09:58:02 +03:00
Michael Chus	434528083e	Power bench: compare GPU-reported TDP vs IPMI server power delta - NvidiaPowerBenchResult gains ServerPower *BenchmarkServerPower - RunNvidiaPowerBench samples IPMI idle before Phase 1 and loaded via background goroutine throughout Phase 2 ramp - renderPowerBenchReport: new "Server vs GPU Power Comparison" table with ratio annotation (✓ match / ⚠ minor / ✗ over-report) - renderPowerBenchSummary: server_idle_w, server_loaded_w, server_delta_w, server_reporting_ratio keys Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 07:21:02 +03:00