Refactor power ramp to use true single-card baselines

Phase 1 now calibrates each GPU individually (sequentially) so that PowerRealizationPct reflects real degradation from neighbour thermals and shared power rails. Previously the baseline came from an all-GPU-together run, making realization always ≈100% at the final ramp step.

Ramp step 1 reuses the single-card calibration results (no extra run); steps 2..N run targeted_power on the growing GPU subset with derating active.

Remove the OccupiedSlots/OccupiedSlotsNote fields and the occupiedSlots() helper — they only compensated for the old all-GPU calibration approach.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 23:47:57 +03:00
parent ee422ede3c
commit a5e0261ff2
2 changed files with 34 additions and 28 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -2831,15 +2831,6 @@ func powerBenchDurationSec(profile string) int {
 	}
 }
 // occupiedSlots returns every entry of indices that is not equal to current,
 // in the same order as the input. The result is always a freshly allocated,
 // non-nil slice, so the caller's slice is never aliased or modified.
 func occupiedSlots(indices []int, current int) []int {
 	others := make([]int, 0, len(indices))
 	for i := range indices {
 		if indices[i] == current {
 			// Skip the slot under test; only its neighbours are reported.
 			continue
 		}
 		others = append(others, indices[i])
 	}
 	return others
 }
 func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
 	out := make(map[int]benchmarkGPUInfo, len(src))
@@ -2887,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	b.WriteString("\n")
 	for _, gpu := range result.GPUs {
 		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
-		if gpu.OccupiedSlotsNote != "" {
+
 			fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
 		}
 		for _, note := range gpu.Notes {
 			fmt.Fprintf(&b, "- %s\n", note)
 		}
@@ -2955,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	}
 	durationSec := powerBenchDurationSec(opts.Profile)
 	_ = durationSec
-	calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
+	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
 	var allRestoreActions []benchmarkRestoreAction
 	for _, idx := range selected {
 		singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
 		_ = os.MkdirAll(singleDir, 0755)
 		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
 		c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
 		allRestoreActions = append(allRestoreActions, restore...)
 		if r, ok := c[idx]; ok {
 			calibByIndex[idx] = r
 		}
 	}
 	defer func() {
-		for i := len(restoreActions) - 1; i >= 0; i-- {
+		for i := len(allRestoreActions) - 1; i >= 0; i-- {
-			restoreActions[i].fn()
+			allRestoreActions[i].fn()
 		}
 	}()
 	gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
@@ -2975,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 				result.OverallStatus = "PARTIAL"
 			}
 		}
 		occupied := occupiedSlots(selected, idx)
 		note := ""
 		if len(occupied) > 0 {
 			note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
 		}
 		gpus = append(gpus, NvidiaPowerBenchGPU{
 			Index:               idx,
 			Name:                info.Name,
@@ -2991,8 +2989,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			CalibrationAttempts: calib.Attempts,
 			Derated:             calib.Derated,
 			Status:              status,
 			OccupiedSlots:       occupied,
 			OccupiedSlotsNote:   note,
 			Notes:               append([]string(nil), calib.Notes...),
 			CoolingWarning:      calib.CoolingWarning,
 		})
@@ -3032,14 +3028,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	for _, gpu := range gpus {
 		singleByIndex[gpu.Index] = gpu
 	}
 	// Phase 2: ramp — add one GPU per step and calibrate the growing subset
 	// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
 	// targeted_power with derating if degradation is detected.
 	for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
 		subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
 		stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
 		_ = os.MkdirAll(stepDir, 0755)
-		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
+		var stepCalib map[int]benchmarkPowerCalibrationResult
-		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
+		if step == 1 {
-		for i := len(stepRestore) - 1; i >= 0; i-- {
+			// Single-GPU step — already measured in phase 1; reuse directly.
-			stepRestore[i].fn()
+			stepCalib = calibByIndex
 			logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
 		} else {
 			stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 			var stepRestore []benchmarkRestoreAction
 			stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
 			for i := len(stepRestore) - 1; i >= 0; i-- {
 				stepRestore[i].fn()
 			}
 		}
 		ramp := NvidiaPowerBenchStep{
 			StepIndex:  step,
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -280,8 +280,6 @@ type NvidiaPowerBenchGPU struct {
 	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
 	Derated             bool     `json:"derated,omitempty"`
 	Status              string   `json:"status"`
 	OccupiedSlots       []int    `json:"occupied_slots,omitempty"`
 	OccupiedSlotsNote   string   `json:"occupied_slots_note,omitempty"`
 	Notes               []string `json:"notes,omitempty"`
 	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
 	CoolingWarning string `json:"cooling_warning,omitempty"`