From a5e0261ff2afb43c0c7d93c2427536a9459b3f97 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Tue, 14 Apr 2026 23:47:57 +0300 Subject: [PATCH] Refactor power ramp to use true single-card baselines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 now calibrates each GPU individually (sequentially) so that PowerRealizationPct reflects real degradation from neighbour thermals and shared power rails. Previously the baseline came from an all-GPU-together run, making realization always ≈100% at the final ramp step. Ramp step 1 reuses single-card calibration results (no extra run); steps 2..N run targeted_power on the growing GPU subset with derating active. Remove OccupiedSlots/OccupiedSlotsNote fields and occupiedSlots() helper — they were compensation for the old all-GPU calibration approach. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 60 ++++++++++++---------- audit/internal/platform/benchmark_types.go | 2 - 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 8c33be0..6f1295d 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -2831,15 +2831,6 @@ func powerBenchDurationSec(profile string) int { } } -func occupiedSlots(indices []int, current int) []int { - out := make([]int, 0, len(indices)) - for _, idx := range indices { - if idx != current { - out = append(out, idx) - } - } - return out -} func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo { out := make(map[int]benchmarkGPUInfo, len(src)) @@ -2887,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { b.WriteString("\n") for _, gpu := range result.GPUs { fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name) - if gpu.OccupiedSlotsNote != "" { - fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote) - } + for _, note := range gpu.Notes { fmt.Fprintf(&b, "- %s\n", note) } @@ -2955,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } durationSec := powerBenchDurationSec(opts.Profile) _ = durationSec - calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc) + // Phase 1: calibrate each GPU individually (sequentially, one at a time) to + // establish a true single-card power baseline unaffected by neighbour heat. + calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected)) + var allRestoreActions []benchmarkRestoreAction + for _, idx := range selected { + singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx)) + _ = os.MkdirAll(singleDir, 0755) + singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex) + logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) + c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc) + allRestoreActions = append(allRestoreActions, restore...) + if r, ok := c[idx]; ok { + calibByIndex[idx] = r + } + } defer func() { - for i := len(restoreActions) - 1; i >= 0; i-- { - restoreActions[i].fn() + for i := len(allRestoreActions) - 1; i >= 0; i-- { + allRestoreActions[i].fn() } }() gpus := make([]NvidiaPowerBenchGPU, 0, len(selected)) @@ -2975,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N result.OverallStatus = "PARTIAL" } } - occupied := occupiedSlots(selected, idx) - note := "" - if len(occupied) > 0 { - note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied)) - } gpus = append(gpus, NvidiaPowerBenchGPU{ Index: idx, Name: info.Name, @@ -2991,8 +2989,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N CalibrationAttempts: calib.Attempts, Derated: calib.Derated, Status: status, - OccupiedSlots: occupied, - OccupiedSlotsNote: note, Notes: append([]string(nil), calib.Notes...), CoolingWarning: calib.CoolingWarning, }) @@ -3032,14 +3028,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N for _, gpu := range gpus { singleByIndex[gpu.Index] = gpu } + + // Phase 2: ramp — add one GPU per step and calibrate the growing subset + // simultaneously. Step 1 reuses single-card results; steps 2..N run fresh + // targeted_power with derating if degradation is detected. for step := 1; step <= len(result.RecommendedSlotOrder); step++ { subset := append([]int(nil), result.RecommendedSlotOrder[:step]...) stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step)) _ = os.MkdirAll(stepDir, 0755) - stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) - stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc) - for i := len(stepRestore) - 1; i >= 0; i-- { - stepRestore[i].fn() + var stepCalib map[int]benchmarkPowerCalibrationResult + if step == 1 { + // Single-GPU step — already measured in phase 1; reuse directly. + stepCalib = calibByIndex + logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0])) + } else { + stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) + var stepRestore []benchmarkRestoreAction + stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc) + for i := len(stepRestore) - 1; i >= 0; i-- { + stepRestore[i].fn() + } } ramp := NvidiaPowerBenchStep{ StepIndex: step, diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index ea9330a..f09dea7 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -280,8 +280,6 @@ type NvidiaPowerBenchGPU struct { CalibrationAttempts int `json:"calibration_attempts,omitempty"` Derated bool `json:"derated,omitempty"` Status string `json:"status"` - OccupiedSlots []int `json:"occupied_slots,omitempty"` - OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"` Notes []string `json:"notes,omitempty"` // CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow. CoolingWarning string `json:"cooling_warning,omitempty"`