Refactor power ramp to use true single-card baselines

Phase 1 now calibrates each GPU individually (sequentially) so that
PowerRealizationPct reflects real degradation from neighbour thermals and
shared power rails. Previously the baseline came from an all-GPU-together
run, making realization always ≈100% at the final ramp step.

Ramp step 1 reuses single-card calibration results (no extra run); steps
2..N run targeted_power on the growing GPU subset with derating active.

Remove OccupiedSlots/OccupiedSlotsNote fields and occupiedSlots() helper —
they were compensation for the old all-GPU calibration approach.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 23:47:57 +03:00
parent ee422ede3c
commit a5e0261ff2
2 changed files with 34 additions and 28 deletions

View File

@@ -2831,15 +2831,6 @@ func powerBenchDurationSec(profile string) int {
}
}
// occupiedSlots returns every entry of indices that is not equal to current,
// preserving the original order. The result is a freshly allocated, non-nil
// slice, so the caller's slice is never aliased or modified.
func occupiedSlots(indices []int, current int) []int {
	others := make([]int, 0, len(indices))
	for pos := range indices {
		candidate := indices[pos]
		if candidate == current {
			// Skip the slot under test; only its neighbours are reported.
			continue
		}
		others = append(others, candidate)
	}
	return others
}
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
out := make(map[int]benchmarkGPUInfo, len(src))
@@ -2887,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
b.WriteString("\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
if gpu.OccupiedSlotsNote != "" {
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
}
for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
@@ -2955,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
var allRestoreActions []benchmarkRestoreAction
for _, idx := range selected {
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
calibByIndex[idx] = r
}
}
defer func() {
for i := len(restoreActions) - 1; i >= 0; i-- {
restoreActions[i].fn()
for i := len(allRestoreActions) - 1; i >= 0; i-- {
allRestoreActions[i].fn()
}
}()
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
@@ -2975,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.OverallStatus = "PARTIAL"
}
}
occupied := occupiedSlots(selected, idx)
note := ""
if len(occupied) > 0 {
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
}
gpus = append(gpus, NvidiaPowerBenchGPU{
Index: idx,
Name: info.Name,
@@ -2991,8 +2989,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
CalibrationAttempts: calib.Attempts,
Derated: calib.Derated,
Status: status,
OccupiedSlots: occupied,
OccupiedSlotsNote: note,
Notes: append([]string(nil), calib.Notes...),
CoolingWarning: calib.CoolingWarning,
})
@@ -3032,14 +3028,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
for _, gpu := range gpus {
singleByIndex[gpu.Index] = gpu
}
// Phase 2: ramp — add one GPU per step and calibrate the growing subset
// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
// targeted_power with derating if degradation is detected.
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
_ = os.MkdirAll(stepDir, 0755)
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
var stepCalib map[int]benchmarkPowerCalibrationResult
if step == 1 {
// Single-GPU step — already measured in phase 1; reuse directly.
stepCalib = calibByIndex
logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
} else {
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
var stepRestore []benchmarkRestoreAction
stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
}
}
ramp := NvidiaPowerBenchStep{
StepIndex: step,

View File

@@ -280,8 +280,6 @@ type NvidiaPowerBenchGPU struct {
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
Derated bool `json:"derated,omitempty"`
Status string `json:"status"`
OccupiedSlots []int `json:"occupied_slots,omitempty"`
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
Notes []string `json:"notes,omitempty"`
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
CoolingWarning string `json:"cooling_warning,omitempty"`