Refactor power ramp to use true single-card baselines
Phase 1 now calibrates each GPU individually (sequentially) so that PowerRealizationPct reflects real degradation from neighbour thermals and shared power rails. Previously the baseline came from an all-GPU-together run, making realization always ≈100% at the final ramp step.

Ramp step 1 reuses the single-card calibration results (no extra run); steps 2..N run targeted_power on the growing GPU subset with derating active.

Remove the OccupiedSlots/OccupiedSlotsNote fields and the occupiedSlots() helper — they were compensation for the old all-GPU calibration approach.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2831,15 +2831,6 @@ func powerBenchDurationSec(profile string) int {
|
||||
}
|
||||
}
|
||||
|
||||
// occupiedSlots returns every entry of indices that is not equal to current,
// keeping the entries in their original order. The returned slice is always
// non-nil (it is empty when nothing remains) and never aliases indices.
func occupiedSlots(indices []int, current int) []int {
	others := make([]int, 0, len(indices))
	for i := range indices {
		// Skip the slot under test; only its neighbours are reported.
		if indices[i] == current {
			continue
		}
		others = append(others, indices[i])
	}
	return others
}
|
||||
|
||||
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
|
||||
out := make(map[int]benchmarkGPUInfo, len(src))
|
||||
@@ -2887,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
b.WriteString("\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
||||
if gpu.OccupiedSlotsNote != "" {
|
||||
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
|
||||
}
|
||||
|
||||
for _, note := range gpu.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
@@ -2955,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
}
|
||||
durationSec := powerBenchDurationSec(opts.Profile)
|
||||
_ = durationSec
|
||||
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
|
||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||||
var allRestoreActions []benchmarkRestoreAction
|
||||
for _, idx := range selected {
|
||||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||
_ = os.MkdirAll(singleDir, 0755)
|
||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
|
||||
allRestoreActions = append(allRestoreActions, restore...)
|
||||
if r, ok := c[idx]; ok {
|
||||
calibByIndex[idx] = r
|
||||
}
|
||||
}
|
||||
defer func() {
|
||||
for i := len(restoreActions) - 1; i >= 0; i-- {
|
||||
restoreActions[i].fn()
|
||||
for i := len(allRestoreActions) - 1; i >= 0; i-- {
|
||||
allRestoreActions[i].fn()
|
||||
}
|
||||
}()
|
||||
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
|
||||
@@ -2975,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
result.OverallStatus = "PARTIAL"
|
||||
}
|
||||
}
|
||||
occupied := occupiedSlots(selected, idx)
|
||||
note := ""
|
||||
if len(occupied) > 0 {
|
||||
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
|
||||
}
|
||||
gpus = append(gpus, NvidiaPowerBenchGPU{
|
||||
Index: idx,
|
||||
Name: info.Name,
|
||||
@@ -2991,8 +2989,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
CalibrationAttempts: calib.Attempts,
|
||||
Derated: calib.Derated,
|
||||
Status: status,
|
||||
OccupiedSlots: occupied,
|
||||
OccupiedSlotsNote: note,
|
||||
Notes: append([]string(nil), calib.Notes...),
|
||||
CoolingWarning: calib.CoolingWarning,
|
||||
})
|
||||
@@ -3032,14 +3028,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
for _, gpu := range gpus {
|
||||
singleByIndex[gpu.Index] = gpu
|
||||
}
|
||||
|
||||
// Phase 2: ramp — add one GPU per step and calibrate the growing subset
|
||||
// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
|
||||
// targeted_power with derating if degradation is detected.
|
||||
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
|
||||
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
|
||||
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
|
||||
_ = os.MkdirAll(stepDir, 0755)
|
||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
|
||||
for i := len(stepRestore) - 1; i >= 0; i-- {
|
||||
stepRestore[i].fn()
|
||||
var stepCalib map[int]benchmarkPowerCalibrationResult
|
||||
if step == 1 {
|
||||
// Single-GPU step — already measured in phase 1; reuse directly.
|
||||
stepCalib = calibByIndex
|
||||
logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
|
||||
} else {
|
||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
var stepRestore []benchmarkRestoreAction
|
||||
stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
|
||||
for i := len(stepRestore) - 1; i >= 0; i-- {
|
||||
stepRestore[i].fn()
|
||||
}
|
||||
}
|
||||
ramp := NvidiaPowerBenchStep{
|
||||
StepIndex: step,
|
||||
|
||||
@@ -280,8 +280,6 @@ type NvidiaPowerBenchGPU struct {
|
||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
OccupiedSlots []int `json:"occupied_slots,omitempty"`
|
||||
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
|
||||
Reference in New Issue
Block a user