Refactor power ramp to use true single-card baselines
Phase 1 now calibrates each GPU individually (sequentially) so that PowerRealizationPct reflects real degradation from neighbour thermals and shared power rails. Previously the baseline came from an all-GPU-together run, making realization always ≈100% at the final ramp step. Ramp step 1 reuses single-card calibration results (no extra run); steps 2..N run targeted_power on the growing GPU subset with derating active. Remove OccupiedSlots/OccupiedSlotsNote fields and occupiedSlots() helper — they were compensation for the old all-GPU calibration approach. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2831,15 +2831,6 @@ func powerBenchDurationSec(profile string) int {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// occupiedSlots returns the entries of indices that are not equal to current,
// preserving their original order. Every occurrence of current is dropped.
//
// The result is always a freshly allocated, non-nil slice (possibly empty), so
// callers may append to it or retain it without aliasing indices.
func occupiedSlots(indices []int, current int) []int {
	// Capacity is an upper bound: at most every index survives the filter.
	out := make([]int, 0, len(indices))
	for _, idx := range indices {
		if idx != current {
			out = append(out, idx)
		}
	}
	return out
}
|
|
||||||
|
|
||||||
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
|
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
|
||||||
out := make(map[int]benchmarkGPUInfo, len(src))
|
out := make(map[int]benchmarkGPUInfo, len(src))
|
||||||
@@ -2887,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
||||||
if gpu.OccupiedSlotsNote != "" {
|
|
||||||
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
|
|
||||||
}
|
|
||||||
for _, note := range gpu.Notes {
|
for _, note := range gpu.Notes {
|
||||||
fmt.Fprintf(&b, "- %s\n", note)
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
}
|
}
|
||||||
@@ -2955,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
durationSec := powerBenchDurationSec(opts.Profile)
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
_ = durationSec
|
_ = durationSec
|
||||||
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
|
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||||
|
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||||
|
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||||||
|
var allRestoreActions []benchmarkRestoreAction
|
||||||
|
for _, idx := range selected {
|
||||||
|
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||||
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
|
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
|
||||||
|
allRestoreActions = append(allRestoreActions, restore...)
|
||||||
|
if r, ok := c[idx]; ok {
|
||||||
|
calibByIndex[idx] = r
|
||||||
|
}
|
||||||
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
for i := len(restoreActions) - 1; i >= 0; i-- {
|
for i := len(allRestoreActions) - 1; i >= 0; i-- {
|
||||||
restoreActions[i].fn()
|
allRestoreActions[i].fn()
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
|
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
|
||||||
@@ -2975,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
occupied := occupiedSlots(selected, idx)
|
|
||||||
note := ""
|
|
||||||
if len(occupied) > 0 {
|
|
||||||
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
|
|
||||||
}
|
|
||||||
gpus = append(gpus, NvidiaPowerBenchGPU{
|
gpus = append(gpus, NvidiaPowerBenchGPU{
|
||||||
Index: idx,
|
Index: idx,
|
||||||
Name: info.Name,
|
Name: info.Name,
|
||||||
@@ -2991,8 +2989,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
CalibrationAttempts: calib.Attempts,
|
CalibrationAttempts: calib.Attempts,
|
||||||
Derated: calib.Derated,
|
Derated: calib.Derated,
|
||||||
Status: status,
|
Status: status,
|
||||||
OccupiedSlots: occupied,
|
|
||||||
OccupiedSlotsNote: note,
|
|
||||||
Notes: append([]string(nil), calib.Notes...),
|
Notes: append([]string(nil), calib.Notes...),
|
||||||
CoolingWarning: calib.CoolingWarning,
|
CoolingWarning: calib.CoolingWarning,
|
||||||
})
|
})
|
||||||
@@ -3032,14 +3028,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
for _, gpu := range gpus {
|
for _, gpu := range gpus {
|
||||||
singleByIndex[gpu.Index] = gpu
|
singleByIndex[gpu.Index] = gpu
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 2: ramp — add one GPU per step and calibrate the growing subset
|
||||||
|
// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
|
||||||
|
// targeted_power with derating if degradation is detected.
|
||||||
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
|
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
|
||||||
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
|
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
|
||||||
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
|
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
|
||||||
_ = os.MkdirAll(stepDir, 0755)
|
_ = os.MkdirAll(stepDir, 0755)
|
||||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
var stepCalib map[int]benchmarkPowerCalibrationResult
|
||||||
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
|
if step == 1 {
|
||||||
for i := len(stepRestore) - 1; i >= 0; i-- {
|
// Single-GPU step — already measured in phase 1; reuse directly.
|
||||||
stepRestore[i].fn()
|
stepCalib = calibByIndex
|
||||||
|
logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
|
||||||
|
} else {
|
||||||
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
|
var stepRestore []benchmarkRestoreAction
|
||||||
|
stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
|
||||||
|
for i := len(stepRestore) - 1; i >= 0; i-- {
|
||||||
|
stepRestore[i].fn()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
ramp := NvidiaPowerBenchStep{
|
ramp := NvidiaPowerBenchStep{
|
||||||
StepIndex: step,
|
StepIndex: step,
|
||||||
|
|||||||
@@ -280,8 +280,6 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||||
Derated bool `json:"derated,omitempty"`
|
Derated bool `json:"derated,omitempty"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
OccupiedSlots []int `json:"occupied_slots,omitempty"`
|
|
||||||
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
|
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
|
|||||||
Reference in New Issue
Block a user