audit: switch power benchmark load to dcgmproftester

This commit is contained in:
2026-04-20 06:57:14 +03:00
parent 65bcc9ce81
commit 17118298bd
3 changed files with 962 additions and 161 deletions

View File

@@ -240,6 +240,47 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
return nil
}
// benchmarkPowerEngine reports which load generator drives the power
// benchmark. The BEE_BENCH_POWER_ENGINE environment variable may select
// the targeted_power engine explicitly; any other value — including an
// unset or empty variable — falls back to dcgmproftester.
func benchmarkPowerEngine() string {
	requested := strings.ToLower(strings.TrimSpace(os.Getenv("BEE_BENCH_POWER_ENGINE")))
	if requested == BenchmarkPowerEngineTargetedPower {
		return BenchmarkPowerEngineTargetedPower
	}
	return BenchmarkPowerEngineDCGMProfTester
}
// benchmarkPowerEngineLabel returns a human-readable description of the
// given power engine, suitable for log lines and calibration notes.
func benchmarkPowerEngineLabel(engine string) string {
	if strings.ToLower(strings.TrimSpace(engine)) == BenchmarkPowerEngineTargetedPower {
		return "dcgmi diag targeted_power"
	}
	return "dcgmproftester"
}
func resolveBenchmarkPowerLoadCommand(durationSec int, gpuIndices []int) ([]string, []string, error) {
engine := benchmarkPowerEngine()
durationSec = normalizeNvidiaBurnDuration(durationSec)
switch engine {
case BenchmarkPowerEngineTargetedPower:
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), nil, nil
default:
if len(gpuIndices) > 1 {
return []string{
"bee-dcgmproftester-staggered",
"--seconds", strconv.Itoa(durationSec),
"--stagger-seconds", "0",
"--devices", joinIndexList(gpuIndices),
}, nil, nil
}
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(durationSec))
if err != nil {
return nil, nil, err
}
return cmd, nvidiaVisibleDevicesEnv(gpuIndices), nil
}
}
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
@@ -384,10 +425,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
// Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
}
}
@@ -430,7 +471,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
serverPowerStopCh := make(chan struct{})
serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
close(serverPowerStopCh)
if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
serverLoadedWSum += benchmarkMean(serverPowerSamples)
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (%s avg): %.0f W", idx, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
}
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
@@ -461,48 +511,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
beforeThrottle, _ := queryThrottleCounters(idx)
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
// Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// First sample after a short warmup delay.
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
}
afterThrottle, _ := queryThrottleCounters(idx)
if planErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
@@ -652,7 +660,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
if serverLoadedSamples > 0 {
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
}
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
result.Cooling = summarizeBenchmarkCooling(metricRows)
// Apply server-power penalty when IPMI reports the server delta is much
@@ -707,6 +715,7 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
if opts.SizeMB < 0 {
opts.SizeMB = 0
}
opts.ServerPowerSource = normalizeBenchmarkPowerSource(opts.ServerPowerSource)
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
return opts
@@ -2535,10 +2544,14 @@ loop:
}
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
// IPMI samples plus the GPU-reported average power during steady state.
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
sp := &BenchmarkServerPower{Available: ipmiAvailable}
if !ipmiAvailable {
// samples plus the GPU-reported average power during steady state.
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, source string, available bool) *BenchmarkServerPower {
sp := &BenchmarkServerPower{
Available: available,
Source: normalizeBenchmarkPowerSource(source),
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
}
if !available {
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
return sp
}
@@ -2671,10 +2684,10 @@ func runNvidiaBenchmarkParallel(
// Sample server idle power once.
if !*serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
*serverIdleW = w
*serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
}
}
@@ -2728,7 +2741,16 @@ func runNvidiaBenchmarkParallel(
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
serverPowerStopCh := make(chan struct{})
serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
close(serverPowerStopCh)
if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
*serverLoadedWSum += benchmarkMean(serverPowerSamples)
(*serverLoadedSamples)++
*serverLoadedOK = true
logFunc(fmt.Sprintf("GPUs %s: server loaded power (%s avg): %.0f W", allDevices, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
}
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
@@ -2770,46 +2792,6 @@ func runNvidiaBenchmarkParallel(
}
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
// Sample server power via IPMI in parallel with steady phase.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
*serverLoadedWSum += loadedW
(*serverLoadedSamples)++
*serverLoadedOK = true
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
}
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected {
afterThrottle[idx], _ = queryThrottleCounters(idx)
@@ -3040,8 +3022,8 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
return cl
}
// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
// actively watches throttle counters. seedLimits, when provided, are treated as
// runBenchmarkPowerCalibration runs the configured power-fit load for the supplied
// GPU set and actively watches throttle counters. seedLimits, when provided, are treated as
// the starting point for this calibration pass rather than as immutable fixed
// limits. This matters during cumulative ramp-up: once an additional GPU is
// introduced, every already-active GPU must be revalidated under the new
@@ -3070,10 +3052,19 @@ func runBenchmarkPowerCalibration(
// doubling each retry until it would exceed the cap, at which point the
// next busy response fails the calibration immediately.
const dcgmResourceBusyMaxDelaySec = 300
engine := benchmarkPowerEngine()
engineLabel := benchmarkPowerEngineLabel(engine)
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
if engine == BenchmarkPowerEngineTargetedPower {
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
}
} else {
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
}
}
if killed := KillTestWorkers(); len(killed) > 0 {
for _, p := range killed {
@@ -3206,7 +3197,7 @@ calibDone:
sharedAttempt++
for _, s := range active {
s.calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
logFunc(fmt.Sprintf("power calibration: GPU %d %s attempt %d at %d W for %ds", s.idx, engineLabel, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
}
// Snapshot throttle counters for all active GPUs before the run.
@@ -3215,14 +3206,22 @@ calibDone:
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
}
// Run targeted_power for ALL gpuIndices simultaneously so every card
// Run the selected power-fit load for ALL gpuIndices simultaneously so every card
// is under load during calibration — this reflects real server thermals.
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
cmd, env, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices)
if err != nil {
for _, s := range active {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("failed to resolve %s command: %v", engineLabel, err))
s.converged = true
}
logFunc(fmt.Sprintf("power calibration: failed to resolve %s command: %v", engineLabel, err))
break calibDone
}
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
}()
@@ -3245,8 +3244,8 @@ calibDone:
if err != nil {
continue
}
// Record throttle but do NOT cancel — let dcgmi finish so
// nv-hostengine releases the slot cleanly before the next attempt.
// Record throttle but do NOT cancel — let the load command finish so
// runtime resources release cleanly before the next attempt.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
throttleReasons[s.idx] = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
@@ -3359,9 +3358,9 @@ calibDone:
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
case ar.err != nil:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
logFunc(fmt.Sprintf("power calibration: GPU %d %s failed at %d W: %v", s.idx, engineLabel, s.appliedLimitW, ar.err))
default:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("%s attempt %d at %d W: no valid power telemetry", engineLabel, s.calib.Attempts, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
}
@@ -3384,7 +3383,7 @@ calibDone:
s.calib.Completed = true
}
} else {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
}
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
s.converged = true
@@ -3399,7 +3398,7 @@ calibDone:
next = (s.lo + s.hi) / 2
}
if next < s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
s.converged = true
continue
}
@@ -4117,13 +4116,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec(opts.Profile)
// Sample IPMI idle power before any GPU load.
// Sample server idle power before any GPU load.
var serverIdleW float64
var serverIdleOK bool
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
}
sdrIdle := sampleIPMISDRPowerSensors()
psuBefore := psuStatusSnapshot()
@@ -4141,26 +4140,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
ipmiSingleDone := make(chan float64, 1)
go func() {
defer close(ipmiSingleDone)
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
ipmiSingleDone <- w
}
}()
singlePowerStopCh := make(chan struct{})
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
ipmiSingleCancel()
close(singlePowerStopCh)
sdrSingle := sampleIPMISDRPowerSensors()
if sdrSingle.PSUInW > 0 {
if samples := <-singlePowerCh; len(samples) > 0 {
singleIPMILoadedW[idx] = benchmarkMean(samples)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
singleIPMILoadedW[idx] = sdrSingle.PSUInW
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (SDR PSU AC input)", idx, sdrSingle.PSUInW))
} else if w, ok := <-ipmiSingleDone; ok {
singleIPMILoadedW[idx] = w
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (DCMI)", idx, w))
} else {
<-ipmiSingleDone // drain channel
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
}
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
@@ -4234,11 +4225,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
}
if len(result.RecommendedSlotOrder) > 0 {
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card %s: %s.", benchmarkPowerEngineLabel(benchmarkPowerEngine()), joinIndexList(result.RecommendedSlotOrder)))
}
for _, gpu := range gpus {
if gpu.Derated {
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete %s.", gpu.Index, gpu.AppliedPowerLimitW, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
}
if gpu.CoolingWarning != "" {
result.Findings = append(result.Findings, fmt.Sprintf(
@@ -4255,7 +4246,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// Phase 2: cumulative thermal ramp.
// Each step introduces one new GPU into an environment where all previously
// calibrated GPUs are already running at their fixed stable limits. The new
// GPU's stable TDP is searched via binary search (targeted_power) under real
// GPU's stable TDP is searched via binary search under real
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
// subsequent steps. This ensures each GPU's limit reflects actual sustained
// power in the final full-system thermal state.
@@ -4294,7 +4285,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
if !firstCalib.Completed {
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
result.OverallStatus = "PARTIAL"
} else if firstCalib.Derated {
ramp.Status = "PARTIAL"
@@ -4340,21 +4331,15 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
ipmiStepDone := make(chan float64, 1)
go func() {
defer close(ipmiStepDone)
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
ipmiStepDone <- w
}
}()
stepPowerStopCh := make(chan struct{})
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
ipmiStepCancel()
close(stepPowerStopCh)
var stepIPMILoadedW float64
var stepIPMIOK bool
if w, ok := <-ipmiStepDone; ok {
stepIPMILoadedW = w
if samples := <-stepPowerCh; len(samples) > 0 {
stepIPMILoadedW = benchmarkMean(samples)
stepIPMIOK = true
}
// Accumulate restore actions; they all run in the outer defer.
@@ -4391,7 +4376,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes,
fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
fmt.Sprintf("GPU %d did not complete %s in ramp step %d; keeping previous stable limit %d W", idx, benchmarkPowerEngineLabel(benchmarkPowerEngine()), step, fallback))
result.OverallStatus = "PARTIAL"
continue
}
@@ -4427,24 +4412,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp.PSUSlotReadings = sdrStep.PSUSlots
}
if sdrStep.PSUInW > 0 {
// SDR PSU sum is available — use it for server power (includes all PSUs).
ramp.ServerLoadedW = sdrStep.PSUInW
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW))
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = sdrStep.PSUInW
serverLoadedOK = true
sdrLastStep = sdrStep
}
} else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
ramp.ServerLoadedW = stepIPMILoadedW
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW))
logFunc(fmt.Sprintf("power ramp: step %d server loaded power (%s avg): %.0f W", step, opts.ServerPowerSource, stepIPMILoadedW))
// The last step has all GPUs loaded — use it as the top-level loaded_w.
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = stepIPMILoadedW
serverLoadedOK = true
sdrLastStep = sdrStep
}
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
ramp.ServerLoadedW = sdrStep.PSUInW
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = sdrStep.PSUInW
serverLoadedOK = true
sdrLastStep = sdrStep
}
}
@@ -4502,7 +4487,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
gpuActualSumW = result.PlatformMaxTDPW
}
_ = serverIdleOK // used implicitly via characterizeServerPower
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK)
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
if result.ServerPower != nil {