Use SDR PSU AC input for per-step server power in power ramp

When sdrStep.PSUInW is available (> 0), prefer it over the DCMI reading
for ramp.ServerLoadedW and ramp.ServerDeltaW. DCMI on this platform
(MSI 4-PSU) reports roughly half of the actual draw; SDR sums all
PSU_POWER_IN sensors correctly.

The delta is now computed SDR-to-SDR (sdrStep.PSUInW - sdrIdle.PSUInW)
for consistency, so the loaded and idle readings come from the same
source. The DCMI path is kept as a fallback when SDR has no PSU data.
The log message now indicates the source (SDR PSU AC input vs. DCMI).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 18:43:36 +03:00
parent b89580c24d
commit 2cdf034bb0

View File

@@ -4346,7 +4346,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
if w, ok := <-ipmiStepDone; ok {
stepIPMILoadedW = w
stepIPMIOK = true
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
}
// Accumulate restore actions; they all run in the outer defer.
allRestoreActions = append(allRestoreActions, stepRestore...)
@@ -4410,9 +4409,27 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
}
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
// Per-step PSU slot snapshot — also used as the authoritative loaded power
// source when SDR PSU sensors are available (more accurate than DCMI on
// servers where DCMI covers only a subset of installed PSUs).
sdrStep := sampleIPMISDRPowerSensors()
if len(sdrStep.PSUSlots) > 0 {
ramp.PSUSlotReadings = sdrStep.PSUSlots
}
if sdrStep.PSUInW > 0 {
// SDR PSU sum is available — use it for server power (includes all PSUs).
ramp.ServerLoadedW = sdrStep.PSUInW
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW))
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = sdrStep.PSUInW
serverLoadedOK = true
}
} else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
ramp.ServerLoadedW = stepIPMILoadedW
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW))
// The last step has all GPUs loaded — use it as the top-level loaded_w.
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = stepIPMILoadedW
@@ -4420,12 +4437,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
}
// Per-step PSU slot snapshot.
sdrStep := sampleIPMISDRPowerSensors()
if len(sdrStep.PSUSlots) > 0 {
ramp.PSUSlotReadings = sdrStep.PSUSlots
}
// Fan state at end of ramp step.
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
ramp.AvgFanRPM = meanFanRPM(fans)