Use SDR PSU AC input for per-step server power in power ramp
When sdrStep.PSUInW is available, prefer it over DCMI for ramp.ServerLoadedW and ramp.ServerDeltaW. DCMI on this platform (MSI 4-PSU) reports roughly half of the actual draw, whereas SDR correctly sums all PSU_POWER_IN sensors. The delta is now computed SDR-to-SDR (sdrStep.PSUInW - sdrIdle.PSUInW) for consistency. The DCMI path is kept as a fallback for when SDR has no PSU data. The log message now indicates the source of the reading (SDR PSU AC input vs. DCMI). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4346,7 +4346,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
if w, ok := <-ipmiStepDone; ok {
|
||||
stepIPMILoadedW = w
|
||||
stepIPMIOK = true
|
||||
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
|
||||
}
|
||||
// Accumulate restore actions; they all run in the outer defer.
|
||||
allRestoreActions = append(allRestoreActions, stepRestore...)
|
||||
@@ -4410,9 +4409,27 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||
}
|
||||
|
||||
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
||||
// Per-step PSU slot snapshot — also used as the authoritative loaded power
|
||||
// source when SDR PSU sensors are available (more accurate than DCMI on
|
||||
// servers where DCMI covers only a subset of installed PSUs).
|
||||
sdrStep := sampleIPMISDRPowerSensors()
|
||||
if len(sdrStep.PSUSlots) > 0 {
|
||||
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||
}
|
||||
|
||||
if sdrStep.PSUInW > 0 {
|
||||
// SDR PSU sum is available — use it for server power (includes all PSUs).
|
||||
ramp.ServerLoadedW = sdrStep.PSUInW
|
||||
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
||||
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW))
|
||||
if step == len(result.RecommendedSlotOrder) {
|
||||
serverLoadedW = sdrStep.PSUInW
|
||||
serverLoadedOK = true
|
||||
}
|
||||
} else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
||||
ramp.ServerLoadedW = stepIPMILoadedW
|
||||
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
|
||||
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW))
|
||||
// The last step has all GPUs loaded — use it as the top-level loaded_w.
|
||||
if step == len(result.RecommendedSlotOrder) {
|
||||
serverLoadedW = stepIPMILoadedW
|
||||
@@ -4420,12 +4437,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
}
|
||||
}
|
||||
|
||||
// Per-step PSU slot snapshot.
|
||||
sdrStep := sampleIPMISDRPowerSensors()
|
||||
if len(sdrStep.PSUSlots) > 0 {
|
||||
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||
}
|
||||
|
||||
// Fan state at end of ramp step.
|
||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
||||
ramp.AvgFanRPM = meanFanRPM(fans)
|
||||
|
||||
Reference in New Issue
Block a user