Use SDR PSU AC input for per-step server power in power ramp
When sdrStep.PSUInW is available, prefer it over DCMI for ramp.ServerLoadedW and ramp.ServerDeltaW. DCMI on this platform (MSI, 4 PSUs) reports roughly half of the actual draw; SDR sums all PSU_POWER_IN sensors correctly.

The delta is now computed SDR-to-SDR (sdrStep.PSUInW - sdrIdle.PSUInW) for consistency. The DCMI path is kept as a fallback when SDR has no PSU data. The log message now indicates the source (SDR PSU AC input vs. DCMI).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4346,7 +4346,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
if w, ok := <-ipmiStepDone; ok {
|
if w, ok := <-ipmiStepDone; ok {
|
||||||
stepIPMILoadedW = w
|
stepIPMILoadedW = w
|
||||||
stepIPMIOK = true
|
stepIPMIOK = true
|
||||||
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
|
|
||||||
}
|
}
|
||||||
// Accumulate restore actions; they all run in the outer defer.
|
// Accumulate restore actions; they all run in the outer defer.
|
||||||
allRestoreActions = append(allRestoreActions, stepRestore...)
|
allRestoreActions = append(allRestoreActions, stepRestore...)
|
||||||
@@ -4410,9 +4409,27 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||||
}
|
}
|
||||||
|
|
||||||
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
// Per-step PSU slot snapshot — also used as the authoritative loaded power
|
||||||
|
// source when SDR PSU sensors are available (more accurate than DCMI on
|
||||||
|
// servers where DCMI covers only a subset of installed PSUs).
|
||||||
|
sdrStep := sampleIPMISDRPowerSensors()
|
||||||
|
if len(sdrStep.PSUSlots) > 0 {
|
||||||
|
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||||
|
}
|
||||||
|
|
||||||
|
if sdrStep.PSUInW > 0 {
|
||||||
|
// SDR PSU sum is available — use it for server power (includes all PSUs).
|
||||||
|
ramp.ServerLoadedW = sdrStep.PSUInW
|
||||||
|
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
||||||
|
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW))
|
||||||
|
if step == len(result.RecommendedSlotOrder) {
|
||||||
|
serverLoadedW = sdrStep.PSUInW
|
||||||
|
serverLoadedOK = true
|
||||||
|
}
|
||||||
|
} else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
||||||
ramp.ServerLoadedW = stepIPMILoadedW
|
ramp.ServerLoadedW = stepIPMILoadedW
|
||||||
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
|
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
|
||||||
|
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW))
|
||||||
// The last step has all GPUs loaded — use it as the top-level loaded_w.
|
// The last step has all GPUs loaded — use it as the top-level loaded_w.
|
||||||
if step == len(result.RecommendedSlotOrder) {
|
if step == len(result.RecommendedSlotOrder) {
|
||||||
serverLoadedW = stepIPMILoadedW
|
serverLoadedW = stepIPMILoadedW
|
||||||
@@ -4420,12 +4437,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Per-step PSU slot snapshot.
|
|
||||||
sdrStep := sampleIPMISDRPowerSensors()
|
|
||||||
if len(sdrStep.PSUSlots) > 0 {
|
|
||||||
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fan state at end of ramp step.
|
// Fan state at end of ramp step.
|
||||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
||||||
ramp.AvgFanRPM = meanFanRPM(fans)
|
ramp.AvgFanRPM = meanFanRPM(fans)
|
||||||
|
|||||||
Reference in New Issue
Block a user