From 2cdf034bb0abfcd9eee7bbdef4301ec2c8b6b8b8 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 19 Apr 2026 18:43:36 +0300 Subject: [PATCH] Use SDR PSU AC input for per-step server power in power ramp When sdrStep.PSUInW is available, prefer it over DCMI for ramp.ServerLoadedW and ServerDeltaW. DCMI on this platform (MSI 4-PSU) reports ~half actual draw; SDR sums all PSU_POWER_IN sensors correctly. Delta is now SDR-to-SDR (sdrStep.PSUInW - sdrIdle.PSUInW) for consistency. DCMI path kept as fallback when SDR has no PSU data. Log message now indicates the source (SDR PSU AC input vs DCMI). Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 7a1b991..630bc83 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -4346,7 +4346,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N if w, ok := <-ipmiStepDone; ok { stepIPMILoadedW = w stepIPMIOK = true - logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w)) } // Accumulate restore actions; they all run in the outer defer. allRestoreActions = append(allRestoreActions, stepRestore...) @@ -4410,9 +4409,27 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) } - if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 { + // Per-step PSU slot snapshot — also used as the authoritative loaded power + // source when SDR PSU sensors are available (more accurate than DCMI on + // servers where DCMI covers only a subset of installed PSUs). + sdrStep := sampleIPMISDRPowerSensors() + if len(sdrStep.PSUSlots) > 0 { + ramp.PSUSlotReadings = sdrStep.PSUSlots + } + + if sdrStep.PSUInW > 0 { + // SDR PSU sum is available — use it for server power (includes all PSUs). + ramp.ServerLoadedW = sdrStep.PSUInW + ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW + logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW)) + if step == len(result.RecommendedSlotOrder) { + serverLoadedW = sdrStep.PSUInW + serverLoadedOK = true + } + } else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 { ramp.ServerLoadedW = stepIPMILoadedW ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW + logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW)) // The last step has all GPUs loaded — use it as the top-level loaded_w. if step == len(result.RecommendedSlotOrder) { serverLoadedW = stepIPMILoadedW @@ -4420,12 +4437,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } } - // Per-step PSU slot snapshot. - sdrStep := sampleIPMISDRPowerSensors() - if len(sdrStep.PSUSlots) > 0 { - ramp.PSUSlotReadings = sdrStep.PSUSlots - } - // Fan state at end of ramp step. if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 { ramp.AvgFanRPM = meanFanRPM(fans)