@@ -67,6 +67,13 @@ type benchmarkPowerCalibrationResult struct {
MetricRows [ ] GPUMetricRow
}
// benchmarkPowerCalibrationRunSummary bundles the IPMI SDR power summary and
// the fan averages that were sampled while a power calibration run was under
// load.
type benchmarkPowerCalibrationRunSummary struct {
	// LoadedSDR is the summarized SDR power series for the load phase.
	LoadedSDR benchmarkSDRSeriesSummary
	// AvgFanRPM and AvgFanDutyCyclePct are means over the fan samples taken
	// during the load phase; zero when no usable sample was recorded.
	AvgFanRPM, AvgFanDutyCyclePct float64
	// FanSamples is the number of fan RPM samples behind AvgFanRPM.
	FanSamples int
}
type benchmarkBurnProfile struct {
name string
category string
@@ -2413,6 +2420,16 @@ type sdrPowerSnapshot struct {
SkippedSensors [ ] string // sensors rejected during self-healing
}
// benchmarkSDRSeriesSummary is the aggregate of a series of sdrPowerSnapshot
// samples, as produced by summarizeSDRPowerSeries.
type benchmarkSDRSeriesSummary struct {
	// PSUInW, PSUOutW and GPUSlotW are the means of the corresponding snapshot
	// totals; only strictly positive readings contribute to each mean.
	PSUInW, PSUOutW, GPUSlotW float64
	// PSUSlots holds the per-slot mean readings, keyed by PSU slot. It is nil
	// when no snapshot carried per-slot data.
	PSUSlots map[string]BenchmarkPSUSlotPower
	// Samples is the number of snapshots that were summarized.
	Samples int
	// SkippedSensors is the sorted, de-duplicated union of the sensors that
	// the individual snapshots reported as skipped.
	SkippedSensors []string
}
// sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
type sdrSensor struct {
name string
@@ -2542,6 +2559,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
return snap
}
// startIPMISDRSampler launches a background goroutine that polls the IPMI SDR
// power sensors until stopCh is closed or delivers a value.
//
// One sample is taken immediately and then one per intervalSec seconds; a
// non-positive intervalSec falls back to benchmarkPowerAutotuneSampleInterval.
// Snapshots that carry no positive total and no per-slot readings are dropped.
// The returned channel yields the collected snapshots exactly once and is then
// closed.
func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot {
	if intervalSec <= 0 {
		intervalSec = benchmarkPowerAutotuneSampleInterval
	}
	// Buffered so the goroutine can finish even if the caller never reads.
	out := make(chan []sdrPowerSnapshot, 1)
	go func() {
		defer close(out)

		var collected []sdrPowerSnapshot
		takeSample := func() {
			snap := sampleIPMISDRPowerSensors()
			hasData := snap.PSUInW > 0 || snap.PSUOutW > 0 || snap.GPUSlotW > 0 || len(snap.PSUSlots) != 0
			if hasData {
				collected = append(collected, snap)
			}
		}

		// Sample once up front so short phases still yield a reading.
		takeSample()

		tick := time.NewTicker(time.Duration(intervalSec) * time.Second)
		defer tick.Stop()
		for {
			select {
			case <-tick.C:
				takeSample()
			case <-stopCh:
				out <- collected
				return
			}
		}
	}()
	return out
}
// summarizeSDRPowerSeries collapses a series of SDR power snapshots into a
// single summary.
//
// Totals and per-slot input/output values are averaged with benchmarkMean over
// the strictly positive readings only. A slot's status is the first non-empty
// status seen, replaced by the first non-"OK" status if it was "OK" so far.
// Skipped sensor names are de-duplicated and returned sorted. An empty input
// yields the zero summary.
func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary {
	if len(samples) == 0 {
		return benchmarkSDRSeriesSummary{}
	}

	type slotSeries struct {
		inW    []float64
		outW   []float64
		status string
	}
	var (
		psuIn, psuOut, gpuSlot []float64

		perSlot = make(map[string]*slotSeries)
		skipped = make(map[string]struct{})
	)

	for i := range samples {
		snap := &samples[i]

		if snap.PSUInW > 0 {
			psuIn = append(psuIn, snap.PSUInW)
		}
		if snap.PSUOutW > 0 {
			psuOut = append(psuOut, snap.PSUOutW)
		}
		if snap.GPUSlotW > 0 {
			gpuSlot = append(gpuSlot, snap.GPUSlotW)
		}

		for _, name := range snap.SkippedSensors {
			if name == "" {
				continue
			}
			skipped[name] = struct{}{}
		}

		for slot, reading := range snap.PSUSlots {
			series, seen := perSlot[slot]
			if !seen || series == nil {
				series = &slotSeries{}
				perSlot[slot] = series
			}
			if in := reading.InputW; in != nil && *in > 0 {
				series.inW = append(series.inW, *in)
			}
			if out := reading.OutputW; out != nil && *out > 0 {
				series.outW = append(series.outW, *out)
			}
			// Keep the first status seen, but let a non-OK status override OK.
			if reading.Status != "" {
				if series.status == "" || (series.status == "OK" && reading.Status != "OK") {
					series.status = reading.Status
				}
			}
		}
	}

	result := benchmarkSDRSeriesSummary{
		PSUInW:   benchmarkMean(psuIn),
		PSUOutW:  benchmarkMean(psuOut),
		GPUSlotW: benchmarkMean(gpuSlot),
		Samples:  len(samples),
	}

	if len(perSlot) > 0 {
		result.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(perSlot))
		for slot, series := range perSlot {
			entry := BenchmarkPSUSlotPower{Status: series.status}
			// avg is a fresh variable per if statement, so its address is unique.
			if avg := benchmarkMean(series.inW); avg > 0 {
				entry.InputW = &avg
			}
			if avg := benchmarkMean(series.outW); avg > 0 {
				entry.OutputW = &avg
			}
			result.PSUSlots[slot] = entry
		}
	}

	if len(skipped) > 0 {
		names := make([]string, 0, len(skipped))
		for name := range skipped {
			names = append(names, name)
		}
		// Map iteration order is random; sort for deterministic output.
		sort.Strings(names)
		result.SkippedSensors = names
	}

	return result
}
// collectIPMISDRPowerSeries samples the IPMI SDR power sensors for durationSec
// seconds (or until ctx is done, whichever comes first) and returns the
// summary of the collected series. A non-positive durationSec returns the zero
// summary without sampling. intervalSec is passed through to
// startIPMISDRSampler.
func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary {
	if durationSec <= 0 {
		return benchmarkSDRSeriesSummary{}
	}

	stop := make(chan struct{})
	seriesCh := startIPMISDRSampler(stop, intervalSec)

	window := time.NewTimer(time.Duration(durationSec) * time.Second)
	defer window.Stop()

	// Wait out the sampling window unless the context ends it early.
	select {
	case <-window.C:
	case <-ctx.Done():
	}

	close(stop)
	return summarizeSDRPowerSeries(<-seriesCh)
}
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW ( ) ( float64 , error ) {
@@ -3086,8 +3234,9 @@ func runBenchmarkPowerCalibration(
logFunc func ( string ) ,
seedLimits map [ int ] int ,
durationSec int ,
) ( map [ int ] benchmarkPowerCalibrationResult , [ ] benchmarkRestoreAction , [ ] GPUMetricRow ) {
) ( map [ int ] benchmarkPowerCalibrationResult , [ ] benchmarkRestoreAction , [ ] GPUMetricRow , benchmarkPowerCalibrationRunSummary ) {
calibDurationSec := durationSec
var runSummary benchmarkPowerCalibrationRunSummary
if calibDurationSec <= 0 {
calibDurationSec = 120
}
@@ -3105,12 +3254,12 @@ func runBenchmarkPowerCalibration(
if engine == BenchmarkPowerEngineTargetedPower {
if _ , err := exec . LookPath ( "dcgmi" ) ; err != nil {
logFunc ( "power calibration: dcgmi not found, skipping (will use default power limit)" )
return map [ int ] benchmarkPowerCalibrationResult { } , nil , nil
return map [ int ] benchmarkPowerCalibrationResult { } , nil , nil , runSummary
}
} else {
if _ , _ , err := resolveBenchmarkPowerLoadCommand ( calibDurationSec , gpuIndices ) ; err != nil {
logFunc ( "power calibration: dcgmproftester not found, skipping (will use default power limit)" )
return map [ int ] benchmarkPowerCalibrationResult { } , nil , nil
return map [ int ] benchmarkPowerCalibrationResult { } , nil , nil , runSummary
}
}
if killed := KillTestWorkers ( ) ; len ( killed ) > 0 {
@@ -3275,6 +3424,10 @@ calibDone:
}
attemptCtx , cancelAttempt := context . WithCancel ( ctx )
doneCh := make ( chan sharedAttemptResult , 1 )
sdrStopCh := make ( chan struct { } )
sdrDoneCh := startIPMISDRSampler ( sdrStopCh , benchmarkPowerAutotuneSampleInterval )
fanStopCh := make ( chan struct { } )
fanDoneCh := startBenchmarkFanSampler ( fanStopCh , benchmarkPowerAutotuneSampleInterval )
go func ( ) {
out , rows , err := runBenchmarkCommandWithMetrics ( attemptCtx , verboseLog , logName , cmd , env , gpuIndices , logFunc )
doneCh <- sharedAttemptResult { out : out , rows : rows , err : err }
@@ -3314,6 +3467,10 @@ calibDone:
}
ticker . Stop ( )
cancelAttempt ( )
close ( sdrStopCh )
close ( fanStopCh )
attemptSDRSummary := summarizeSDRPowerSeries ( <- sdrDoneCh )
attemptFanSummary := <- fanDoneCh
_ = os . WriteFile ( filepath . Join ( runDir , logName ) , ar . out , 0644 )
// Accumulate telemetry rows with attempt stage label.
appendBenchmarkMetrics ( & allCalibRows , ar . rows , fmt . Sprintf ( "attempt-%d" , sharedAttempt ) , & calibCursor , float64 ( calibDurationSec ) )
@@ -3351,10 +3508,14 @@ calibDone:
busyDelaySec = 1
// Per-GPU analysis and binary search update.
attemptStable := ar . err == nil
for _ , s := range active {
perGPU := filterRowsByGPU ( ar . rows , s . idx )
summary := summarizeBenchmarkTelemetry ( perGPU )
throttle := throttleReasons [ s . idx ]
if throttle != "" || summary . P95PowerW <= 0 {
attemptStable = false
}
// Cooling warning: thermal throttle with fans not at maximum.
if strings . Contains ( throttle , "thermal" ) && s . calib . CoolingWarning == "" {
@@ -3487,6 +3648,16 @@ calibDone:
s . calib . Notes = append ( s . calib . Notes , fmt . Sprintf ( "binary search: trying %d W (lo=%d hi=%d)" , next , s . lo , s . hi ) )
logFunc ( fmt . Sprintf ( "power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)" , s . idx , next , s . lo , s . hi ) )
}
if attemptStable {
if attemptSDRSummary . Samples > 0 {
runSummary . LoadedSDR = attemptSDRSummary
}
if attemptFanSummary . FanSamples > 0 {
runSummary . AvgFanRPM = attemptFanSummary . AvgFanRPM
runSummary . AvgFanDutyCyclePct = attemptFanSummary . AvgFanDutyCyclePct
runSummary . FanSamples = attemptFanSummary . FanSamples
}
}
}
for _ , s := range states {
@@ -3495,7 +3666,7 @@ calibDone:
}
}
writeBenchmarkMetricsFiles ( runDir , allCalibRows )
return results , restore , allCalibRows
return results , restore , allCalibRows , runSummary
}
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3540,6 +3711,47 @@ func meanFanRPM(fans []FanReading) float64 {
return sum / float64 ( len ( fans ) )
}
// startBenchmarkFanSampler launches a background goroutine that polls fan
// speeds until stopCh is closed or delivers a value.
//
// One sample is taken immediately and then one per intervalSec seconds; a
// non-positive intervalSec falls back to benchmarkPowerAutotuneSampleInterval.
// Failed or empty fan reads are ignored, as are non-positive RPM and duty
// values. The returned channel yields one summary holding the mean RPM, the
// mean duty cycle and the number of RPM samples, and is then closed.
func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary {
	if intervalSec <= 0 {
		intervalSec = benchmarkPowerAutotuneSampleInterval
	}
	// Buffered so the goroutine can finish even if the caller never reads.
	out := make(chan benchmarkPowerCalibrationRunSummary, 1)
	go func() {
		defer close(out)

		var rpms, duties []float64
		poll := func() {
			fans, err := sampleFanSpeeds()
			if err != nil || len(fans) == 0 {
				return
			}
			rpm := meanFanRPM(fans)
			if rpm > 0 {
				rpms = append(rpms, rpm)
			}
			duty, ok, _ := sampleFanDutyCyclePctFromFans(fans)
			if ok && duty > 0 {
				duties = append(duties, duty)
			}
		}

		// Sample once up front so short phases still yield a reading.
		poll()

		tick := time.NewTicker(time.Duration(intervalSec) * time.Second)
		defer tick.Stop()
		for {
			select {
			case <-tick.C:
				poll()
			case <-stopCh:
				var summary benchmarkPowerCalibrationRunSummary
				summary.AvgFanRPM = benchmarkMean(rpms)
				summary.AvgFanDutyCyclePct = benchmarkMean(duties)
				// The sample count tracks RPM readings only.
				summary.FanSamples = len(rpms)
				out <- summary
				return
			}
		}
	}()
	return out
}
func powerBenchDurationSec ( profile string ) int {
switch strings . TrimSpace ( strings . ToLower ( profile ) ) {
case NvidiaBenchmarkProfileStability :
@@ -3568,41 +3780,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt . Fprintf ( & b , "**Overall status:** %s \n" , result . OverallStatus )
fmt . Fprintf ( & b , "**Platform max TDP (GPU-reported):** %.0f W \n" , result . PlatformMaxTDPW )
if sp := result . ServerPower ; sp != nil && sp . Available {
fmt . Fprintf ( & b , "**Server power delta (IPMI DCMI):** %.0f W \n" , sp . DeltaW )
if sp . PSUInputLoadedW > 0 {
psuDelta := sp . PSUInputLoadedW - sp . PSUInputIdleW
fmt . Fprintf ( & b , "**PSU AC input Δ (IPMI SDR):** %.0f W \n" , psuDelta )
sourceLabel := "autotuned source"
switch normalizeBenchmarkPowerSource ( sp . Source ) {
case BenchmarkPowerSourceSDR PSUInput:
sourceLabel = "autotuned source (SDR PSU AC input)"
case BenchmarkPowerSourceDCMI :
sourceLabel = "autotuned source (DCMI)"
}
fmt . Fprintf ( & b , "**Reporting ratio (IPMI Δ / GPU actual sum ):** %.2 f \n" , sp . ReportingRatio )
fmt . Fprintf ( & b , "**Server power delta (%s ):** %.0 f W \n" , sourceLabel , sp . DeltaW )
fmt . Fprintf ( & b , "**Reporting ratio:** %.2f \n" , sp . ReportingRatio )
}
b . WriteString ( "\n" )
// Server power comparison table.
if sp := result . ServerPower ; sp != nil {
b . WriteString ( "## Server vs GPU Power Comparison\n\n" )
selectedSource := normalizeBenchmarkPowerSource ( sp . Source )
selectedSourceLabel := "Selected source"
if selectedSource == BenchmarkPowerSourceSDRPSUInput {
selectedSourceLabel = "Selected source (SDR PSU AC input)"
} else if selectedSource == BenchmarkPowerSourceDCMI {
selectedSourceLabel = "Selected source (DCMI)"
}
var spRows [ ] [ ] string
spRows = append ( spRows , [ ] string { "GPU stable limits sum" , "nvidia-smi " , fmt . Sprintf ( "%.0f W" , result . PlatformMaxTDP W) } )
spRows = append ( spRows , [ ] string { "GPU actual power sum (p95, last step)" , "nvidia-smi" , fmt . Sprintf ( "%.0f W" , sp . GPUReportedSumW ) } )
if sp . GPUSlotTotalW > 0 {
spRows = append ( spRows , [ ] string { "GPU PCIe slot power (at peak load)" , "IPMI SDR" , fmt . Sprintf ( "%.0f W" , sp . GPUSlotTotalW ) } )
}
spRows = append ( spRows , [ ] string { "GPU actual power sum (p95, last step) " , fmt . Sprintf ( "%.0f W" , sp . GPUReportedSum W) } )
if sp . Available {
spRows = append ( spRows , [ ] string { "Server idle power" , "IPMI DCMI ", fmt . Sprintf ( "%.0f W" , sp . IdleW ) } )
spRows = append ( spRows , [ ] string { "Server loaded power" , "IPMI DCMI ", fmt . Sprintf ( "%.0f W" , sp . LoadedW ) } )
spRows = append ( spRows , [ ] string { "Server Δ power (loaded − idle)" , "IPMI DCMI" , fmt . Sprintf ( "%.0f W" , sp . DeltaW ) } )
spRows = append ( spRows , [ ] string { selectedSourceLabel + " idle power ", fmt . Sprintf ( "%.0f W" , sp . IdleW ) } )
spRows = append ( spRows , [ ] string { selectedSourceLabel + " loaded power ", fmt . Sprintf ( "%.0f W" , sp . LoadedW ) } )
spRows = append ( spRows , [ ] string { selectedSourceLabel + " Δ power (loaded − idle)" , fmt . Sprintf ( "%.0f W" , sp . DeltaW ) } )
}
if sp . PSUInputLoadedW > 0 {
spRows = append ( spRows , [ ] string { "PSU AC input (idle)" , "IPMI SDR " , fmt . Sprintf ( "%.0f W" , sp . PSUInputIdleW ) } )
spRows = append ( spRows , [ ] string { "PSU AC input (loaded)" , "IPMI SDR " , fmt . Sprintf ( "%.0f W" , sp . PSUInputLoadedW ) } )
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp. PSUInputLoadedW > 0 {
spRows = append ( spRows , [ ] string { "PSU AC input (idle avg, pre-load phase) " , fmt . Sprintf ( "%.0f W" , sp . PSUInputIdleW ) } )
spRows = append ( spRows , [ ] string { "PSU AC input (loaded avg, final phase) " , fmt . Sprintf ( "%.0f W" , sp . PSUInputLoadedW ) } )
psuDelta := sp . PSUInputLoadedW - sp . PSUInputIdleW
spRows = append ( spRows , [ ] string { "PSU AC input Δ (loaded − idle)" , "IPMI SDR" , fmt . Sprintf ( "%.0f W" , psuDelta ) } )
}
if sp . PSUOutputLoadedW > 0 {
spRows = append ( spRows , [ ] string { "PSU DC output (idle)" , "IPMI SDR" , fmt . Sprintf ( "%.0f W" , sp . PSUOutputIdleW ) } )
spRows = append ( spRows , [ ] string { "PSU DC output (loaded)" , "IPMI SDR" , fmt . Sprintf ( "%.0f W" , sp . PSUOutputLoadedW ) } )
if sp . PSUInputLoadedW > 0 && sp . PSUInputIdleW > 0 {
psuEff := sp . PSUOutputIdleW / sp . PSUInputIdleW * 100
spRows = append ( spRows , [ ] string { "PSU conversion efficiency (idle)" , "IPMI SDR" , fmt . Sprintf ( "%.1f%%" , psuEff ) } )
}
spRows = append ( spRows , [ ] string { "PSU AC input Δ (loaded − idle)" , fmt . Sprintf ( "%.0f W" , psuDelta ) } )
}
if sp . Available {
ratio := sp . ReportingRatio
@@ -3619,8 +3829,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default :
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
}
spRows = append ( spRows , [ ] string { "Reporting ratio (DCMI Δ / GPU actual)" , "IPMI DCMI " , fmt . Sprintf ( "%.2f — %s" , ratio , ratioNote ) } )
if sp . PSUInputLoadedW > 0 && sp . GPUReportedSumW > 0 {
spRows = append ( spRows , [ ] string { "Reporting ratio" , fmt . Sprintf ( "%.2f — %s" , ratio , ratioNote ) } )
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp. PSUInputLoadedW > 0 && sp . GPUReportedSumW > 0 {
psuDelta := sp . PSUInputLoadedW - sp . PSUInputIdleW
sdrRatio := psuDelta / sp . GPUReportedSumW
sdrNote := ""
@@ -3632,12 +3842,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default :
sdrNote = "✗ significant discrepancy"
}
spRows = append ( spRows , [ ] string { "Reporting ratio (SDR PSU Δ / GPU actual)" , "IPMI SDR " , fmt . Sprintf ( "%.2f — %s" , sdrRatio , sdrNote ) } )
spRows = append ( spRows , [ ] string { "PSU AC input reporting ratio " , fmt . Sprintf ( "%.2f — %s" , sdrRatio , sdrNote ) } )
}
} else {
spRows = append ( spRows , [ ] string { "IPMI availability" , "—" , " not available — IPMI not supported or ipmitool not found" } )
spRows = append ( spRows , [ ] string { "IPMI availability" , "not available — IPMI not supported or ipmitool not found" } )
}
b . WriteString ( fmtMDTable ( [ ] string { "Metric" , "Source" , " Value" } , spRows ) )
b . WriteString ( fmtMDTable ( [ ] string { "Metric" , "Value" } , spRows ) )
for _ , note := range sp . Notes {
fmt . Fprintf ( & b , "\n> %s\n" , note )
}
@@ -3689,11 +3899,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
psuDistRows = append ( psuDistRows , [ ] string {
slot ,
fmtW ( idle . InputW ) , fmtW ( loaded . InputW ) ,
fmtW ( idle . OutputW ) , fmtW ( loaded . OutputW ) ,
deltaStr , status ,
} )
}
b . WriteString ( fmtMDTable ( [ ] string { "Slot" , "AC Input (idle)" , "AC Input (loaded)" , "DC Output (idle)" , "DC Output (loaded )" , "Load Δ" , "Status" } , psuDistRows ) )
b . WriteString ( fmtMDTable ( [ ] string { "Slot" , "AC Input (idle avg )" , "AC Input (loaded avg )" , "Load Δ" , "Status" } , psuDistRows ) )
b . WriteString ( "\n" )
}
}
@@ -3741,7 +3950,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fan ,
} )
}
b . WriteString ( fmtMDTable ( [ ] string { "GPU" , "Clock MHz (Mem MHz)" , "Avg Temp °C" , "Power W" , "Server Δ W" , "Fan RPM (duty%)" } , sgRows ) )
b . WriteString ( fmtMDTable ( [ ] string { "GPU" , "Clock MHz (Mem MHz)" , "Avg Temp °C" , "Power W" , "Server Δ W" , "Avg Fan RPM (duty%)" } , sgRows ) )
b . WriteString ( "\n" )
}
if len ( result . RecommendedSlotOrder ) > 0 {
@@ -3850,7 +4059,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
for _ , slot := range psuSlots {
psuHeaders = append ( psuHeaders , fmt . Sprintf ( "PSU %s W" , slot ) )
}
psuHeaders = append ( psuHeaders , "PSU Total W" , "Platform eff." , "Fan RPM (duty%)" )
psuHeaders = append ( psuHeaders , "PSU Total W" , "Platform eff." , "Avg Fan RPM (duty%)" )
var psuRows [ ] [ ] string
for _ , step := range result . RampSteps {
@@ -3931,7 +4140,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
pdRows = append ( pdRows , [ ] string {
fmt . Sprintf ( "GPU %d" , gpu . Index ) ,
fmt . Sprintf ( "%.0f W" , gpu . DefaultPowerLimitW ) ,
fmt . Sprintf ( "%.0f W" , gpu . AppliedPowerLimitW ) ,
fmt . Sprintf ( "%.0f W" , stable ) ,
realization ,
@@ -3944,13 +4152,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
pdRows = append ( pdRows , [ ] string {
"**Platform**" ,
fmt . Sprintf ( "**%.0f W**" , totalDefault ) ,
"—" ,
fmt . Sprintf ( "**%.0f W**" , totalStable ) ,
fmt . Sprintf ( "**%s**" , platformReal ) ,
"" ,
} )
b . WriteString ( fmtMDTable ( [ ] string { "GPU" , "Default TDP" , " Single-card limit" , "Stable limit" , "Realization" , "Derated" } , pdRows ) )
b . WriteString ( fmtMDTable ( [ ] string { "GPU" , "Single-card limit" , "Stable limit" , "Realization" , "Derated" } , pdRows ) )
b . WriteString ( "\n" )
// Balance across GPUs — only meaningful with 2+ GPUs.
@@ -4100,7 +4307,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
{ "Avg Temp °C" , singleTemp } ,
{ "Power W" , singlePwr } ,
{ "Per GPU wall W" , singleWall } ,
{ "Fan RPM (duty%)" , singleFan } ,
{ "Avg Fan RPM (duty%)" , singleFan } ,
}
if lastStep != nil {
compRows [ 0 ] = append ( compRows [ 0 ] , fmt . Sprintf ( "%s (%s)" , allClk , allMem ) )
@@ -4208,18 +4415,22 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// Sample server idle power before any GPU load.
var serverIdleW float64
var serverIdleOK bool
idleSDRStopCh := make ( chan struct { } )
idleSDRCh := startIPMISDRSampler ( idleSDRStopCh , benchmarkPowerAutotuneSampleInterval )
if w , ok := sampleBenchmarkPowerSourceSeries ( ctx , opts . ServerPowerSource , 10 , benchmarkPowerAutotuneSampleInterval ) ; ok {
serverIdleW = w
serverIdleOK = true
logFunc ( fmt . Sprintf ( "server idle power (%s): %.0f W" , opts . ServerPowerSource , w ) )
}
sdrIdle := sampleIPMISDRPowerSensors ( )
close ( idleSDRStopCh )
sdrIdle := summarizeSDRPowerSeries ( <- idleSDRCh )
psuBefore := psuStatusSnapshot ( )
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make ( map [ int ] benchmarkPowerCalibrationResult , len ( selected ) )
singleIPMILoadedW := make ( map [ int ] float64 , len ( selected ) )
singleRunSummaryByIndex := make ( map [ int ] benchmarkPowerCalibrationRunSummary , len ( selected ) )
var allRestoreActions [ ] benchmarkRestoreAction
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
var allPowerRows [ ] GPUMetricRow
@@ -4235,21 +4446,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
logFunc ( fmt . Sprintf ( "power calibration: GPU %d single-card baseline" , idx ) )
singlePowerStopCh := make ( chan struct { } )
singlePowerCh := startSelectedPowerSourceSampler ( singlePowerStopCh , opts . ServerPowerSource , benchmarkPowerAutotuneSampleInterval )
c , restore , singleRows := runBenchmarkPowerCalibration ( ctx , verboseLog , singleDir , [ ] int { idx } , singleInfo , logFunc , nil , durationSec )
c , restore , singleRows , singleRun := runBenchmarkPowerCalibration ( ctx , verboseLog , singleDir , [ ] int { idx } , singleInfo , logFunc , nil , durationSec )
appendBenchmarkMetrics ( & allPowerRows , singleRows , fmt . Sprintf ( "single-gpu-%d" , idx ) , & powerCursor , 0 )
close ( singlePowerStopCh )
sdrSingle := sampleIPMISDRPowerSensors ( )
if samples := <- singlePowerCh ; len ( samples ) > 0 {
singleIPMILoadedW [ idx ] = benchmarkMean ( samples )
logFunc ( fmt . Sprintf ( "power calibration: GPU %d single-card server power (%s avg): %.0f W" , idx , opts . ServerPowerSource , singleIPMILoadedW [ idx ] ) )
} else if opts . ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrS ingle . PSUInW > 0 {
singleIPMILoadedW [ idx ] = sdrS ingle . PSUInW
logFunc ( fmt . Sprintf ( "power calibration: GPU %d single-card fallback server power (SDR snapshot ): %.0f W" , idx , sdrS ingle . PSUInW ) )
} else if opts . ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun . LoadedSDR . PSUInW > 0 {
singleIPMILoadedW [ idx ] = singleRun . LoadedSDR . PSUInW
logFunc ( fmt . Sprintf ( "power calibration: GPU %d single-card fallback server power (SDR avg ): %.0f W" , idx , singleRun . LoadedSDR . PSUInW ) )
}
allRestoreActions = append ( allRestoreActions , restore ... )
if r , ok := c [ idx ] ; ok {
calibByIndex [ idx ] = r
}
singleRunSummaryByIndex [ idx ] = singleRun
}
defer func ( ) {
for i := len ( allRestoreActions ) - 1 ; i >= 0 ; i -- {
@@ -4292,11 +4503,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
t := summarizeBenchmarkTelemetry ( calib . MetricRows )
gpu . Telemetry = & t
}
if fans , err := sampleFanSpeeds ( ) ; err == nil && le n( fans ) > 0 {
gpu . AvgFanRPM = meanFanRPM ( fans )
if duty , ok , _ := sampleFanDutyCyclePctFromFans ( fans ) ; ok {
gpu . AvgFanDutyCyclePct = duty
}
if singleRun := singleRunSummaryByIndex [ idx ] ; singleRu n. AvgFanRPM > 0 {
gpu . AvgFanRPM = singleRun . AvgFanRPM
gpu . AvgFanDutyCyclePct = singleRun . AvgFanDutyCyclePct
}
gpus = append ( gpus , gpu )
}
@@ -4352,10 +4561,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
var serverLoadedW float64
var serverLoadedOK bool
// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
// after the test when GPUs have already returned to idle.
var sdrLastStep sdrPowerSnapshot
// sdrLastStep retains the phase-averaged SDR readings from the last ramp step
// while GPUs are loaded. Used in the summary instead of re-sampling after the
// test when GPUs have already returned to idle.
var sdrLastStep benchmarkSDRSeriesSummary
// Step 1: reuse single-card calibration result directly.
if len ( result . RecommendedSlotOrder ) > 0 {
@@ -4376,6 +4585,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp . ServerLoadedW = w
ramp . ServerDeltaW = w - serverIdleW
}
if singleRun := singleRunSummaryByIndex [ firstIdx ] ; singleRun . AvgFanRPM > 0 {
ramp . AvgFanRPM = singleRun . AvgFanRPM
ramp . AvgFanDutyCyclePct = singleRun . AvgFanDutyCyclePct
}
if ! firstCalib . Completed {
ramp . Status = "FAILED"
ramp . Notes = append ( ramp . Notes , fmt . Sprintf ( "GPU %d did not complete single-card %s" , firstIdx , benchmarkPowerEngineLabel ( benchmarkPowerEngine ( ) ) ) )
@@ -4426,7 +4639,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
stepInfo := cloneBenchmarkGPUInfoMap ( infoByIndex )
stepPowerStopCh := make ( chan struct { } )
stepPowerCh := startSelectedPowerSourceSampler ( stepPowerStopCh , opts . ServerPowerSource , benchmarkPowerAutotuneSampleInterval )
stepCalib , stepRestore , stepRows := runBenchmarkPowerCalibration ( ctx , verboseLog , stepDir , subset , stepInfo , logFunc , seedForStep , durationSec )
stepCalib , stepRestore , stepRows , stepRun := runBenchmarkPowerCalibration ( ctx , verboseLog , stepDir , subset , stepInfo , logFunc , seedForStep , durationSec )
appendBenchmarkMetrics ( & allPowerRows , stepRows , fmt . Sprintf ( "ramp-step-%d" , step ) , & powerCursor , 0 )
close ( stepPowerStopCh )
var stepIPMILoadedW float64
@@ -4497,10 +4710,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result . Findings = append ( result . Findings , fmt . Sprintf ( "Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load." , step , newGPUIdx , c . AppliedPowerLimitW ) )
}
// Per-step PSU slot snapshot — also used as the authoritative loaded power
// source when SDR PSU sensors are available (more accurate than DCMI on
// servers where DCMI covers only a subset of installed PSUs).
sdrStep := sampleIPMISDRPowerSensors ( )
// Per-step PSU slot readings are averaged over the whole load phase rather
// than captured as a single end-of-phase snapshot.
sdrStep := stepRun . LoadedSDR
if len ( sdrStep . PSUSlots ) > 0 {
ramp . PSUSlotReadings = sdrStep . PSUSlots
}
@@ -4518,7 +4730,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
} else if opts . ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep . PSUInW > 0 {
ramp . ServerLoadedW = sdrStep . PSUInW
ramp . ServerDeltaW = sdrStep . PSUInW - sdrIdle . PSUInW
logFunc ( fmt . Sprintf ( "power ramp: step %d fallback server loaded power (SDR snapshot ): %.0f W" , step , sdrStep . PSUInW ) )
logFunc ( fmt . Sprintf ( "power ramp: step %d fallback server loaded power (SDR avg ): %.0f W" , step , sdrStep . PSUInW ) )
if step == len ( result . RecommendedSlotOrder ) {
serverLoadedW = sdrStep . PSUInW
serverLoadedOK = true
@@ -4526,12 +4738,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
}
// Fan state at end of ramp step .
if fans , err := sampleFanSpeeds ( ) ; err == nil && le n( fans ) > 0 {
ramp . AvgFanRPM = meanFanRPM ( fans )
if duty , ok , _ := sampleFanDutyCyclePctFromFans ( fans ) ; ok {
ramp . AvgFanDutyCyclePct = duty
}
// Fan values are phase averages over the same load window .
if stepRu n. AvgFanRPM > 0 {
ramp . AvgFanRPM = stepRun . AvgFanRPM
ramp . AvgFanDutyCyclePct = stepRun . AvgFanDutyCyclePct
}
// Per-GPU telemetry from this ramp step's calibration.
@@ -4584,8 +4794,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
if result . ServerPower != nil {
// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
// than re-sampling here, which would capture post-test idle state.
// Use the SDR phase average from the last ramp step (GPUs still loaded)
// rather than re-sampling here, which would capture post-test idle state.
sdrLoaded := sdrLastStep
result . ServerPower . PSUInputIdleW = sdrIdle . PSUInW
result . ServerPower . PSUInputLoadedW = sdrLoaded . PSUInW
@@ -4605,6 +4815,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result . ServerPower . Notes = append ( result . ServerPower . Notes ,
"SDR sensors skipped (self-healed): " + strings . Join ( sdrLoaded . SkippedSensors , "; " ) )
}
if sdrLoaded . Samples > 0 {
result . ServerPower . Notes = append ( result . ServerPower . Notes ,
fmt . Sprintf ( "Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step." , sdrLoaded . Samples ) )
}
// Detect DCMI partial coverage: direct SDR comparison first,
// ramp heuristic as fallback when SDR PSU sensors are absent.
dcmiUnreliable := detectDCMIPartialCoverage ( result . ServerPower ) ||