Make power benchmark report phase-averaged

This commit is contained in:
Mikhail Chusavitin
2026-04-20 10:53:53 +03:00
parent 5f0103635b
commit 6caace0cc0

View File

@@ -67,6 +67,13 @@ type benchmarkPowerCalibrationResult struct {
MetricRows []GPUMetricRow
}
// benchmarkPowerCalibrationRunSummary bundles the phase-averaged side-channel
// telemetry (SDR power and fan readings) gathered while a power calibration
// run was under load.
type benchmarkPowerCalibrationRunSummary struct {
	// LoadedSDR is the averaged IPMI SDR power summary for the load phase.
	LoadedSDR benchmarkSDRSeriesSummary
	// AvgFanRPM and AvgFanDutyCyclePct are means over the fan samples taken
	// during the load phase; zero means no positive sample was recorded.
	AvgFanRPM, AvgFanDutyCyclePct float64
	// FanSamples is the number of RPM samples that went into AvgFanRPM.
	FanSamples int
}
type benchmarkBurnProfile struct {
name string
category string
@@ -2413,6 +2420,16 @@ type sdrPowerSnapshot struct {
SkippedSensors []string // sensors rejected during self-healing
}
// benchmarkSDRSeriesSummary is the averaged view of a series of SDR power
// snapshots, as produced by summarizeSDRPowerSeries.
type benchmarkSDRSeriesSummary struct {
	// PSUInW, PSUOutW and GPUSlotW are means of the corresponding snapshot
	// totals; only snapshots with a positive value contribute to each mean.
	PSUInW, PSUOutW, GPUSlotW float64
	// PSUSlots holds the per-slot averaged readings keyed by slot name.
	PSUSlots map[string]BenchmarkPSUSlotPower
	// Samples is the number of snapshots the summary was built from.
	Samples int
	// SkippedSensors is the sorted, de-duplicated union of the sensors
	// reported as skipped by any snapshot in the series.
	SkippedSensors []string
}
// sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
type sdrSensor struct {
name string
@@ -2542,6 +2559,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
return snap
}
// startIPMISDRSampler starts a background goroutine that takes one SDR power
// snapshot immediately and then one every intervalSec seconds until stopCh is
// closed or receives a value. A non-positive intervalSec falls back to
// benchmarkPowerAutotuneSampleInterval.
//
// The returned channel yields exactly one slice containing every retained
// snapshot and is then closed. Snapshots whose PSU input, PSU output and GPU
// slot totals are all <= 0 and which carry no per-slot readings are discarded.
func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot {
	period := intervalSec
	if period <= 0 {
		period = benchmarkPowerAutotuneSampleInterval
	}
	out := make(chan []sdrPowerSnapshot, 1)
	go func() {
		defer close(out)
		var collected []sdrPowerSnapshot
		poll := func() {
			s := sampleIPMISDRPowerSensors()
			// Keep the conjunction form so the emptiness test is exactly
			// "every total is <= 0 and there are no slot readings".
			empty := s.PSUInW <= 0 && s.PSUOutW <= 0 && s.GPUSlotW <= 0 && len(s.PSUSlots) == 0
			if !empty {
				collected = append(collected, s)
			}
		}
		// First sample is taken right away, before the first tick.
		poll()
		tick := time.NewTicker(time.Duration(period) * time.Second)
		defer tick.Stop()
		for {
			select {
			case <-tick.C:
				poll()
			case <-stopCh:
				// Buffered channel of size 1: this send never blocks.
				out <- collected
				return
			}
		}
	}()
	return out
}
// summarizeSDRPowerSeries reduces a series of SDR power snapshots to a single
// averaged summary. An empty series yields the zero summary.
//
// Totals (PSU input, PSU output, GPU slot) are averaged with benchmarkMean
// over the snapshots that reported a positive value. Per-slot input/output
// readings are averaged the same way, and a slot's averaged field is left nil
// when its mean is not positive. A slot's status is the first non-empty status
// seen, replaced by the first non-"OK" status if one appears later. Skipped
// sensor names are de-duplicated and returned sorted.
func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary {
	if len(samples) == 0 {
		return benchmarkSDRSeriesSummary{}
	}

	// slotSeries gathers the positive readings and rolled-up status of one slot.
	type slotSeries struct {
		in     []float64
		out    []float64
		status string
	}

	var (
		psuIn, psuOut, gpuSlot []float64

		perSlot = map[string]*slotSeries{}
		skipped = map[string]struct{}{}
	)

	for i := range samples {
		snap := &samples[i]
		if snap.PSUInW > 0 {
			psuIn = append(psuIn, snap.PSUInW)
		}
		if snap.PSUOutW > 0 {
			psuOut = append(psuOut, snap.PSUOutW)
		}
		if snap.GPUSlotW > 0 {
			gpuSlot = append(gpuSlot, snap.GPUSlotW)
		}
		for _, name := range snap.SkippedSensors {
			if name == "" {
				continue
			}
			skipped[name] = struct{}{}
		}
		for slot, reading := range snap.PSUSlots {
			series, known := perSlot[slot]
			if !known {
				series = &slotSeries{}
				perSlot[slot] = series
			}
			if w := reading.InputW; w != nil && *w > 0 {
				series.in = append(series.in, *w)
			}
			if w := reading.OutputW; w != nil && *w > 0 {
				series.out = append(series.out, *w)
			}
			// Adopt the reading's status when none is recorded yet, or when
			// it downgrades a previously recorded "OK".
			if reading.Status != "" &&
				(series.status == "" || (series.status == "OK" && reading.Status != "OK")) {
				series.status = reading.Status
			}
		}
	}

	summary := benchmarkSDRSeriesSummary{
		PSUInW:   benchmarkMean(psuIn),
		PSUOutW:  benchmarkMean(psuOut),
		GPUSlotW: benchmarkMean(gpuSlot),
		Samples:  len(samples),
	}

	if len(perSlot) > 0 {
		summary.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(perSlot))
		for slot, series := range perSlot {
			avg := BenchmarkPSUSlotPower{Status: series.status}
			if in := benchmarkMean(series.in); in > 0 {
				avg.InputW = &in
			}
			if out := benchmarkMean(series.out); out > 0 {
				avg.OutputW = &out
			}
			summary.PSUSlots[slot] = avg
		}
	}

	if n := len(skipped); n > 0 {
		names := make([]string, 0, n)
		for name := range skipped {
			names = append(names, name)
		}
		// Map iteration order is random; sort for stable output.
		sort.Strings(names)
		summary.SkippedSensors = names
	}
	return summary
}
// collectIPMISDRPowerSeries samples the IPMI SDR power sensors every
// intervalSec seconds for durationSec seconds, or until ctx is done if that
// happens first, and returns the averaged summary of the collected snapshots.
// A non-positive durationSec returns the zero summary without sampling.
func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary {
	if durationSec <= 0 {
		return benchmarkSDRSeriesSummary{}
	}
	stopCh := make(chan struct{})
	doneCh := startIPMISDRSampler(stopCh, intervalSec)
	// Use an explicit timer instead of time.After so it is released as soon
	// as we return, even when ctx is cancelled long before the deadline.
	timer := time.NewTimer(time.Duration(durationSec) * time.Second)
	defer timer.Stop()
	select {
	case <-ctx.Done():
	case <-timer.C:
	}
	// Closing stopCh makes the sampler flush its samples to doneCh and exit.
	close(stopCh)
	return summarizeSDRPowerSeries(<-doneCh)
}
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) {
@@ -3086,8 +3234,9 @@ func runBenchmarkPowerCalibration(
logFunc func(string),
seedLimits map[int]int,
durationSec int,
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow, benchmarkPowerCalibrationRunSummary) {
calibDurationSec := durationSec
var runSummary benchmarkPowerCalibrationRunSummary
if calibDurationSec <= 0 {
calibDurationSec = 120
}
@@ -3105,12 +3254,12 @@ func runBenchmarkPowerCalibration(
if engine == BenchmarkPowerEngineTargetedPower {
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
}
} else {
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
}
}
if killed := KillTestWorkers(); len(killed) > 0 {
@@ -3275,6 +3424,10 @@ calibDone:
}
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
sdrStopCh := make(chan struct{})
sdrDoneCh := startIPMISDRSampler(sdrStopCh, benchmarkPowerAutotuneSampleInterval)
fanStopCh := make(chan struct{})
fanDoneCh := startBenchmarkFanSampler(fanStopCh, benchmarkPowerAutotuneSampleInterval)
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
@@ -3314,6 +3467,10 @@ calibDone:
}
ticker.Stop()
cancelAttempt()
close(sdrStopCh)
close(fanStopCh)
attemptSDRSummary := summarizeSDRPowerSeries(<-sdrDoneCh)
attemptFanSummary := <-fanDoneCh
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
// Accumulate telemetry rows with attempt stage label.
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
@@ -3351,10 +3508,14 @@ calibDone:
busyDelaySec = 1
// Per-GPU analysis and binary search update.
attemptStable := ar.err == nil
for _, s := range active {
perGPU := filterRowsByGPU(ar.rows, s.idx)
summary := summarizeBenchmarkTelemetry(perGPU)
throttle := throttleReasons[s.idx]
if throttle != "" || summary.P95PowerW <= 0 {
attemptStable = false
}
// Cooling warning: thermal throttle with fans not at maximum.
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
@@ -3487,6 +3648,16 @@ calibDone:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
}
if attemptStable {
if attemptSDRSummary.Samples > 0 {
runSummary.LoadedSDR = attemptSDRSummary
}
if attemptFanSummary.FanSamples > 0 {
runSummary.AvgFanRPM = attemptFanSummary.AvgFanRPM
runSummary.AvgFanDutyCyclePct = attemptFanSummary.AvgFanDutyCyclePct
runSummary.FanSamples = attemptFanSummary.FanSamples
}
}
}
for _, s := range states {
@@ -3495,7 +3666,7 @@ calibDone:
}
}
writeBenchmarkMetricsFiles(runDir, allCalibRows)
return results, restore, allCalibRows
return results, restore, allCalibRows, runSummary
}
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3540,6 +3711,47 @@ func meanFanRPM(fans []FanReading) float64 {
return sum / float64(len(fans))
}
// startBenchmarkFanSampler starts a background goroutine that reads fan
// speeds once immediately and then every intervalSec seconds until stopCh is
// closed or receives a value. A non-positive intervalSec falls back to
// benchmarkPowerAutotuneSampleInterval.
//
// The returned channel yields exactly one summary and is then closed. The
// summary holds the mean RPM and mean duty cycle over the positive samples,
// with FanSamples set to the number of RPM samples. Readings that fail or
// return no fans are skipped.
func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary {
	period := intervalSec
	if period <= 0 {
		period = benchmarkPowerAutotuneSampleInterval
	}
	out := make(chan benchmarkPowerCalibrationRunSummary, 1)
	go func() {
		defer close(out)
		var rpms, duties []float64
		poll := func() {
			fans, err := sampleFanSpeeds()
			if err != nil || len(fans) == 0 {
				return
			}
			rpm := meanFanRPM(fans)
			if rpm > 0 {
				rpms = append(rpms, rpm)
			}
			duty, ok, _ := sampleFanDutyCyclePctFromFans(fans)
			if ok && duty > 0 {
				duties = append(duties, duty)
			}
		}
		// First reading is taken right away, before the first tick.
		poll()
		tick := time.NewTicker(time.Duration(period) * time.Second)
		defer tick.Stop()
		for {
			select {
			case <-tick.C:
				poll()
			case <-stopCh:
				var result benchmarkPowerCalibrationRunSummary
				result.AvgFanRPM = benchmarkMean(rpms)
				result.AvgFanDutyCyclePct = benchmarkMean(duties)
				result.FanSamples = len(rpms)
				// Buffered channel of size 1: this send never blocks.
				out <- result
				return
			}
		}
	}()
	return out
}
func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability:
@@ -3568,41 +3780,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW)
if sp.PSUInputLoadedW > 0 {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta)
sourceLabel := "autotuned source"
switch normalizeBenchmarkPowerSource(sp.Source) {
case BenchmarkPowerSourceSDRPSUInput:
sourceLabel = "autotuned source (SDR PSU AC input)"
case BenchmarkPowerSourceDCMI:
sourceLabel = "autotuned source (DCMI)"
}
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio)
fmt.Fprintf(&b, "**Server power delta (%s):** %.0f W \n", sourceLabel, sp.DeltaW)
fmt.Fprintf(&b, "**Reporting ratio:** %.2f \n", sp.ReportingRatio)
}
b.WriteString("\n")
// Server power comparison table.
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n")
selectedSource := normalizeBenchmarkPowerSource(sp.Source)
selectedSourceLabel := "Selected source"
if selectedSource == BenchmarkPowerSourceSDRPSUInput {
selectedSourceLabel = "Selected source (SDR PSU AC input)"
} else if selectedSource == BenchmarkPowerSourceDCMI {
selectedSourceLabel = "Selected source (DCMI)"
}
var spRows [][]string
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
if sp.GPUSlotTotalW > 0 {
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
}
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
if sp.Available {
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
spRows = append(spRows, []string{"Server Δ power (loaded idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
spRows = append(spRows, []string{selectedSourceLabel + " idle power", fmt.Sprintf("%.0f W", sp.IdleW)})
spRows = append(spRows, []string{selectedSourceLabel + " loaded power", fmt.Sprintf("%.0f W", sp.LoadedW)})
spRows = append(spRows, []string{selectedSourceLabel + " Δ power (loaded idle)", fmt.Sprintf("%.0f W", sp.DeltaW)})
}
if sp.PSUInputLoadedW > 0 {
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 {
spRows = append(spRows, []string{"PSU AC input (idle avg, pre-load phase)", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
spRows = append(spRows, []string{"PSU AC input (loaded avg, final phase)", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
spRows = append(spRows, []string{"PSU AC input Δ (loaded idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
}
if sp.PSUOutputLoadedW > 0 {
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
}
spRows = append(spRows, []string{"PSU AC input Δ (loaded idle)", fmt.Sprintf("%.0f W", psuDelta)})
}
if sp.Available {
ratio := sp.ReportingRatio
@@ -3619,8 +3829,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default:
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
}
spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
sdrRatio := psuDelta / sp.GPUReportedSumW
sdrNote := ""
@@ -3632,12 +3842,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default:
sdrNote = "✗ significant discrepancy"
}
spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
spRows = append(spRows, []string{"PSU AC input reporting ratio", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
}
} else {
spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
spRows = append(spRows, []string{"IPMI availability", "not available — IPMI not supported or ipmitool not found"})
}
b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
b.WriteString(fmtMDTable([]string{"Metric", "Value"}, spRows))
for _, note := range sp.Notes {
fmt.Fprintf(&b, "\n> %s\n", note)
}
@@ -3689,11 +3899,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
psuDistRows = append(psuDistRows, []string{
slot,
fmtW(idle.InputW), fmtW(loaded.InputW),
fmtW(idle.OutputW), fmtW(loaded.OutputW),
deltaStr, status,
})
}
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle avg)", "AC Input (loaded avg)", "Load Δ", "Status"}, psuDistRows))
b.WriteString("\n")
}
}
@@ -3741,7 +3950,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fan,
})
}
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Avg Fan RPM (duty%)"}, sgRows))
b.WriteString("\n")
}
if len(result.RecommendedSlotOrder) > 0 {
@@ -3850,7 +4059,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
for _, slot := range psuSlots {
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
}
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Avg Fan RPM (duty%)")
var psuRows [][]string
for _, step := range result.RampSteps {
@@ -3931,7 +4140,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
pdRows = append(pdRows, []string{
fmt.Sprintf("GPU %d", gpu.Index),
fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
fmt.Sprintf("%.0f W", stable),
realization,
@@ -3944,13 +4152,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
pdRows = append(pdRows, []string{
"**Platform**",
fmt.Sprintf("**%.0f W**", totalDefault),
"—",
fmt.Sprintf("**%.0f W**", totalStable),
fmt.Sprintf("**%s**", platformReal),
"",
})
b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
b.WriteString(fmtMDTable([]string{"GPU", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
b.WriteString("\n")
// Balance across GPUs — only meaningful with 2+ GPUs.
@@ -4100,7 +4307,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
{"Avg Temp °C", singleTemp},
{"Power W", singlePwr},
{"Per GPU wall W", singleWall},
{"Fan RPM (duty%)", singleFan},
{"Avg Fan RPM (duty%)", singleFan},
}
if lastStep != nil {
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
@@ -4208,18 +4415,22 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// Sample server idle power before any GPU load.
var serverIdleW float64
var serverIdleOK bool
idleSDRStopCh := make(chan struct{})
idleSDRCh := startIPMISDRSampler(idleSDRStopCh, benchmarkPowerAutotuneSampleInterval)
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
}
sdrIdle := sampleIPMISDRPowerSensors()
close(idleSDRStopCh)
sdrIdle := summarizeSDRPowerSeries(<-idleSDRCh)
psuBefore := psuStatusSnapshot()
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
singleIPMILoadedW := make(map[int]float64, len(selected))
singleRunSummaryByIndex := make(map[int]benchmarkPowerCalibrationRunSummary, len(selected))
var allRestoreActions []benchmarkRestoreAction
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
var allPowerRows []GPUMetricRow
@@ -4235,21 +4446,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
singlePowerStopCh := make(chan struct{})
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
c, restore, singleRows, singleRun := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
close(singlePowerStopCh)
sdrSingle := sampleIPMISDRPowerSensors()
if samples := <-singlePowerCh; len(samples) > 0 {
singleIPMILoadedW[idx] = benchmarkMean(samples)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
singleIPMILoadedW[idx] = sdrSingle.PSUInW
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun.LoadedSDR.PSUInW > 0 {
singleIPMILoadedW[idx] = singleRun.LoadedSDR.PSUInW
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR avg): %.0f W", idx, singleRun.LoadedSDR.PSUInW))
}
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
calibByIndex[idx] = r
}
singleRunSummaryByIndex[idx] = singleRun
}
defer func() {
for i := len(allRestoreActions) - 1; i >= 0; i-- {
@@ -4292,11 +4503,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
t := summarizeBenchmarkTelemetry(calib.MetricRows)
gpu.Telemetry = &t
}
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
gpu.AvgFanRPM = meanFanRPM(fans)
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
gpu.AvgFanDutyCyclePct = duty
}
if singleRun := singleRunSummaryByIndex[idx]; singleRun.AvgFanRPM > 0 {
gpu.AvgFanRPM = singleRun.AvgFanRPM
gpu.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
}
gpus = append(gpus, gpu)
}
@@ -4352,10 +4561,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
var serverLoadedW float64
var serverLoadedOK bool
// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
// after the test when GPUs have already returned to idle.
var sdrLastStep sdrPowerSnapshot
// sdrLastStep retains the phase-averaged SDR readings from the last ramp step
// while GPUs are loaded. Used in the summary instead of re-sampling after the
// test when GPUs have already returned to idle.
var sdrLastStep benchmarkSDRSeriesSummary
// Step 1: reuse single-card calibration result directly.
if len(result.RecommendedSlotOrder) > 0 {
@@ -4376,6 +4585,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp.ServerLoadedW = w
ramp.ServerDeltaW = w - serverIdleW
}
if singleRun := singleRunSummaryByIndex[firstIdx]; singleRun.AvgFanRPM > 0 {
ramp.AvgFanRPM = singleRun.AvgFanRPM
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
}
if !firstCalib.Completed {
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
@@ -4426,7 +4639,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepPowerStopCh := make(chan struct{})
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
stepCalib, stepRestore, stepRows, stepRun := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
close(stepPowerStopCh)
var stepIPMILoadedW float64
@@ -4497,10 +4710,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
}
// Per-step PSU slot snapshot — also used as the authoritative loaded power
// source when SDR PSU sensors are available (more accurate than DCMI on
// servers where DCMI covers only a subset of installed PSUs).
sdrStep := sampleIPMISDRPowerSensors()
// Per-step PSU slot readings are averaged over the whole load phase rather
// than captured as a single end-of-phase snapshot.
sdrStep := stepRun.LoadedSDR
if len(sdrStep.PSUSlots) > 0 {
ramp.PSUSlotReadings = sdrStep.PSUSlots
}
@@ -4518,7 +4730,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
ramp.ServerLoadedW = sdrStep.PSUInW
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR avg): %.0f W", step, sdrStep.PSUInW))
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = sdrStep.PSUInW
serverLoadedOK = true
@@ -4526,12 +4738,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
}
// Fan state at end of ramp step.
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
ramp.AvgFanRPM = meanFanRPM(fans)
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
ramp.AvgFanDutyCyclePct = duty
}
// Fan values are phase averages over the same load window.
if stepRun.AvgFanRPM > 0 {
ramp.AvgFanRPM = stepRun.AvgFanRPM
ramp.AvgFanDutyCyclePct = stepRun.AvgFanDutyCyclePct
}
// Per-GPU telemetry from this ramp step's calibration.
@@ -4584,8 +4794,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
if result.ServerPower != nil {
// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
// than re-sampling here, which would capture post-test idle state.
// Use the SDR phase average from the last ramp step (GPUs still loaded)
// rather than re-sampling here, which would capture post-test idle state.
sdrLoaded := sdrLastStep
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
@@ -4605,6 +4815,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.ServerPower.Notes = append(result.ServerPower.Notes,
"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
}
if sdrLoaded.Samples > 0 {
result.ServerPower.Notes = append(result.ServerPower.Notes,
fmt.Sprintf("Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step.", sdrLoaded.Samples))
}
// Detect DCMI partial coverage: direct SDR comparison first,
// ramp heuristic as fallback when SDR PSU sensors are absent.
dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||