diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index f832944..e13cd8a 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -67,6 +67,13 @@ type benchmarkPowerCalibrationResult struct { MetricRows []GPUMetricRow } +type benchmarkPowerCalibrationRunSummary struct { + LoadedSDR benchmarkSDRSeriesSummary + AvgFanRPM float64 + AvgFanDutyCyclePct float64 + FanSamples int +} + type benchmarkBurnProfile struct { name string category string @@ -2413,6 +2420,16 @@ type sdrPowerSnapshot struct { SkippedSensors []string // sensors rejected during self-healing } +type benchmarkSDRSeriesSummary struct { + PSUInW float64 + PSUOutW float64 + GPUSlotW float64 + PSUSlots map[string]BenchmarkPSUSlotPower + Samples int + + SkippedSensors []string +} + // sdrSensor is a name+watts pair used for GPU slot self-healing filtering. type sdrSensor struct { name string @@ -2542,6 +2559,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot { return snap } +func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot { + if intervalSec <= 0 { + intervalSec = benchmarkPowerAutotuneSampleInterval + } + ch := make(chan []sdrPowerSnapshot, 1) + go func() { + defer close(ch) + var samples []sdrPowerSnapshot + record := func() { + snap := sampleIPMISDRPowerSensors() + if snap.PSUInW <= 0 && snap.PSUOutW <= 0 && snap.GPUSlotW <= 0 && len(snap.PSUSlots) == 0 { + return + } + samples = append(samples, snap) + } + record() + ticker := time.NewTicker(time.Duration(intervalSec) * time.Second) + defer ticker.Stop() + for { + select { + case <-stopCh: + ch <- samples + return + case <-ticker.C: + record() + } + } + }() + return ch +} + +func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary { + var summary benchmarkSDRSeriesSummary + if len(samples) == 0 { + return summary + } + + type slotAggregate struct { + inputs []float64 + outputs []float64 + status string + } + + slotAgg := make(map[string]*slotAggregate) + skippedSet := make(map[string]struct{}) + var inputTotals []float64 + var outputTotals []float64 + var gpuSlotTotals []float64 + + for _, sample := range samples { + if sample.PSUInW > 0 { + inputTotals = append(inputTotals, sample.PSUInW) + } + if sample.PSUOutW > 0 { + outputTotals = append(outputTotals, sample.PSUOutW) + } + if sample.GPUSlotW > 0 { + gpuSlotTotals = append(gpuSlotTotals, sample.GPUSlotW) + } + for _, skipped := range sample.SkippedSensors { + if skipped != "" { + skippedSet[skipped] = struct{}{} + } + } + for slot, reading := range sample.PSUSlots { + agg := slotAgg[slot] + if agg == nil { + agg = &slotAggregate{} + slotAgg[slot] = agg + } + if reading.InputW != nil && *reading.InputW > 0 { + agg.inputs = append(agg.inputs, *reading.InputW) + } + if reading.OutputW != nil && *reading.OutputW > 0 { + agg.outputs = append(agg.outputs, *reading.OutputW) + } + switch { + case reading.Status == "": + case agg.status == "": + agg.status = reading.Status + case agg.status == "OK" && reading.Status != "OK": + agg.status = reading.Status + } + } + } + + summary.PSUInW = benchmarkMean(inputTotals) + summary.PSUOutW = benchmarkMean(outputTotals) + summary.GPUSlotW = benchmarkMean(gpuSlotTotals) + summary.Samples = len(samples) + + if len(slotAgg) > 0 { + summary.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(slotAgg)) + for slot, agg := range slotAgg { + reading := BenchmarkPSUSlotPower{Status: agg.status} + if mean := benchmarkMean(agg.inputs); mean > 0 { + v := mean + reading.InputW = &v + } + if mean := benchmarkMean(agg.outputs); mean > 0 { + v := mean + reading.OutputW = &v + } + summary.PSUSlots[slot] = reading + } + } + if len(skippedSet) > 0 { + summary.SkippedSensors = make([]string, 0, len(skippedSet)) + for skipped := range skippedSet { + summary.SkippedSensors = append(summary.SkippedSensors, skipped) + } + sort.Strings(summary.SkippedSensors) + } + + return summary +} + +func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary { + if durationSec <= 0 { + return benchmarkSDRSeriesSummary{} + } + stopCh := make(chan struct{}) + doneCh := startIPMISDRSampler(stopCh, intervalSec) + select { + case <-ctx.Done(): + case <-time.After(time.Duration(durationSec) * time.Second): + } + close(stopCh) + return summarizeSDRPowerSeries(<-doneCh) +} + // queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi. // Returns 0 and an error if IPMI is unavailable or the output cannot be parsed. func queryIPMIServerPowerW() (float64, error) { @@ -3086,8 +3234,9 @@ func runBenchmarkPowerCalibration( logFunc func(string), seedLimits map[int]int, durationSec int, -) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) { +) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow, benchmarkPowerCalibrationRunSummary) { calibDurationSec := durationSec + var runSummary benchmarkPowerCalibrationRunSummary if calibDurationSec <= 0 { calibDurationSec = 120 } @@ -3105,12 +3254,12 @@ func runBenchmarkPowerCalibration( if engine == BenchmarkPowerEngineTargetedPower { if _, err := exec.LookPath("dcgmi"); err != nil { logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") - return map[int]benchmarkPowerCalibrationResult{}, nil, nil + return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary } } else { if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil { logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)") - return map[int]benchmarkPowerCalibrationResult{}, nil, nil + return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary } } if killed := KillTestWorkers(); len(killed) > 0 { @@ -3275,6 +3424,10 @@ calibDone: } attemptCtx, cancelAttempt := context.WithCancel(ctx) doneCh := make(chan sharedAttemptResult, 1) + sdrStopCh := make(chan struct{}) + sdrDoneCh := startIPMISDRSampler(sdrStopCh, benchmarkPowerAutotuneSampleInterval) + fanStopCh := make(chan struct{}) + fanDoneCh := startBenchmarkFanSampler(fanStopCh, benchmarkPowerAutotuneSampleInterval) go func() { out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc) doneCh <- sharedAttemptResult{out: out, rows: rows, err: err} @@ -3314,6 +3467,10 @@ calibDone: } ticker.Stop() cancelAttempt() + close(sdrStopCh) + close(fanStopCh) + attemptSDRSummary := summarizeSDRPowerSeries(<-sdrDoneCh) + attemptFanSummary := <-fanDoneCh _ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644) // Accumulate telemetry rows with attempt stage label. appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec)) @@ -3351,10 +3508,14 @@ calibDone: busyDelaySec = 1 // Per-GPU analysis and binary search update. + attemptStable := ar.err == nil for _, s := range active { perGPU := filterRowsByGPU(ar.rows, s.idx) summary := summarizeBenchmarkTelemetry(perGPU) throttle := throttleReasons[s.idx] + if throttle != "" || summary.P95PowerW <= 0 { + attemptStable = false + } // Cooling warning: thermal throttle with fans not at maximum. if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" { @@ -3487,6 +3648,16 @@ calibDone: s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi)) logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi)) } + if attemptStable { + if attemptSDRSummary.Samples > 0 { + runSummary.LoadedSDR = attemptSDRSummary + } + if attemptFanSummary.FanSamples > 0 { + runSummary.AvgFanRPM = attemptFanSummary.AvgFanRPM + runSummary.AvgFanDutyCyclePct = attemptFanSummary.AvgFanDutyCyclePct + runSummary.FanSamples = attemptFanSummary.FanSamples + } + } } for _, s := range states { @@ -3495,7 +3666,7 @@ calibDone: } } writeBenchmarkMetricsFiles(runDir, allCalibRows) - return results, restore, allCalibRows + return results, restore, allCalibRows, runSummary } // isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222), @@ -3540,6 +3711,47 @@ func meanFanRPM(fans []FanReading) float64 { return sum / float64(len(fans)) } +func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary { + if intervalSec <= 0 { + intervalSec = benchmarkPowerAutotuneSampleInterval + } + ch := make(chan benchmarkPowerCalibrationRunSummary, 1) + go func() { + defer close(ch) + var rpmSamples []float64 + var dutySamples []float64 + record := func() { + fans, err := sampleFanSpeeds() + if err != nil || len(fans) == 0 { + return + } + if rpm := meanFanRPM(fans); rpm > 0 { + rpmSamples = append(rpmSamples, rpm) + } + if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok && duty > 0 { + dutySamples = append(dutySamples, duty) + } + } + record() + ticker := time.NewTicker(time.Duration(intervalSec) * time.Second) + defer ticker.Stop() + for { + select { + case <-stopCh: + ch <- benchmarkPowerCalibrationRunSummary{ + AvgFanRPM: benchmarkMean(rpmSamples), + AvgFanDutyCyclePct: benchmarkMean(dutySamples), + FanSamples: len(rpmSamples), + } + return + case <-ticker.C: + record() + } + } + }() + return ch +} + func powerBenchDurationSec(profile string) int { switch strings.TrimSpace(strings.ToLower(profile)) { case NvidiaBenchmarkProfileStability: @@ -3568,41 +3780,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW) if sp := result.ServerPower; sp != nil && sp.Available { - fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW) - if sp.PSUInputLoadedW > 0 { - psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW - fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta) + sourceLabel := "autotuned source" + switch normalizeBenchmarkPowerSource(sp.Source) { + case BenchmarkPowerSourceSDRPSUInput: + sourceLabel = "autotuned source (SDR PSU AC input)" + case BenchmarkPowerSourceDCMI: + sourceLabel = "autotuned source (DCMI)" } - fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio) + fmt.Fprintf(&b, "**Server power delta (%s):** %.0f W \n", sourceLabel, sp.DeltaW) + fmt.Fprintf(&b, "**Reporting ratio:** %.2f \n", sp.ReportingRatio) } b.WriteString("\n") // Server power comparison table. if sp := result.ServerPower; sp != nil { b.WriteString("## Server vs GPU Power Comparison\n\n") + selectedSource := normalizeBenchmarkPowerSource(sp.Source) + selectedSourceLabel := "Selected source" + if selectedSource == BenchmarkPowerSourceSDRPSUInput { + selectedSourceLabel = "Selected source (SDR PSU AC input)" + } else if selectedSource == BenchmarkPowerSourceDCMI { + selectedSourceLabel = "Selected source (DCMI)" + } var spRows [][]string - spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)}) - spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)}) - if sp.GPUSlotTotalW > 0 { - spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)}) - } + spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)}) if sp.Available { - spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)}) - spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)}) - spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)}) + spRows = append(spRows, []string{selectedSourceLabel + " idle power", fmt.Sprintf("%.0f W", sp.IdleW)}) + spRows = append(spRows, []string{selectedSourceLabel + " loaded power", fmt.Sprintf("%.0f W", sp.LoadedW)}) + spRows = append(spRows, []string{selectedSourceLabel + " Δ power (loaded − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)}) } - if sp.PSUInputLoadedW > 0 { - spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)}) - spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)}) + if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 { + spRows = append(spRows, []string{"PSU AC input (idle avg, pre-load phase)", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)}) + spRows = append(spRows, []string{"PSU AC input (loaded avg, final phase)", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)}) psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW - spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)}) - } - if sp.PSUOutputLoadedW > 0 { - spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)}) - spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)}) - if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 { - psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100 - spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)}) - } + spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", fmt.Sprintf("%.0f W", psuDelta)}) } if sp.Available { ratio := sp.ReportingRatio @@ -3619,8 +3829,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { default: ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power" } - spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)}) - if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 { + spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f — %s", ratio, ratioNote)}) + if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 { psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW sdrRatio := psuDelta / sp.GPUReportedSumW sdrNote := "" @@ -3632,12 +3842,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { default: sdrNote = "✗ significant discrepancy" } - spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)}) + spRows = append(spRows, []string{"PSU AC input reporting ratio", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)}) } } else { - spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"}) + spRows = append(spRows, []string{"IPMI availability", "not available — IPMI not supported or ipmitool not found"}) } - b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows)) + b.WriteString(fmtMDTable([]string{"Metric", "Value"}, spRows)) for _, note := range sp.Notes { fmt.Fprintf(&b, "\n> %s\n", note) } @@ -3689,11 +3899,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { psuDistRows = append(psuDistRows, []string{ slot, fmtW(idle.InputW), fmtW(loaded.InputW), - fmtW(idle.OutputW), fmtW(loaded.OutputW), deltaStr, status, }) } - b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows)) + b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle avg)", "AC Input (loaded avg)", "Load Δ", "Status"}, psuDistRows)) b.WriteString("\n") } } @@ -3741,7 +3950,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { fan, }) } - b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows)) + b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Avg Fan RPM (duty%)"}, sgRows)) b.WriteString("\n") } if len(result.RecommendedSlotOrder) > 0 { @@ -3850,7 +4059,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { for _, slot := range psuSlots { psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot)) } - psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)") + psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Avg Fan RPM (duty%)") var psuRows [][]string for _, step := range result.RampSteps { @@ -3931,7 +4140,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } pdRows = append(pdRows, []string{ fmt.Sprintf("GPU %d", gpu.Index), - fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW), fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW), fmt.Sprintf("%.0f W", stable), realization, @@ -3944,13 +4152,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } pdRows = append(pdRows, []string{ "**Platform**", - fmt.Sprintf("**%.0f W**", totalDefault), "—", fmt.Sprintf("**%.0f W**", totalStable), fmt.Sprintf("**%s**", platformReal), "", }) - b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows)) + b.WriteString(fmtMDTable([]string{"GPU", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows)) b.WriteString("\n") // Balance across GPUs — only meaningful with 2+ GPUs. @@ -4100,7 +4307,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { {"Avg Temp °C", singleTemp}, {"Power W", singlePwr}, {"Per GPU wall W", singleWall}, - {"Fan RPM (duty%)", singleFan}, + {"Avg Fan RPM (duty%)", singleFan}, } if lastStep != nil { compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem)) @@ -4208,18 +4415,22 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N // Sample server idle power before any GPU load. var serverIdleW float64 var serverIdleOK bool + idleSDRStopCh := make(chan struct{}) + idleSDRCh := startIPMISDRSampler(idleSDRStopCh, benchmarkPowerAutotuneSampleInterval) if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok { serverIdleW = w serverIdleOK = true logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w)) } - sdrIdle := sampleIPMISDRPowerSensors() + close(idleSDRStopCh) + sdrIdle := summarizeSDRPowerSeries(<-idleSDRCh) psuBefore := psuStatusSnapshot() // Phase 1: calibrate each GPU individually (sequentially, one at a time) to // establish a true single-card power baseline unaffected by neighbour heat. calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected)) singleIPMILoadedW := make(map[int]float64, len(selected)) + singleRunSummaryByIndex := make(map[int]benchmarkPowerCalibrationRunSummary, len(selected)) var allRestoreActions []benchmarkRestoreAction // allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv. var allPowerRows []GPUMetricRow @@ -4235,21 +4446,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) singlePowerStopCh := make(chan struct{}) singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval) - c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec) + c, restore, singleRows, singleRun := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec) appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0) close(singlePowerStopCh) - sdrSingle := sampleIPMISDRPowerSensors() if samples := <-singlePowerCh; len(samples) > 0 { singleIPMILoadedW[idx] = benchmarkMean(samples) logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx])) - } else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 { - singleIPMILoadedW[idx] = sdrSingle.PSUInW - logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW)) + } else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun.LoadedSDR.PSUInW > 0 { + singleIPMILoadedW[idx] = singleRun.LoadedSDR.PSUInW + logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR avg): %.0f W", idx, singleRun.LoadedSDR.PSUInW)) } allRestoreActions = append(allRestoreActions, restore...) if r, ok := c[idx]; ok { calibByIndex[idx] = r } + singleRunSummaryByIndex[idx] = singleRun } defer func() { for i := len(allRestoreActions) - 1; i >= 0; i-- { @@ -4292,11 +4503,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N t := summarizeBenchmarkTelemetry(calib.MetricRows) gpu.Telemetry = &t } - if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 { - gpu.AvgFanRPM = meanFanRPM(fans) - if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok { - gpu.AvgFanDutyCyclePct = duty - } + if singleRun := singleRunSummaryByIndex[idx]; singleRun.AvgFanRPM > 0 { + gpu.AvgFanRPM = singleRun.AvgFanRPM + gpu.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct } gpus = append(gpus, gpu) } @@ -4352,10 +4561,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N // per-step in NvidiaPowerBenchStep.ServerLoadedW. var serverLoadedW float64 var serverLoadedOK bool - // sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are - // still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling - // after the test when GPUs have already returned to idle. - var sdrLastStep sdrPowerSnapshot + // sdrLastStep retains the phase-averaged SDR readings from the last ramp step + // while GPUs are loaded. Used in the summary instead of re-sampling after the + // test when GPUs have already returned to idle. + var sdrLastStep benchmarkSDRSeriesSummary // Step 1: reuse single-card calibration result directly. if len(result.RecommendedSlotOrder) > 0 { @@ -4376,6 +4585,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N ramp.ServerLoadedW = w ramp.ServerDeltaW = w - serverIdleW } + if singleRun := singleRunSummaryByIndex[firstIdx]; singleRun.AvgFanRPM > 0 { + ramp.AvgFanRPM = singleRun.AvgFanRPM + ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct + } if !firstCalib.Completed { ramp.Status = "FAILED" ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine()))) @@ -4426,7 +4639,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) stepPowerStopCh := make(chan struct{}) stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval) - stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec) + stepCalib, stepRestore, stepRows, stepRun := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec) appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0) close(stepPowerStopCh) var stepIPMILoadedW float64 @@ -4497,10 +4710,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) } - // Per-step PSU slot snapshot — also used as the authoritative loaded power - // source when SDR PSU sensors are available (more accurate than DCMI on - // servers where DCMI covers only a subset of installed PSUs). - sdrStep := sampleIPMISDRPowerSensors() + // Per-step PSU slot readings are averaged over the whole load phase rather + // than captured as a single end-of-phase snapshot. + sdrStep := stepRun.LoadedSDR if len(sdrStep.PSUSlots) > 0 { ramp.PSUSlotReadings = sdrStep.PSUSlots } @@ -4518,7 +4730,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 { ramp.ServerLoadedW = sdrStep.PSUInW ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW - logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW)) + logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR avg): %.0f W", step, sdrStep.PSUInW)) if step == len(result.RecommendedSlotOrder) { serverLoadedW = sdrStep.PSUInW serverLoadedOK = true @@ -4526,12 +4738,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } } - // Fan state at end of ramp step. - if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 { - ramp.AvgFanRPM = meanFanRPM(fans) - if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok { - ramp.AvgFanDutyCyclePct = duty - } + // Fan values are phase averages over the same load window. + if stepRun.AvgFanRPM > 0 { + ramp.AvgFanRPM = stepRun.AvgFanRPM + ramp.AvgFanDutyCyclePct = stepRun.AvgFanDutyCyclePct } // Per-GPU telemetry from this ramp step's calibration. @@ -4584,8 +4794,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N // Supplement DCMI with SDR multi-source data via collector's PSU slot patterns. // Per-slot readings enable correlation with audit HardwarePowerSupply entries. if result.ServerPower != nil { - // Use the SDR snapshot from the last ramp step (GPUs still loaded) rather - // than re-sampling here, which would capture post-test idle state. + // Use the SDR phase average from the last ramp step (GPUs still loaded) + // rather than re-sampling here, which would capture post-test idle state. sdrLoaded := sdrLastStep result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW @@ -4605,6 +4815,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N result.ServerPower.Notes = append(result.ServerPower.Notes, "SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; ")) } + if sdrLoaded.Samples > 0 { + result.ServerPower.Notes = append(result.ServerPower.Notes, + fmt.Sprintf("Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step.", sdrLoaded.Samples)) + } // Detect DCMI partial coverage: direct SDR comparison first, // ramp heuristic as fallback when SDR PSU sensors are absent. dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||