From 11ea64062678c05f0ce7be1764329e1ba6707918 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 1 Jul 2026 17:39:58 +0300 Subject: [PATCH] power ramp: fix missing step-1 GPU telemetry, add GPU/server power breakdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ramp Sequence table's Run 1 row showed "—" for GPU power because the step-1 fast path (reusing single-card calibration) never populated PerGPUTelemetry like steps 2+ do. Also add GPU total W / Server itself W columns and an idle baseline row so server-vs-GPU consumption is visible per ramp step. --- audit/internal/platform/benchmark.go | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 840e22c..a630707 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -4008,14 +4008,23 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { idleW = result.ServerPower.IdleW } - // Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff. + // Build header: Run | GPU 0 | GPU 1 | ... | GPU total W | Server itself W | Server wall W | Per GPU wall W | Platform eff. headers := []string{"Run"} for _, idx := range allGPUIndices { headers = append(headers, fmt.Sprintf("GPU %d W", idx)) } - headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.") + headers = append(headers, "GPU total W", "Server itself W", "Server wall W", "Per GPU wall W", "Platform eff.") var rampRows [][]string + if idleW > 0 { + idleRow := []string{"0 (idle)"} + for range allGPUIndices { + idleRow = append(idleRow, "—") + } + // No load: GPU total is negligible, all draw is the server's own baseline. + idleRow = append(idleRow, "—", fmt.Sprintf("%.0f", idleW), fmt.Sprintf("%.0f", idleW), "—", "—") + rampRows = append(rampRows, idleRow) + } for _, step := range result.RampSteps { row := []string{fmt.Sprintf("%d", step.StepIndex)} for _, idx := range allGPUIndices { @@ -4036,6 +4045,16 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } row = append(row, gpuPwr) } + // GPU total W = sum of observed GPU power (nvidia-smi) + gpuTotal := "—" + if step.TotalObservedPowerW > 0 { + gpuTotal = fmt.Sprintf("%.0f", step.TotalObservedPowerW) + } + // Server itself W = server wall power minus GPU total (non-GPU baseline draw) + serverItself := "—" + if step.ServerLoadedW > 0 && step.TotalObservedPowerW > 0 { + serverItself = fmt.Sprintf("%.0f", step.ServerLoadedW-step.TotalObservedPowerW) + } // Server wall W serverWall := "—" if step.ServerLoadedW > 0 { @@ -4055,7 +4074,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } platEff = fmt.Sprintf("%.2f", eff) } - row = append(row, serverWall, perGPUWall, platEff) + row = append(row, gpuTotal, serverItself, serverWall, perGPUWall, platEff) rampRows = append(rampRows, row) } b.WriteString(fmtMDTable(headers, rampRows)) @@ -4617,6 +4636,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N ramp.AvgFanRPM = singleRun.AvgFanRPM ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct } + firstSummary := firstCalib.Summary + ramp.PerGPUTelemetry = map[int]*BenchmarkTelemetrySummary{firstIdx: &firstSummary} if !firstCalib.Completed { ramp.Status = "FAILED" ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))