diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 840e22c..a630707 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -4008,14 +4008,23 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { idleW = result.ServerPower.IdleW } - // Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff. + // Build header: Run | GPU 0 | GPU 1 | ... | GPU total W | Server itself W | Server wall W | Per GPU wall W | Platform eff. headers := []string{"Run"} for _, idx := range allGPUIndices { headers = append(headers, fmt.Sprintf("GPU %d W", idx)) } - headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.") + headers = append(headers, "GPU total W", "Server itself W", "Server wall W", "Per GPU wall W", "Platform eff.") var rampRows [][]string + if idleW > 0 { + idleRow := []string{"0 (idle)"} + for range allGPUIndices { + idleRow = append(idleRow, "—") + } + // No load: GPU total is negligible, all draw is the server's own baseline. + idleRow = append(idleRow, "—", fmt.Sprintf("%.0f", idleW), fmt.Sprintf("%.0f", idleW), "—", "—") + rampRows = append(rampRows, idleRow) + } for _, step := range result.RampSteps { row := []string{fmt.Sprintf("%d", step.StepIndex)} for _, idx := range allGPUIndices { @@ -4036,6 +4045,16 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } row = append(row, gpuPwr) } + // GPU total W = sum of observed GPU power (nvidia-smi) + gpuTotal := "—" + if step.TotalObservedPowerW > 0 { + gpuTotal = fmt.Sprintf("%.0f", step.TotalObservedPowerW) + } + // Server itself W = server wall power minus GPU total (non-GPU baseline draw) + serverItself := "—" + if step.ServerLoadedW > 0 && step.TotalObservedPowerW > 0 { + serverItself = fmt.Sprintf("%.0f", step.ServerLoadedW-step.TotalObservedPowerW) + } // Server wall W serverWall := "—" if step.ServerLoadedW > 0 { @@ -4055,7 +4074,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } platEff = fmt.Sprintf("%.2f", eff) } - row = append(row, serverWall, perGPUWall, platEff) + row = append(row, gpuTotal, serverItself, serverWall, perGPUWall, platEff) rampRows = append(rampRows, row) } b.WriteString(fmtMDTable(headers, rampRows)) @@ -4617,6 +4636,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N ramp.AvgFanRPM = singleRun.AvgFanRPM ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct } + firstSummary := firstCalib.Summary + ramp.PerGPUTelemetry = map[int]*BenchmarkTelemetrySummary{firstIdx: &firstSummary} if !firstCalib.Completed { ramp.Status = "FAILED" ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))