power ramp: fix missing step-1 GPU telemetry, add GPU/server power breakdown

The Ramp Sequence table's Run 1 row showed "—" for GPU power because the
step-1 fast path (which reuses the single-card calibration) never populated
PerGPUTelemetry the way steps 2+ do. Also add "GPU total W" and
"Server itself W" columns and an idle baseline row so that server-vs-GPU
consumption is visible at each ramp step.
This commit is contained in:
Mikhail Chusavitin
2026-07-01 17:39:58 +03:00
parent 796acdfec1
commit 1966d8e408

View File

@@ -4008,14 +4008,23 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
idleW = result.ServerPower.IdleW
}
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
// Build header: Run | GPU 0 | GPU 1 | ... | GPU total W | Server itself W | Server wall W | Per GPU wall W | Platform eff.
headers := []string{"Run"}
for _, idx := range allGPUIndices {
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
}
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
headers = append(headers, "GPU total W", "Server itself W", "Server wall W", "Per GPU wall W", "Platform eff.")
var rampRows [][]string
if idleW > 0 {
idleRow := []string{"0 (idle)"}
for range allGPUIndices {
idleRow = append(idleRow, "—")
}
// No load: GPU total is negligible, all draw is the server's own baseline.
idleRow = append(idleRow, "—", fmt.Sprintf("%.0f", idleW), fmt.Sprintf("%.0f", idleW), "—", "—")
rampRows = append(rampRows, idleRow)
}
for _, step := range result.RampSteps {
row := []string{fmt.Sprintf("%d", step.StepIndex)}
for _, idx := range allGPUIndices {
@@ -4036,6 +4045,16 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
row = append(row, gpuPwr)
}
// GPU total W = sum of observed GPU power (nvidia-smi)
gpuTotal := "—"
if step.TotalObservedPowerW > 0 {
gpuTotal = fmt.Sprintf("%.0f", step.TotalObservedPowerW)
}
// Server itself W = server wall power minus GPU total (non-GPU baseline draw)
serverItself := "—"
if step.ServerLoadedW > 0 && step.TotalObservedPowerW > 0 {
serverItself = fmt.Sprintf("%.0f", step.ServerLoadedW-step.TotalObservedPowerW)
}
// Server wall W
serverWall := "—"
if step.ServerLoadedW > 0 {
@@ -4055,7 +4074,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
platEff = fmt.Sprintf("%.2f", eff)
}
row = append(row, serverWall, perGPUWall, platEff)
row = append(row, gpuTotal, serverItself, serverWall, perGPUWall, platEff)
rampRows = append(rampRows, row)
}
b.WriteString(fmtMDTable(headers, rampRows))
@@ -4617,6 +4636,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp.AvgFanRPM = singleRun.AvgFanRPM
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
}
firstSummary := firstCalib.Summary
ramp.PerGPUTelemetry = map[int]*BenchmarkTelemetrySummary{firstIdx: &firstSummary}
if !firstCalib.Completed {
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))