power ramp: fix missing step-1 GPU telemetry, add GPU/server power breakdown
The Ramp Sequence table's Run 1 row showed "—" for GPU power because the step-1 fast path (which reuses the single-card calibration) never populated PerGPUTelemetry the way steps 2+ do. Also add "GPU total W" and "Server itself W" columns and an idle baseline row so that server-vs-GPU consumption is visible for each ramp step.
This commit is contained in:
@@ -4008,14 +4008,23 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
idleW = result.ServerPower.IdleW
|
||||
}
|
||||
|
||||
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
|
||||
// Build header: Run | GPU 0 | GPU 1 | ... | GPU total W | Server itself W | Server wall W | Per GPU wall W | Platform eff.
|
||||
headers := []string{"Run"}
|
||||
for _, idx := range allGPUIndices {
|
||||
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
|
||||
}
|
||||
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||
headers = append(headers, "GPU total W", "Server itself W", "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||
|
||||
var rampRows [][]string
|
||||
if idleW > 0 {
|
||||
idleRow := []string{"0 (idle)"}
|
||||
for range allGPUIndices {
|
||||
idleRow = append(idleRow, "—")
|
||||
}
|
||||
// No load: GPU total is negligible, all draw is the server's own baseline.
|
||||
idleRow = append(idleRow, "—", fmt.Sprintf("%.0f", idleW), fmt.Sprintf("%.0f", idleW), "—", "—")
|
||||
rampRows = append(rampRows, idleRow)
|
||||
}
|
||||
for _, step := range result.RampSteps {
|
||||
row := []string{fmt.Sprintf("%d", step.StepIndex)}
|
||||
for _, idx := range allGPUIndices {
|
||||
@@ -4036,6 +4045,16 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
row = append(row, gpuPwr)
|
||||
}
|
||||
// GPU total W = sum of observed GPU power (nvidia-smi)
|
||||
gpuTotal := "—"
|
||||
if step.TotalObservedPowerW > 0 {
|
||||
gpuTotal = fmt.Sprintf("%.0f", step.TotalObservedPowerW)
|
||||
}
|
||||
// Server itself W = server wall power minus GPU total (non-GPU baseline draw)
|
||||
serverItself := "—"
|
||||
if step.ServerLoadedW > 0 && step.TotalObservedPowerW > 0 {
|
||||
serverItself = fmt.Sprintf("%.0f", step.ServerLoadedW-step.TotalObservedPowerW)
|
||||
}
|
||||
// Server wall W
|
||||
serverWall := "—"
|
||||
if step.ServerLoadedW > 0 {
|
||||
@@ -4055,7 +4074,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
platEff = fmt.Sprintf("%.2f", eff)
|
||||
}
|
||||
row = append(row, serverWall, perGPUWall, platEff)
|
||||
row = append(row, gpuTotal, serverItself, serverWall, perGPUWall, platEff)
|
||||
rampRows = append(rampRows, row)
|
||||
}
|
||||
b.WriteString(fmtMDTable(headers, rampRows))
|
||||
@@ -4617,6 +4636,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
ramp.AvgFanRPM = singleRun.AvgFanRPM
|
||||
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||
}
|
||||
firstSummary := firstCalib.Summary
|
||||
ramp.PerGPUTelemetry = map[int]*BenchmarkTelemetrySummary{firstIdx: &firstSummary}
|
||||
if !firstCalib.Completed {
|
||||
ramp.Status = "FAILED"
|
||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||
|
||||
Reference in New Issue
Block a user