Power bench: compare GPU-reported TDP vs IPMI server power delta

- NvidiaPowerBenchResult gains ServerPower *BenchmarkServerPower
- RunNvidiaPowerBench samples IPMI idle before Phase 1 and loaded via
  background goroutine throughout Phase 2 ramp
- renderPowerBenchReport: new "Server vs GPU Power Comparison" table
  with ratio annotation (✓ match / ⚠ minor / ✗ over-report)
- renderPowerBenchSummary: server_idle_w, server_loaded_w, server_delta_w,
  server_reporting_ratio keys

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 07:21:02 +03:00
parent 30aa30cd67
commit 434528083e
2 changed files with 88 additions and 3 deletions

View File

@@ -3107,7 +3107,42 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW)
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW)
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio)
}
b.WriteString("\n")
// Server power comparison table.
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n")
b.WriteString("| Metric | Value |\n")
b.WriteString("|--------|-------|\n")
fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
if sp.Available {
fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
fmt.Fprintf(&b, "| Server Δ power (loaded idle) | %.0f W |\n", sp.DeltaW)
ratio := sp.ReportingRatio
ratioNote := ""
switch {
case ratio >= 0.9:
ratioNote = "✓ GPU telemetry matches server power"
case ratio >= 0.75:
ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP"
default:
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
}
fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
} else {
b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
}
for _, note := range sp.Notes {
fmt.Fprintf(&b, "\n> %s\n", note)
}
b.WriteString("\n")
}
if len(result.Findings) > 0 {
b.WriteString("## Summary\n\n")
for _, finding := range result.Findings {
@@ -3181,6 +3216,12 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
}
}
if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
fmt.Fprintf(&b, "server_loaded_w=%.0f\n", sp.LoadedW)
fmt.Fprintf(&b, "server_delta_w=%.0f\n", sp.DeltaW)
fmt.Fprintf(&b, "server_reporting_ratio=%.2f\n", sp.ReportingRatio)
}
return b.String()
}
@@ -3224,6 +3265,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
// Sample IPMI idle power before any GPU load.
var serverIdleW float64
var serverIdleOK bool
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
@@ -3320,6 +3371,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
// server-side loaded power while GPUs are under stress. The goroutine is
// cancelled as soon as Phase 2 finishes, and the average is used to compare
// against PlatformMaxTDPW (GPU-reported stable limits sum).
var serverLoadedW float64
var serverLoadedOK bool
ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
ipmiPhase2Done := make(chan float64, 1)
go func() {
defer close(ipmiPhase2Done)
if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
ipmiPhase2Done <- w
}
}()
// Step 1: reuse single-card calibration result directly.
if len(result.RecommendedSlotOrder) > 0 {
firstIdx := result.RecommendedSlotOrder[0]
@@ -3416,6 +3482,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.RampSteps = append(result.RampSteps, ramp)
}
// Stop IPMI Phase 2 sampling and collect result.
ipmiPhase2Cancel()
if w, ok := <-ipmiPhase2Done; ok {
serverLoadedW = w
serverLoadedOK = true
logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
}
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
for i := range result.GPUs {
if lim, ok := stableLimits[result.GPUs[i].Index]; ok {
@@ -3428,6 +3502,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
for _, lim := range stableLimits {
result.PlatformMaxTDPW += float64(lim)
}
// Characterize server power from IPMI idle/loaded samples.
// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
// ReportingRatio = IPMI_delta / GPU_reported_sum:
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
_ = serverIdleOK // used implicitly via characterizeServerPower
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil {
return "", fmt.Errorf("marshal power result: %w", err)

View File

@@ -300,8 +300,12 @@ type NvidiaPowerBenchResult struct {
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
// cumulative thermal ramp. Represents the actual sustained power budget of
// this server under full GPU load. Use for rack power planning.
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
Findings []string `json:"findings,omitempty"`
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
// ServerPower captures IPMI server power delta (idle→loaded) measured in
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
// actual wall-power draw as seen by the server's power supply.
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
}