Power bench: compare GPU-reported TDP vs IPMI server power delta
- NvidiaPowerBenchResult gains ServerPower *BenchmarkServerPower
- RunNvidiaPowerBench samples IPMI idle before Phase 1 and loaded via background goroutine throughout the Phase 2 ramp
- renderPowerBenchReport: new "Server vs GPU Power Comparison" table with ratio annotation (✓ match / ⚠ minor / ✗ over-report)
- renderPowerBenchSummary: server_idle_w, server_loaded_w, server_delta_w, server_reporting_ratio keys

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3107,7 +3107,42 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW)
|
||||
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
|
||||
if sp := result.ServerPower; sp != nil && sp.Available {
|
||||
fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
// Server power comparison table.
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
||||
b.WriteString("| Metric | Value |\n")
|
||||
b.WriteString("|--------|-------|\n")
|
||||
fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
|
||||
if sp.Available {
|
||||
fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | %.0f W |\n", sp.DeltaW)
|
||||
ratio := sp.ReportingRatio
|
||||
ratioNote := ""
|
||||
switch {
|
||||
case ratio >= 0.9:
|
||||
ratioNote = "✓ GPU telemetry matches server power"
|
||||
case ratio >= 0.75:
|
||||
ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP"
|
||||
default:
|
||||
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
||||
}
|
||||
fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
|
||||
} else {
|
||||
b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, "\n> %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
if len(result.Findings) > 0 {
|
||||
b.WriteString("## Summary\n\n")
|
||||
for _, finding := range result.Findings {
|
||||
@@ -3181,6 +3216,12 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
|
||||
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
|
||||
}
|
||||
}
|
||||
if sp := result.ServerPower; sp != nil && sp.Available {
|
||||
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "server_loaded_w=%.0f\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "server_delta_w=%.0f\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, "server_reporting_ratio=%.2f\n", sp.ReportingRatio)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -3224,6 +3265,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
}
|
||||
durationSec := powerBenchDurationSec(opts.Profile)
|
||||
_ = durationSec
|
||||
|
||||
// Sample IPMI idle power before any GPU load.
|
||||
var serverIdleW float64
|
||||
var serverIdleOK bool
|
||||
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
|
||||
serverIdleW = w
|
||||
serverIdleOK = true
|
||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||
}
|
||||
|
||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||||
@@ -3320,6 +3371,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
|
||||
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
|
||||
|
||||
// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
|
||||
// server-side loaded power while GPUs are under stress. The goroutine is
|
||||
// cancelled as soon as Phase 2 finishes, and the average is used to compare
|
||||
// against PlatformMaxTDPW (GPU-reported stable limits sum).
|
||||
var serverLoadedW float64
|
||||
var serverLoadedOK bool
|
||||
ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
|
||||
ipmiPhase2Done := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiPhase2Done)
|
||||
if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
|
||||
ipmiPhase2Done <- w
|
||||
}
|
||||
}()
|
||||
|
||||
// Step 1: reuse single-card calibration result directly.
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
firstIdx := result.RecommendedSlotOrder[0]
|
||||
@@ -3416,6 +3482,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
result.RampSteps = append(result.RampSteps, ramp)
|
||||
}
|
||||
|
||||
// Stop IPMI Phase 2 sampling and collect result.
|
||||
ipmiPhase2Cancel()
|
||||
if w, ok := <-ipmiPhase2Done; ok {
|
||||
serverLoadedW = w
|
||||
serverLoadedOK = true
|
||||
logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
|
||||
}
|
||||
|
||||
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
|
||||
for i := range result.GPUs {
|
||||
if lim, ok := stableLimits[result.GPUs[i].Index]; ok {
|
||||
@@ -3428,6 +3502,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
for _, lim := range stableLimits {
|
||||
result.PlatformMaxTDPW += float64(lim)
|
||||
}
|
||||
|
||||
// Characterize server power from IPMI idle/loaded samples.
|
||||
// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
|
||||
// ReportingRatio = IPMI_delta / GPU_reported_sum:
|
||||
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
||||
_ = serverIdleOK // used implicitly via characterizeServerPower
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("marshal power result: %w", err)
|
||||
|
||||
@@ -300,8 +300,12 @@ type NvidiaPowerBenchResult struct {
|
||||
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||
// this server under full GPU load. Use for rack power planning.
|
||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||
// actual wall-power draw as seen by the server's power supply.
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user