Power bench: compare GPU-reported TDP vs IPMI server power delta

- NvidiaPowerBenchResult gains ServerPower *BenchmarkServerPower
- RunNvidiaPowerBench samples IPMI idle power before Phase 1 and loaded power via a background goroutine throughout the Phase 2 ramp
- renderPowerBenchReport: new "Server vs GPU Power Comparison" table with ratio annotation (✓ match / ⚠ minor / ✗ over-report)
- renderPowerBenchSummary: new server_idle_w, server_loaded_w, server_delta_w, server_reporting_ratio keys

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 07:21:02 +03:00
parent 30aa30cd67
commit 434528083e
2 changed files with 88 additions and 3 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -3107,7 +3107,42 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
-	fmt.Fprintf(&b, "**Platform max TDP:** %.0f W  \n\n", result.PlatformMaxTDPW)
+	fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W  \n", result.PlatformMaxTDPW)
+	if sp := result.ServerPower; sp != nil && sp.Available {
+		fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W  \n", sp.DeltaW)
+		fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f  \n", sp.ReportingRatio)
+	}
+	b.WriteString("\n")
+	// Server power comparison table.
+	if sp := result.ServerPower; sp != nil {
+		b.WriteString("## Server vs GPU Power Comparison\n\n")
+		b.WriteString("| Metric | Value |\n")
+		b.WriteString("|--------|-------|\n")
+		fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
+		if sp.Available {
+			fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
+			fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
+			fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | %.0f W |\n", sp.DeltaW)
+			ratio := sp.ReportingRatio
+			ratioNote := ""
+			switch {
+			case ratio >= 0.9:
+				ratioNote = "✓ GPU telemetry matches server power"
+			case ratio >= 0.75:
+				ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP"
+			default:
+				ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
+			}
+			fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
+		} else {
+			b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
+		}
+		for _, note := range sp.Notes {
+			fmt.Fprintf(&b, "\n> %s\n", note)
+		}
+		b.WriteString("\n")
+	}
+
 	if len(result.Findings) > 0 {
 		b.WriteString("## Summary\n\n")
 		for _, finding := range result.Findings {
@@ -3181,6 +3216,12 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
 			fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
 		}
 	}
+	if sp := result.ServerPower; sp != nil && sp.Available {
+		fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
+		fmt.Fprintf(&b, "server_loaded_w=%.0f\n", sp.LoadedW)
+		fmt.Fprintf(&b, "server_delta_w=%.0f\n", sp.DeltaW)
+		fmt.Fprintf(&b, "server_reporting_ratio=%.2f\n", sp.ReportingRatio)
+	}
 	return b.String()
 }

@@ -3224,6 +3265,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	}
 	durationSec := powerBenchDurationSec(opts.Profile)
 	_ = durationSec
+
+	// Sample IPMI idle power before any GPU load.
+	var serverIdleW float64
+	var serverIdleOK bool
+	if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
+		serverIdleW = w
+		serverIdleOK = true
+		logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+	}
+
 	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
@@ -3320,6 +3371,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
 	stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))

+	// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
+	// server-side loaded power while GPUs are under stress. The goroutine is
+	// cancelled as soon as Phase 2 finishes, and the average is used to compare
+	// against PlatformMaxTDPW (GPU-reported stable limits sum).
+	var serverLoadedW float64
+	var serverLoadedOK bool
+	ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
+	ipmiPhase2Done := make(chan float64, 1)
+	go func() {
+		defer close(ipmiPhase2Done)
+		if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
+			ipmiPhase2Done <- w
+		}
+	}()
+
 	// Step 1: reuse single-card calibration result directly.
 	if len(result.RecommendedSlotOrder) > 0 {
 		firstIdx := result.RecommendedSlotOrder[0]
@@ -3416,6 +3482,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		result.RampSteps = append(result.RampSteps, ramp)
 	}

+	// Stop IPMI Phase 2 sampling and collect result.
+	ipmiPhase2Cancel()
+	if w, ok := <-ipmiPhase2Done; ok {
+		serverLoadedW = w
+		serverLoadedOK = true
+		logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
+	}
+
 	// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
 	for i := range result.GPUs {
 		if lim, ok := stableLimits[result.GPUs[i].Index]; ok {
@@ -3428,6 +3502,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	for _, lim := range stableLimits {
 		result.PlatformMaxTDPW += float64(lim)
 	}
+
+	// Characterize server power from IPMI idle/loaded samples.
+	// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
+	// ReportingRatio = IPMI_delta / GPU_reported_sum:
+	//   ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
+	_ = serverIdleOK // used implicitly via characterizeServerPower
+	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
 	resultJSON, err := json.MarshalIndent(result, "", "  ")
 	if err != nil {
 		return "", fmt.Errorf("marshal power result: %w", err)
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -300,8 +300,12 @@ type NvidiaPowerBenchResult struct {
 	// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
 	// cumulative thermal ramp. Represents the actual sustained power budget of
 	// this server under full GPU load. Use for rack power planning.
-	PlatformMaxTDPW float64  `json:"platform_max_tdp_w"`
-	Findings        []string `json:"findings,omitempty"`
+	PlatformMaxTDPW float64               `json:"platform_max_tdp_w"`
+	// ServerPower captures IPMI server power delta (idle→loaded) measured in
+	// parallel with the thermal ramp. Use to compare GPU-reported TDP against
+	// actual wall-power draw as seen by the server's power supply.
+	ServerPower     *BenchmarkServerPower `json:"server_power,omitempty"`
+	Findings        []string              `json:"findings,omitempty"`
 	GPUs            []NvidiaPowerBenchGPU `json:"gpus"`
 }