diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index b6df699..5cc1426 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -3107,7 +3107,42 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile) fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) - fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW) + fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW) + if sp := result.ServerPower; sp != nil && sp.Available { + fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW) + fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio) + } + b.WriteString("\n") + // Server power comparison table. + if sp := result.ServerPower; sp != nil { + b.WriteString("## Server vs GPU Power Comparison\n\n") + b.WriteString("| Metric | Value |\n") + b.WriteString("|--------|-------|\n") + fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW) + if sp.Available { + fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW) + fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW) + fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | %.0f W |\n", sp.DeltaW) + ratio := sp.ReportingRatio + ratioNote := "" + switch { + case ratio >= 0.9: + ratioNote = "✓ GPU telemetry matches server power" + case ratio >= 0.75: + ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP" + default: + ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power" + } + fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote) + } else { + b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
+ } + for _, note := range sp.Notes { + fmt.Fprintf(&b, "\n> %s\n", note) + } + b.WriteString("\n") + } + if len(result.Findings) > 0 { b.WriteString("## Summary\n\n") for _, finding := range result.Findings { @@ -3181,6 +3216,12 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string { fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW) } } + if sp := result.ServerPower; sp != nil && sp.Available { + fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW) + fmt.Fprintf(&b, "server_loaded_w=%.0f\n", sp.LoadedW) + fmt.Fprintf(&b, "server_delta_w=%.0f\n", sp.DeltaW) + fmt.Fprintf(&b, "server_reporting_ratio=%.2f\n", sp.ReportingRatio) + } return b.String() } @@ -3224,6 +3265,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } durationSec := powerBenchDurationSec(opts.Profile) _ = durationSec + + // Sample IPMI idle power before any GPU load. + var serverIdleW float64 + var serverIdleOK bool + if w, ok := sampleIPMIPowerSeries(ctx, 10); ok { + serverIdleW = w + serverIdleOK = true + logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) + } + // Phase 1: calibrate each GPU individually (sequentially, one at a time) to // establish a true single-card power baseline unaffected by neighbour heat. calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected)) @@ -3320,6 +3371,21 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N // stableLimits accumulates GPU index → fixed stable limit (W) across steps. stableLimits := make(map[int]int, len(result.RecommendedSlotOrder)) + // Start an IPMI sampling goroutine that runs throughout Phase 2 to capture + // server-side loaded power while GPUs are under stress. The goroutine is + // cancelled as soon as Phase 2 finishes, and the average is used to compare + // against PlatformMaxTDPW (GPU-reported stable limits sum).
+ var serverLoadedW float64 + var serverLoadedOK bool + ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx) + ipmiPhase2Done := make(chan float64, 1) + go func() { + defer close(ipmiPhase2Done) + if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok { + ipmiPhase2Done <- w + } + }() + // Step 1: reuse single-card calibration result directly. if len(result.RecommendedSlotOrder) > 0 { firstIdx := result.RecommendedSlotOrder[0] @@ -3416,6 +3482,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N result.RampSteps = append(result.RampSteps, ramp) } + // Stop IPMI Phase 2 sampling and collect result. + ipmiPhase2Cancel() + if w, ok := <-ipmiPhase2Done; ok { + serverLoadedW = w + serverLoadedOK = true + logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w)) + } + // Populate StablePowerLimitW on each GPU entry from the accumulated stable limits. for i := range result.GPUs { if lim, ok := stableLimits[result.GPUs[i].Index]; ok { @@ -3428,6 +3502,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N for _, lim := range stableLimits { result.PlatformMaxTDPW += float64(lim) } + + // Characterize server power from IPMI idle/loaded samples. + // GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi). + // ReportingRatio = IPMI_delta / GPU_reported_sum: + // ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP. 
+ _ = serverIdleOK // used implicitly via characterizeServerPower + result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK) resultJSON, err := json.MarshalIndent(result, "", " ") if err != nil { return "", fmt.Errorf("marshal power result: %w", err) diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index b02958c..8a366a9 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -300,8 +300,12 @@ type NvidiaPowerBenchResult struct { // PlatformMaxTDPW is the sum of per-GPU stable power limits found during the // cumulative thermal ramp. Represents the actual sustained power budget of // this server under full GPU load. Use for rack power planning. - PlatformMaxTDPW float64 `json:"platform_max_tdp_w"` - Findings []string `json:"findings,omitempty"` + PlatformMaxTDPW float64 `json:"platform_max_tdp_w"` + // ServerPower captures IPMI server power delta (idle→loaded) measured in + // parallel with the thermal ramp. Use to compare GPU-reported TDP against + // actual wall-power draw as seen by the server's power supply. + ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` + Findings []string `json:"findings,omitempty"` GPUs []NvidiaPowerBenchGPU `json:"gpus"` }