From 61c7abaa80f056911158b2fd60dc398e521686d0 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 19 Apr 2026 13:07:48 +0300 Subject: [PATCH] Add multi-source PSU power triangulation and per-slot distribution table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - collector/psu.go: export PSUSlotsFromSDR() reusing slot regex patterns; add isPSUInputPower/isPSUOutputPower helpers covering MSI/MLT/xFusion/HPE naming; add xFusion Power slot pattern; parseBoundedFloat for self-healing (rejects zero/negative/out-of-range sensor readings); default fallback treats unclassified PSU sensors as AC input - benchmark_types.go: BenchmarkPSUSlotPower struct; BenchmarkServerPower gains PSUInputIdle/Loaded, PSUOutputIdle/Loaded, PSUSlotReadingsIdle/Loaded, GPUSlotTotalW, DCMICoverageRatio fields - benchmark.go: sampleIPMISDRPowerSensors uses collector.PSUSlotsFromSDR instead of custom classifier; detectDCMIPartialCoverage replaces ramp heuristic — compares DCMI idle vs SDR PSU sum, flags <0.70 ratio as partial coverage; detectIPMISaturationFallback kept for servers without SDR PSU sensors; report gains PSU Load Distribution table (per-slot AC/DC idle vs loaded, Δ) Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/collector/psu.go | 100 +++- audit/internal/platform/benchmark.go | 502 ++++++++++++++++++++- audit/internal/platform/benchmark_types.go | 59 ++- 3 files changed, 624 insertions(+), 37 deletions(-) diff --git a/audit/internal/collector/psu.go b/audit/internal/collector/psu.go index 78005ed..280f2ba 100644 --- a/audit/internal/collector/psu.go +++ b/audit/internal/collector/psu.go @@ -160,11 +160,54 @@ type psuSDR struct { } var psuSlotPatterns = []*regexp.Regexp{ - regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), - regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), - regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), - regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), - regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), + 
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2 + regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6 + regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1 + regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3 + regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), // Bay 1 + // Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …). + // Must be last: "power supply N" is already caught by the pattern above. + regexp.MustCompile(`(?i)\bpower([0-9]+)\b`), +} + +// isPSUInputPower matches AC-input power sensor names across vendors: +// MSI: PSU1_POWER_IN, PSU1_PIN +// MLT: PSU1_PIN +// xFusion: (matched via default fallback — no explicit keyword) +// HPE: PS1 Input Power, PS1 Input Watts +func isPSUInputPower(name string) bool { + return strings.Contains(name, "input power") || + strings.Contains(name, "input watts") || + strings.Contains(name, "_pin") || + strings.Contains(name, " pin") || + strings.Contains(name, "_power_in") || + strings.Contains(name, "power_in") +} + +// isPSUOutputPower matches DC-output power sensor names across vendors: +// MSI: PSU1_POWER_OUT +// MLT: PSU1_POUT +// xFusion: PS1 POut +func isPSUOutputPower(name string) bool { + return strings.Contains(name, "output power") || + strings.Contains(name, "output watts") || + strings.Contains(name, "_pout") || + strings.Contains(name, " pout") || + strings.Contains(name, "_power_out") || + strings.Contains(name, "power_out") || + strings.Contains(name, "power supply bay") || + strings.Contains(name, "psu bay") +} + +// parseBoundedFloat parses a numeric value from an SDR value field and +// validates it is within (0, max]. Returns nil for zero, negative, or +// out-of-range values — these indicate missing/off/fault sensor readings. 
+func parseBoundedFloat(raw string, max float64) *float64 { + v := parseFloatPtr(raw) + if v == nil || *v <= 0 || *v > max { + return nil + } + return v } func parsePSUSDR(raw string) map[int]psuSDR { @@ -194,24 +237,59 @@ func parsePSUSDR(raw string) map[int]psuSDR { lowerName := strings.ToLower(name) switch { - case strings.Contains(lowerName, "input power"): - entry.inputPowerW = parseFloatPtr(value) - case strings.Contains(lowerName, "output power"): - entry.outputPowerW = parseFloatPtr(value) - case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"): - entry.outputPowerW = parseFloatPtr(value) + case isPSUInputPower(lowerName): + entry.inputPowerW = parseBoundedFloat(value, 6000) + case isPSUOutputPower(lowerName): + entry.outputPowerW = parseBoundedFloat(value, 6000) case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"): entry.inputVoltage = parseFloatPtr(value) case strings.Contains(lowerName, "temp"): entry.temperatureC = parseFloatPtr(value) case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"): entry.healthPct = parsePercentPtr(value) + default: + // Generic PSU power reading: sensor matched a slot pattern but carries + // no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as + // AC input if the value looks like wattage and no better data is set yet. + if entry.inputPowerW == nil { + entry.inputPowerW = parseBoundedFloat(value, 6000) + } } out[slot] = entry } return out } +// PSUSlotPower holds SDR power readings for one PSU slot. +// Slot key used by PSUSlotsFromSDR is the 0-based index string, +// matching HardwarePowerSupply.Slot in the audit schema. 
+type PSUSlotPower struct { + InputW *float64 `json:"input_w,omitempty"` + OutputW *float64 `json:"output_w,omitempty"` + Status string `json:"status,omitempty"` +} + +// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data +// using the same battle-tested slot patterns as the hardware audit collector. +// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN). +// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot. +func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower { + sdr := parsePSUSDR(sdrOutput) + if len(sdr) == 0 { + return nil + } + out := make(map[string]PSUSlotPower, len(sdr)) + for slot, entry := range sdr { + key := strconv.Itoa(slot - 1) // audit uses 0-based slot + out[key] = PSUSlotPower{ + InputW: entry.inputPowerW, + OutputW: entry.outputPowerW, + Status: entry.status, + } + } + return out +} + func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply { if len(sdr) == 0 { return nil diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 96a9ede..613f60f 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -1,6 +1,7 @@ package platform import ( + "bee/audit/internal/collector" "context" "encoding/csv" "encoding/json" @@ -2025,11 +2026,17 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { } } if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 { - if sp.ReportingRatio < 0.75 { + dcmiPartial := detectDCMIPartialCoverage(sp) + if sp.ReportingRatio < 0.75 && !dcmiPartial { findings = append(findings, fmt.Sprintf( "GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. 
Composite scores have been penalized accordingly.", sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio, )) + } else if sp.ReportingRatio < 0.75 && dcmiPartial { + findings = append(findings, fmt.Sprintf( + "IPMI DCMI covers partial PSU set (DCMI/SDR coverage %.0f%%): ratio %.2f reflects DCMI under-reporting, not GPU inaccuracy. GPU telemetry is the reliable power source; use SDR-based ratio for server-side accuracy.", + sp.DCMICoverageRatio*100, sp.ReportingRatio, + )) } else if sp.ReportingRatio > 1.25 { findings = append(findings, fmt.Sprintf( "Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.", @@ -2216,6 +2223,66 @@ func maxInt(a, b int) int { return b } +// detectDCMIPartialCoverage returns true when IPMI DCMI under-reports actual +// server power by comparing DCMI readings against SDR PSUx_POWER_IN sensor sums. +// +// Primary check: DCMI_idle / SDR_PSU_IN_idle — most reliable because GPU load +// is zero, so both sources measure the same server state. A ratio below 0.7 +// means DCMI misses ≥30% of installed PSUs (e.g. 0.50 = sees 2 of 4 PSUs). +// +// Fallback: DCMI_loaded / SDR_PSU_IN_loaded — less precise (GPU load may +// affect different PSUs differently) but still useful when idle SDR is absent. +// +// Returns false when SDR data is unavailable (server has no PSUx_POWER_IN +// sensors); the heuristic detectIPMISaturationFallback is used in that case. +func detectDCMIPartialCoverage(sp *BenchmarkServerPower) bool { + if sp == nil || !sp.Available { + return false + } + if sp.PSUInputIdleW > 0 && sp.IdleW > 0 { + return sp.IdleW/sp.PSUInputIdleW < 0.7 + } + if sp.PSUInputLoadedW > 0 && sp.LoadedW > 0 { + return sp.LoadedW/sp.PSUInputLoadedW < 0.7 + } + return false +} + +// detectIPMISaturationFallback is the heuristic used when SDR PSU sensors are +// absent. 
It analyses the power ramp: if 2+ of the last 3 per-step incremental +// DCMI/GPU ratios fall below 25% of the first-step ratio, DCMI has likely +// plateaued while GPU load continued to grow (saturation proxy). +// Prefer detectDCMIPartialCoverage when SDR data is available. +func detectIPMISaturationFallback(steps []NvidiaPowerBenchStep) bool { + type pt struct{ incIPMI, incGPU float64 } + var pts []pt + for i := 1; i < len(steps); i++ { + if steps[i].ServerDeltaW <= 0 || steps[i-1].ServerDeltaW <= 0 { + continue + } + incIPMI := steps[i].ServerDeltaW - steps[i-1].ServerDeltaW + incGPU := steps[i].TotalObservedPowerW - steps[i-1].TotalObservedPowerW + if incGPU <= 0 { + continue + } + pts = append(pts, pt{incIPMI, incGPU}) + } + if len(pts) < 3 { + return false + } + refRatio := pts[0].incIPMI / pts[0].incGPU + if refRatio <= 0 { + return false + } + saturated := 0 + for _, p := range pts[len(pts)-3:] { + if p.incIPMI/p.incGPU < refRatio*0.25 { + saturated++ + } + } + return saturated >= 2 +} + // psuStatusSnapshot samples PSU health sensor states via // `ipmitool sdr type "Power Supply"`. Returns a map of sensor name → reading // string (e.g. "Presence detected", "Failure detected"). Returns nil when IPMI @@ -2276,6 +2343,148 @@ func diffPSUStatus(before, after map[string]string) []string { return issues } +// sdrPowerSnapshot holds per-source power sums from a single `ipmitool sdr` read. +type sdrPowerSnapshot struct { + PSUInW float64 // sum of PSU AC input across all slots + PSUOutW float64 // sum of PSU DC output across all slots + GPUSlotW float64 // sum of GPU slot/GPU power sensors + + // Per-slot PSU data from collector.PSUSlotsFromSDR — same slot keys as + // audit HardwarePowerSupply.Slot (0-based strings). + PSUSlots map[string]BenchmarkPSUSlotPower + + SkippedSensors []string // sensors rejected during self-healing +} + +// sdrSensor is a name+watts pair used for GPU slot self-healing filtering. 
+type sdrSensor struct { + name string + watts float64 +} + +// filterSensorGroup removes physically implausible readings from a group. +// Hard bounds: 0 < watts ≤ maxPerSensorW. Within groups of 2+ sensors, +// values more than 5× the group median are rejected as stuck/fault sensors. +func filterSensorGroup(sensors []sdrSensor, maxPerSensorW float64) (valid []sdrSensor, skipped []string) { + var inBounds []sdrSensor + for _, s := range sensors { + if s.watts <= 0 || s.watts > maxPerSensorW { + skipped = append(skipped, fmt.Sprintf("%s (%.0f W: out of range 0–%.0f W)", s.name, s.watts, maxPerSensorW)) + } else { + inBounds = append(inBounds, s) + } + } + if len(inBounds) < 2 { + return inBounds, skipped + } + vals := make([]float64, len(inBounds)) + for i, s := range inBounds { + vals[i] = s.watts + } + sort.Float64s(vals) + mid := len(vals) / 2 + var median float64 + if len(vals)%2 == 0 { + median = (vals[mid-1] + vals[mid]) / 2 + } else { + median = vals[mid] + } + for _, s := range inBounds { + if median > 0 && s.watts > median*5 { + skipped = append(skipped, fmt.Sprintf("%s (%.0f W: >5× median %.0f W, likely sensor fault)", s.name, s.watts, median)) + } else { + valid = append(valid, s) + } + } + return valid, skipped +} + +// sampleIPMISDRPowerSensors reads power sensors from `ipmitool sdr` in a single +// invocation and returns self-healed grouped sums. +// +// PSU identification delegates to collector.PSUSlotsFromSDR which uses the same +// slot-detection regexes as the hardware audit (PSU1_POWER_IN, PSU1_PIN, PS1 POut, +// Power1…). Self-healing: bounds checking + 5× median outlier rejection. +// +// GPU slot sensors (GPU_POWER_SLOTx, GPU1 Power, …) are classified separately +// since the audit collector does not track GPU PCIe slot power. 
+func sampleIPMISDRPowerSensors() sdrPowerSnapshot { + raw, err := exec.Command("ipmitool", "sdr").Output() + if err != nil || len(raw) == 0 { + return sdrPowerSnapshot{} + } + sdrStr := string(raw) + var snap sdrPowerSnapshot + + // ── PSU data via audit collector ───────────────────────────────────────── + // collector.PSUSlotsFromSDR handles all vendor naming variants and applies + // bounds checking inside parseBoundedFloat (0 < w ≤ 6000 W). + collectorSlots := collector.PSUSlotsFromSDR(sdrStr) + + // Convert to benchmark type and apply cross-slot median filtering. + var psuInSensors, psuOutSensors []sdrSensor + for slotKey, sp := range collectorSlots { + bsp := BenchmarkPSUSlotPower{Status: sp.Status} + if sp.InputW != nil { + bsp.InputW = sp.InputW + psuInSensors = append(psuInSensors, sdrSensor{name: "PSU-slot-" + slotKey, watts: *sp.InputW}) + } + if sp.OutputW != nil { + bsp.OutputW = sp.OutputW + psuOutSensors = append(psuOutSensors, sdrSensor{name: "PSU-slot-" + slotKey + "-out", watts: *sp.OutputW}) + } + if snap.PSUSlots == nil { + snap.PSUSlots = make(map[string]BenchmarkPSUSlotPower) + } + snap.PSUSlots[slotKey] = bsp + } + + // Apply cross-slot outlier filter and sum. + validIn, skIn := filterSensorGroup(psuInSensors, 6000) + for _, s := range validIn { + snap.PSUInW += s.watts + } + snap.SkippedSensors = append(snap.SkippedSensors, skIn...) + + validOut, skOut := filterSensorGroup(psuOutSensors, 6000) + for _, s := range validOut { + snap.PSUOutW += s.watts + } + snap.SkippedSensors = append(snap.SkippedSensors, skOut...) + + // ── GPU slot sensors ───────────────────────────────────────────────────── + // collector does not track GPU PCIe slot power; classify here. + // Matches: GPU_POWER_SLOTx (MSI), GPU1 Power (xFusion), GPU_PWR_x (generic). 
+ var gpuSensors []sdrSensor + for _, line := range strings.Split(sdrStr, "\n") { + parts := strings.Split(line, "|") + if len(parts) < 2 { + continue + } + name := strings.TrimSpace(parts[0]) + nameLower := strings.ToLower(name) + if !strings.Contains(nameLower, "gpu") { + continue + } + if !strings.Contains(nameLower, "slot") && !strings.Contains(nameLower, "power") && + !strings.Contains(nameLower, "pwr") { + continue + } + var w float64 + if n, _ := fmt.Sscanf(strings.TrimSpace(parts[1]), "%f Watts", &w); n != 1 { + continue + } + gpuSensors = append(gpuSensors, sdrSensor{name: name, watts: w}) + } + validGPU, skGPU := filterSensorGroup(gpuSensors, 2000) + for _, s := range validGPU { + snap.GPUSlotW += s.watts + } + snap.SkippedSensors = append(snap.SkippedSensors, skGPU...) + + return snap +} + // queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi. // Returns 0 and an error if IPMI is unavailable or the output cannot be parsed. func queryIPMIServerPowerW() (float64, error) { @@ -3255,23 +3464,51 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW) if sp := result.ServerPower; sp != nil && sp.Available { - fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW) - fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio) + fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW) + if sp.PSUInputLoadedW > 0 { + psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW + fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta) + } + fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio) } b.WriteString("\n") // Server power comparison table. 
if sp := result.ServerPower; sp != nil { b.WriteString("## Server vs GPU Power Comparison\n\n") - b.WriteString("| Metric | Value |\n") - b.WriteString("|--------|-------|\n") - fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW) + b.WriteString("| Metric | Source | Value |\n") + b.WriteString("|--------|--------|-------|\n") + fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW) + fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW) + if sp.GPUSlotTotalW > 0 { + fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW) + } + if sp.Available { + fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW) + fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW) + fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW) + } + if sp.PSUInputLoadedW > 0 { + fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW) + fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW) + psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW + fmt.Fprintf(&b, "| PSU AC input Δ (loaded − idle) | IPMI SDR | %.0f W |\n", psuDelta) + } + if sp.PSUOutputLoadedW > 0 { + fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW) + fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW) + if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 { + psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100 + fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff) + } + } if sp.Available { - fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW) - fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW) - fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | %.0f W |\n", sp.DeltaW) ratio := 
sp.ReportingRatio + dcmiPartial := detectDCMIPartialCoverage(sp) || + (sp.PSUInputIdleW == 0 && detectIPMISaturationFallback(result.RampSteps)) ratioNote := "" switch { + case dcmiPartial: + ratioNote = "⚠ IPMI DCMI covers partial PSU set; use SDR ratio below for accuracy assessment" case ratio >= 0.9: ratioNote = "✓ GPU telemetry matches server power" case ratio >= 0.75: @@ -3279,14 +3516,83 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { default: ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power" } - fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote) + fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote) + if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 { + psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW + sdrRatio := psuDelta / sp.GPUReportedSumW + sdrNote := "" + switch { + case sdrRatio >= 0.9: + sdrNote = "✓ GPU telemetry matches wall power" + case sdrRatio >= 0.75: + sdrNote = "⚠ minor discrepancy" + default: + sdrNote = "✗ significant discrepancy" + } + fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote) + } } else { - b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n") + b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n") } for _, note := range sp.Notes { fmt.Fprintf(&b, "\n> %s\n", note) } b.WriteString("\n") + + if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 { + b.WriteString("## PSU Load Distribution\n\n") + b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n") + b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n") + + // collect all slot keys + slotSet := map[string]struct{}{} + for k := range 
sp.PSUSlotReadingsIdle { + slotSet[k] = struct{}{} + } + for k := range sp.PSUSlotReadingsLoaded { + slotSet[k] = struct{}{} + } + slots := make([]string, 0, len(slotSet)) + for k := range slotSet { + slots = append(slots, k) + } + sort.Strings(slots) + + for _, slot := range slots { + idle := sp.PSUSlotReadingsIdle[slot] + loaded := sp.PSUSlotReadingsLoaded[slot] + + fmtW := func(v *float64) string { + if v == nil { + return "—" + } + return fmt.Sprintf("%.0f W", *v) + } + + var deltaStr string + if idle.InputW != nil && loaded.InputW != nil { + deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW) + } else { + deltaStr = "—" + } + + status := loaded.Status + if status == "" { + status = idle.Status + } + if status == "" { + status = "—" + } + + fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n", + slot, + fmtW(idle.InputW), fmtW(loaded.InputW), + fmtW(idle.OutputW), fmtW(loaded.OutputW), + deltaStr, status, + ) + } + b.WriteString("\n") + } } if len(result.Findings) > 0 { @@ -3318,6 +3624,130 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } b.WriteString("\n") } + // ── PSU Issues ──────────────────────────────────────────────────────────── + if len(result.PSUIssues) > 0 { + b.WriteString("## PSU Issues\n\n") + b.WriteString("The following power supply anomalies were detected during the test:\n\n") + for _, issue := range result.PSUIssues { + fmt.Fprintf(&b, "- ⛔ %s\n", issue) + } + b.WriteString("\n") + } + + // ── Power Distribution Summary ──────────────────────────────────────────── + b.WriteString("## Power Distribution Summary\n\n") + { + var totalDefault, totalStable float64 + for _, gpu := range result.GPUs { + stable := gpu.StablePowerLimitW + if stable <= 0 { + stable = gpu.AppliedPowerLimitW + } + totalDefault += gpu.DefaultPowerLimitW + totalStable += stable + } + b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n") + 
b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n") + for _, gpu := range result.GPUs { + stable := gpu.StablePowerLimitW + if stable <= 0 { + stable = gpu.AppliedPowerLimitW + } + realization := "-" + if gpu.DefaultPowerLimitW > 0 && stable > 0 { + realization = fmt.Sprintf("%.1f%%", stable/gpu.DefaultPowerLimitW*100) + } + derated := "-" + if gpu.Derated { + derated = "⚠ yes" + } + fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n", + gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated) + } + platformReal := "-" + if totalDefault > 0 && totalStable > 0 { + platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100) + } + fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n", + totalDefault, totalStable, platformReal) + + // Balance across GPUs — only meaningful with 2+ GPUs. + if len(result.GPUs) > 1 { + var minS, maxS, sumS float64 + var cnt int + for _, gpu := range result.GPUs { + s := gpu.StablePowerLimitW + if s <= 0 { + s = gpu.AppliedPowerLimitW + } + if s <= 0 { + continue + } + sumS += s + cnt++ + if cnt == 1 || s < minS { + minS = s + } + if s > maxS { + maxS = s + } + } + if cnt > 0 { + avg := sumS / float64(cnt) + spread := (maxS - minS) / avg * 100 + balanceNote := "✓ balanced" + switch { + case spread > 20: + balanceNote = "⚠ significant imbalance — check slot thermals" + case spread > 10: + balanceNote = "— minor imbalance" + } + fmt.Fprintf(&b, "**GPU power balance:** avg %.0f W · min %.0f W · max %.0f W · spread %.1f%% — %s\n\n", + avg, minS, maxS, spread, balanceNote) + } + } + + // Ramp scalability table — power efficiency of adding each GPU. 
+ if len(result.RampSteps) > 1 { + b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n") + b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n") + b.WriteString("|------|------|-----------------------|-------------|---------------------|\n") + // First GPU stable TDP as the reference unit for efficiency. + var firstStable float64 + if len(result.GPUs) > 0 { + firstStable = result.GPUs[0].StablePowerLimitW + if firstStable <= 0 { + firstStable = result.GPUs[0].AppliedPowerLimitW + } + } + var prevCumulative float64 + for _, step := range result.RampSteps { + var cumulative float64 + for _, gpuIdx := range step.GPUIndices { + for _, g := range result.GPUs { + if g.Index != gpuIdx { + continue + } + s := g.StablePowerLimitW + if s <= 0 { + s = g.AppliedPowerLimitW + } + cumulative += s + } + } + incremental := cumulative - prevCumulative + efficiency := "—" + if step.StepIndex > 1 && firstStable > 0 { + efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100) + } + fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n", + step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency) + prevCumulative = cumulative + } + b.WriteString("\n") + } + } + b.WriteString("## Per-Slot Results\n\n") b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n") b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n") @@ -3440,6 +3870,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N serverIdleOK = true logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) } + sdrIdle := sampleIPMISDRPowerSensors() psuBefore := psuStatusSnapshot() // Phase 1: calibrate each GPU individually (sequentially, one at a time) to @@ -3753,11 +4184,52 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } // Characterize server power from IPMI idle/loaded 
samples. - // GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi). - // ReportingRatio = IPMI_delta / GPU_reported_sum: - // ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP. + // gpuActualSumW = sum of p95 GPU power from the last ramp step — actual + // measured consumption, not the stable limit cap. This is the correct + // denominator for the reporting ratio: limit caps (PlatformMaxTDPW) inflate + // the denominator and make the ratio appear artificially low. + var gpuActualSumW float64 + if n := len(result.RampSteps); n > 0 { + gpuActualSumW = result.RampSteps[n-1].TotalObservedPowerW + } + if gpuActualSumW <= 0 { + gpuActualSumW = result.PlatformMaxTDPW + } _ = serverIdleOK // used implicitly via characterizeServerPower - result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK) + result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK) + // Supplement DCMI with SDR multi-source data via collector's PSU slot patterns. + // Per-slot readings enable correlation with audit HardwarePowerSupply entries. 
+ if result.ServerPower != nil { + sdrLoaded := sampleIPMISDRPowerSensors() + result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW + result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW + result.ServerPower.PSUOutputIdleW = sdrIdle.PSUOutW + result.ServerPower.PSUOutputLoadedW = sdrLoaded.PSUOutW + result.ServerPower.GPUSlotTotalW = sdrLoaded.GPUSlotW + if len(sdrIdle.PSUSlots) > 0 { + result.ServerPower.PSUSlotReadingsIdle = sdrIdle.PSUSlots + } + if len(sdrLoaded.PSUSlots) > 0 { + result.ServerPower.PSUSlotReadingsLoaded = sdrLoaded.PSUSlots + } + if sdrIdle.PSUInW > 0 && result.ServerPower.IdleW > 0 { + result.ServerPower.DCMICoverageRatio = result.ServerPower.IdleW / sdrIdle.PSUInW + } + if len(sdrLoaded.SkippedSensors) > 0 { + result.ServerPower.Notes = append(result.ServerPower.Notes, + "SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; ")) + } + // Detect DCMI partial coverage: direct SDR comparison first, + // ramp heuristic as fallback when SDR PSU sensors are absent. + dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) || + (sdrIdle.PSUInW == 0 && detectIPMISaturationFallback(result.RampSteps)) + if dcmiUnreliable { + result.ServerPower.Notes = append(result.ServerPower.Notes, + fmt.Sprintf("IPMI DCMI covers only a subset of installed PSUs (coverage %.0f%%). "+ + "Use SDR PSU Δ ratio for GPU accuracy assessment; DCMI ratio is not reliable.", + result.ServerPower.DCMICoverageRatio*100)) + } + } result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot()) // Write top-level gpu-metrics.csv/.html aggregating all phases. 
writeBenchmarkMetricsFiles(runDir, allPowerRows) diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index a1caaf4..0b60bcb 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -275,18 +275,55 @@ type BenchmarkScorecard struct { TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"` } -// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported -// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power -// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is -// over-reporting its power consumption. +// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled +// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based) +// so benchmark and audit data can be correlated by slot. +type BenchmarkPSUSlotPower struct { + InputW *float64 `json:"input_w,omitempty"` // AC wall input (PSUx_POWER_IN) + OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT) + Status string `json:"status,omitempty"` +} + +// BenchmarkServerPower captures server-side power from multiple independent +// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and +// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI +// covers only a subset of installed PSUs (partial coverage). 
+// +// Source legend: +// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs +// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable +// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load type BenchmarkServerPower struct { - Available bool `json:"available"` - IdleW float64 `json:"idle_w,omitempty"` - LoadedW float64 `json:"loaded_w,omitempty"` - DeltaW float64 `json:"delta_w,omitempty"` - GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"` - ReportingRatio float64 `json:"reporting_ratio,omitempty"` - Notes []string `json:"notes,omitempty"` + Available bool `json:"available"` + IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle + LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load + DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle + GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"` + ReportingRatio float64 `json:"reporting_ratio,omitempty"` + + // PSU AC input sum — sampled at idle and at peak load using collector's + // slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…). + PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"` + PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"` + + // PSU DC output sum — power delivered to server internals after conversion. + PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"` + PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"` + + // Per-slot PSU readings at idle and at peak load. + // Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot. + PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"` + PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"` + + // GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load. + // PCIe slot delivery only (excludes 16-pin connector power). 
+ GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"` + + // DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle. + // Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs. + DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"` + + Notes []string `json:"notes,omitempty"` } // BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected