Add multi-source PSU power triangulation and per-slot distribution table

- collector/psu.go: export PSUSlotsFromSDR(), reusing the slot regex patterns; add isPSUInputPower/isPSUOutputPower helpers covering MSI/MLT/xFusion/HPE naming; add xFusion Power<N> slot pattern; add parseBoundedFloat for self-healing (rejects zero/negative/out-of-range sensor readings); default fallback treats unclassified PSU sensors as AC input
- benchmark_types.go: add BenchmarkPSUSlotPower struct; BenchmarkServerPower gains PSUInputIdleW/PSUInputLoadedW, PSUOutputIdleW/PSUOutputLoadedW, PSUSlotReadingsIdle/PSUSlotReadingsLoaded, GPUSlotTotalW, DCMICoverageRatio fields
- benchmark.go: sampleIPMISDRPowerSensors uses collector.PSUSlotsFromSDR instead of a custom classifier; detectDCMIPartialCoverage replaces the ramp heuristic — it compares DCMI idle vs the SDR PSU sum and flags a ratio < 0.70 as partial coverage; detectIPMISaturationFallback is kept for servers without SDR PSU sensors; the report gains a PSU Load Distribution table (per-slot AC/DC idle vs loaded, Δ)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 13:07:48 +03:00
parent d60f7758ba
commit 61c7abaa80
3 changed files with 624 additions and 37 deletions
--- a/audit/internal/collector/psu.go
+++ b/audit/internal/collector/psu.go
@@ -160,11 +160,54 @@ type psuSDR struct {
 }

 var psuSlotPatterns = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
+	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),                    // PSU1, PS1, ps 2
+	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),                      // PS 6, PS6
+	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),                     // PWS1
+	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
+	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),                     // Bay 1
+	// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
+	// Must be last: "power supply N" is already caught by the pattern above.
+	regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
+}
+
+// isPSUInputPower reports whether a lower-cased sensor name is an AC-input power reading. Vendor examples:
+//   MSI:     PSU1_POWER_IN, PSU1_PIN
+//   MLT:     PSU1_PIN
+//   xFusion: (matched via default fallback — no explicit keyword)
+//   HPE:     PS1 Input Power, PS1 Input Watts
+func isPSUInputPower(name string) bool {
+	return strings.Contains(name, "input power") ||
+		strings.Contains(name, "input watts") ||
+		strings.Contains(name, "_pin") ||
+		strings.Contains(name, " pin") ||
+		strings.Contains(name, "_power_in") ||
+		strings.Contains(name, "power_in")
+}
+
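+// isPSUOutputPower reports whether a lower-cased sensor name is a DC-output power reading (generic "power supply bay"/"psu bay" sensors count as output too). Vendor examples: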
+//   MSI:     PSU1_POWER_OUT
+//   MLT:     PSU1_POUT
+//   xFusion: PS1 POut
+func isPSUOutputPower(name string) bool {
+	return strings.Contains(name, "output power") ||
+		strings.Contains(name, "output watts") ||
+		strings.Contains(name, "_pout") ||
+		strings.Contains(name, " pout") ||
+		strings.Contains(name, "_power_out") ||
+		strings.Contains(name, "power_out") ||
+		strings.Contains(name, "power supply bay") ||
+		strings.Contains(name, "psu bay")
+}
+
+// parseBoundedFloat parses a numeric value from an SDR value field and
+// validates it is within (0, max]. Returns nil for zero, negative, or
+// out-of-range values — these indicate missing/off/fault sensor readings.
+func parseBoundedFloat(raw string, max float64) *float64 {
+	v := parseFloatPtr(raw)
+	if v == nil || *v <= 0 || *v > max {
+		return nil
+	}
+	return v
 }

 func parsePSUSDR(raw string) map[int]psuSDR {
@@ -194,24 +237,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {

 		lowerName := strings.ToLower(name)
 		switch {
-		case strings.Contains(lowerName, "input power"):
-			entry.inputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "output power"):
-			entry.outputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
-			entry.outputPowerW = parseFloatPtr(value)
+		case isPSUInputPower(lowerName):
+			entry.inputPowerW = parseBoundedFloat(value, 6000)
+		case isPSUOutputPower(lowerName):
+			entry.outputPowerW = parseBoundedFloat(value, 6000)
 		case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
 			entry.inputVoltage = parseFloatPtr(value)
 		case strings.Contains(lowerName, "temp"):
 			entry.temperatureC = parseFloatPtr(value)
 		case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
 			entry.healthPct = parsePercentPtr(value)
+		default:
+			// Generic PSU reading: the sensor matched a slot pattern but none of the keywords
+			// above (e.g. xFusion "Power1", "Power2"). Stored as AC input when the value is within
+			// (0, 6000] and no input is set yet. NOTE(review): units are not checked here — confirm.
+			if entry.inputPowerW == nil {
+				entry.inputPowerW = parseBoundedFloat(value, 6000)
+			}
 		}
 		out[slot] = entry
 	}
 	return out
 }

+// PSUSlotPower holds SDR power readings for one PSU slot.
+// Slot key used by PSUSlotsFromSDR is the 0-based index string,
+// matching HardwarePowerSupply.Slot in the audit schema.
+type PSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`
+	OutputW *float64 `json:"output_w,omitempty"`
+	Status  string   `json:"status,omitempty"`
+}
+
+// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
+// using the same battle-tested slot patterns as the hardware audit collector.
+// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
+// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
+func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
+	sdr := parsePSUSDR(sdrOutput)
+	if len(sdr) == 0 {
+		return nil
+	}
+	out := make(map[string]PSUSlotPower, len(sdr))
+	for slot, entry := range sdr {
+		key := strconv.Itoa(slot - 1) // audit uses 0-based slot
+		out[key] = PSUSlotPower{
+			InputW:  entry.inputPowerW,
+			OutputW: entry.outputPowerW,
+			Status:  entry.status,
+		}
+	}
+	return out
+}
+
 func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
 	if len(sdr) == 0 {
 		return nil
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -1,6 +1,7 @@
 package platform

 import (
+	"bee/audit/internal/collector"
 	"context"
 	"encoding/csv"
 	"encoding/json"
@@ -2025,11 +2026,17 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 		}
 	}
 	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
-		if sp.ReportingRatio < 0.75 {
+		dcmiPartial := detectDCMIPartialCoverage(sp)
+		if sp.ReportingRatio < 0.75 && !dcmiPartial {
 			findings = append(findings, fmt.Sprintf(
 				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
 				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
 			))
+		} else if sp.ReportingRatio < 0.75 && dcmiPartial {
+			findings = append(findings, fmt.Sprintf(
+				"IPMI DCMI covers partial PSU set (DCMI/SDR coverage %.0f%%): ratio %.2f reflects DCMI under-reporting, not GPU inaccuracy. GPU telemetry is the reliable power source; use SDR-based ratio for server-side accuracy.",
+				sp.DCMICoverageRatio*100, sp.ReportingRatio,
+			))
 		} else if sp.ReportingRatio > 1.25 {
 			findings = append(findings, fmt.Sprintf(
 				"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
@@ -2216,6 +2223,66 @@ func maxInt(a, b int) int {
 	return b
 }

+// detectDCMIPartialCoverage returns true when IPMI DCMI under-reports actual
+// server power by comparing DCMI readings against SDR PSUx_POWER_IN sensor sums.
+//
+// Primary check: DCMI_idle / SDR_PSU_IN_idle — most reliable because GPU load
+// is zero, so both sources measure the same server state. A ratio below 0.7
+// means DCMI misses more than 30% of the PSU input power (e.g. 0.50 ≈ sees 2 of 4 PSUs).
+//
+// Fallback: DCMI_loaded / SDR_PSU_IN_loaded — less precise (GPU load may
+// affect different PSUs differently) but still useful when idle SDR is absent.
+//
+// Returns false when SDR data is unavailable (server has no PSUx_POWER_IN
+// sensors); the heuristic detectIPMISaturationFallback is used in that case.
+func detectDCMIPartialCoverage(sp *BenchmarkServerPower) bool {
+	if sp == nil || !sp.Available {
+		return false
+	}
+	if sp.PSUInputIdleW > 0 && sp.IdleW > 0 {
+		return sp.IdleW/sp.PSUInputIdleW < 0.7
+	}
+	if sp.PSUInputLoadedW > 0 && sp.LoadedW > 0 {
+		return sp.LoadedW/sp.PSUInputLoadedW < 0.7
+	}
+	return false
+}
+
+// detectIPMISaturationFallback is the heuristic used when SDR PSU sensors are
+// absent. It analyses the power ramp: if 2+ of the last 3 per-step incremental
+// DCMI/GPU ratios fall below 25% of the first-step ratio, DCMI has likely
+// plateaued while GPU load continued to grow (saturation proxy).
+// Prefer detectDCMIPartialCoverage when SDR data is available.
+func detectIPMISaturationFallback(steps []NvidiaPowerBenchStep) bool {
+	type pt struct{ incIPMI, incGPU float64 }
+	var pts []pt
+	for i := 1; i < len(steps); i++ {
+		if steps[i].ServerDeltaW <= 0 || steps[i-1].ServerDeltaW <= 0 {
+			continue
+		}
+		incIPMI := steps[i].ServerDeltaW - steps[i-1].ServerDeltaW
+		incGPU := steps[i].TotalObservedPowerW - steps[i-1].TotalObservedPowerW
+		if incGPU <= 0 {
+			continue
+		}
+		pts = append(pts, pt{incIPMI, incGPU})
+	}
+	if len(pts) < 3 {
+		return false
+	}
+	refRatio := pts[0].incIPMI / pts[0].incGPU
+	if refRatio <= 0 {
+		return false
+	}
+	saturated := 0
+	for _, p := range pts[len(pts)-3:] {
+		if p.incIPMI/p.incGPU < refRatio*0.25 {
+			saturated++
+		}
+	}
+	return saturated >= 2
+}
+
 // psuStatusSnapshot samples PSU health sensor states via
 // `ipmitool sdr type "Power Supply"`. Returns a map of sensor name → reading
 // string (e.g. "Presence detected", "Failure detected"). Returns nil when IPMI
@@ -2276,6 +2343,148 @@ func diffPSUStatus(before, after map[string]string) []string {
 	return issues
 }

+// sdrPowerSnapshot holds per-source power sums from a single `ipmitool sdr` read.
+type sdrPowerSnapshot struct {
+	PSUInW   float64 // sum of PSU AC input across all slots
+	PSUOutW  float64 // sum of PSU DC output across all slots
+	GPUSlotW float64 // sum of GPU slot/GPU power sensors
+
+	// Per-slot PSU data from collector.PSUSlotsFromSDR — same slot keys as
+	// audit HardwarePowerSupply.Slot (0-based strings).
+	PSUSlots map[string]BenchmarkPSUSlotPower
+
+	SkippedSensors []string // sensors rejected during self-healing
+}
+
+// sdrSensor is a name+watts pair used for self-healing filtering of PSU and GPU sensor groups.
+type sdrSensor struct {
+	name  string
+	watts float64
+}
+
+// filterSensorGroup removes physically implausible readings from a group.
+// Hard bounds: 0 < watts ≤ maxPerSensorW. Within groups of 2+ sensors,
+// values more than 5× the group median are rejected as stuck/fault sensors.
+func filterSensorGroup(sensors []sdrSensor, maxPerSensorW float64) (valid []sdrSensor, skipped []string) {
+	var inBounds []sdrSensor
+	for _, s := range sensors {
+		if s.watts <= 0 || s.watts > maxPerSensorW {
+			skipped = append(skipped, fmt.Sprintf("%s (%.0f W: out of range 0–%.0f W)", s.name, s.watts, maxPerSensorW))
+		} else {
+			inBounds = append(inBounds, s)
+		}
+	}
+	if len(inBounds) < 2 {
+		return inBounds, skipped
+	}
+	vals := make([]float64, len(inBounds))
+	for i, s := range inBounds {
+		vals[i] = s.watts
+	}
+	sort.Float64s(vals)
+	mid := len(vals) / 2
+	var median float64
+	if len(vals)%2 == 0 {
+		median = (vals[mid-1] + vals[mid]) / 2
+	} else {
+		median = vals[mid]
+	}
+	for _, s := range inBounds {
+		if median > 0 && s.watts > median*5 {
+			skipped = append(skipped, fmt.Sprintf("%s (%.0f W: >5× median %.0f W, likely sensor fault)", s.name, s.watts, median))
+		} else {
+			valid = append(valid, s)
+		}
+	}
+	return valid, skipped
+}
+
+// sampleIPMISDRPowerSensors reads power sensors from `ipmitool sdr` in a single
+// invocation and returns self-healed grouped sums.
+//
+// PSU identification delegates to collector.PSUSlotsFromSDR which uses the same
+// slot-detection regexes as the hardware audit (PSU1_POWER_IN, PSU1_PIN, PS1 POut,
+// Power1…). Self-healing: bounds checking + 5× median outlier rejection.
+//
+// GPU slot sensors (GPU_POWER_SLOTx, GPU1 Power, …) are classified separately
+// since the audit collector does not track GPU PCIe slot power.
+func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
+	raw, err := exec.Command("ipmitool", "sdr").Output()
+	if err != nil || len(raw) == 0 {
+		return sdrPowerSnapshot{}
+	}
+	sdrStr := string(raw)
+	var snap sdrPowerSnapshot
+
+	// ── PSU data via audit collector ─────────────────────────────────────────
+	// collector.PSUSlotsFromSDR handles all vendor naming variants and applies
+	// bounds checking inside parseBoundedFloat (0 < w ≤ 6000 W).
+	collectorSlots := collector.PSUSlotsFromSDR(sdrStr)
+
+	// Convert to benchmark type and apply cross-slot median filtering.
+	var psuInSensors, psuOutSensors []sdrSensor
+	for slotKey, sp := range collectorSlots {
+		bsp := BenchmarkPSUSlotPower{Status: sp.Status}
+		if sp.InputW != nil {
+			bsp.InputW = sp.InputW
+			psuInSensors = append(psuInSensors, sdrSensor{name: "PSU-slot-" + slotKey, watts: *sp.InputW})
+		}
+		if sp.OutputW != nil {
+			bsp.OutputW = sp.OutputW
+			psuOutSensors = append(psuOutSensors, sdrSensor{name: "PSU-slot-" + slotKey + "-out", watts: *sp.OutputW})
+		}
+		if snap.PSUSlots == nil {
+			snap.PSUSlots = make(map[string]BenchmarkPSUSlotPower)
+		}
+		snap.PSUSlots[slotKey] = bsp
+	}
+
+	// Apply cross-slot outlier filter and sum.
+	validIn, skIn := filterSensorGroup(psuInSensors, 6000)
+	for _, s := range validIn {
+		snap.PSUInW += s.watts
+	}
+	snap.SkippedSensors = append(snap.SkippedSensors, skIn...)
+
+	validOut, skOut := filterSensorGroup(psuOutSensors, 6000)
+	for _, s := range validOut {
+		snap.PSUOutW += s.watts
+	}
+	snap.SkippedSensors = append(snap.SkippedSensors, skOut...)
+
+	// ── GPU slot sensors ─────────────────────────────────────────────────────
+	// collector does not track GPU PCIe slot power; classify here.
+	// Matches: GPU_POWER_SLOTx (MSI), GPU1 Power (xFusion), GPU_PWR_x (generic).
+	var gpuSensors []sdrSensor
+	for _, line := range strings.Split(sdrStr, "\n") {
+		parts := strings.Split(line, "|")
+		if len(parts) < 2 {
+			continue
+		}
+		name := strings.TrimSpace(parts[0])
+		nameLower := strings.ToLower(name)
+		if !strings.Contains(nameLower, "gpu") {
+			continue
+		}
+		if !strings.Contains(nameLower, "slot") && !strings.Contains(nameLower, "power") &&
+			!strings.Contains(nameLower, "pwr") {
+			continue
+		}
+		var w float64
+		if n, _ := fmt.Sscanf(strings.TrimSpace(parts[1]), "%f Watts", &w); n != 1 {
+			continue
+		}
+		gpuSensors = append(gpuSensors, sdrSensor{name: name, watts: w})
+	}
+	validGPU, skGPU := filterSensorGroup(gpuSensors, 2000)
+	for _, s := range validGPU {
+		snap.GPUSlotW += s.watts
+	}
+	snap.SkippedSensors = append(snap.SkippedSensors, skGPU...)
+
+	return snap
+}
+
 // queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
 // Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
 func queryIPMIServerPowerW() (float64, error) {
@@ -3255,23 +3464,51 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
 	fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W  \n", result.PlatformMaxTDPW)
 	if sp := result.ServerPower; sp != nil && sp.Available {
-		fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W  \n", sp.DeltaW)
-		fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f  \n", sp.ReportingRatio)
+		fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W  \n", sp.DeltaW)
+		if sp.PSUInputLoadedW > 0 {
+			psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
+			fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W  \n", psuDelta)
+		}
+		fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f  \n", sp.ReportingRatio)
 	}
 	b.WriteString("\n")
 	// Server power comparison table.
 	if sp := result.ServerPower; sp != nil {
 		b.WriteString("## Server vs GPU Power Comparison\n\n")
-		b.WriteString("| Metric | Value |\n")
-		b.WriteString("|--------|-------|\n")
-		fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
+		b.WriteString("| Metric | Source | Value |\n")
+		b.WriteString("|--------|--------|-------|\n")
+		fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW)
+		fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW)
+		if sp.GPUSlotTotalW > 0 {
+			fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW)
+		}
+		if sp.Available {
+			fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW)
+			fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW)
+			fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW)
+		}
+		if sp.PSUInputLoadedW > 0 {
+			fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW)
+			fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW)
+			psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
+			fmt.Fprintf(&b, "| PSU AC input Δ (loaded − idle) | IPMI SDR | %.0f W |\n", psuDelta)
+		}
+		if sp.PSUOutputLoadedW > 0 {
+			fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW)
+			fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW)
+			if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
+				psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
+				fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff)
+			}
+		}
 		if sp.Available {
-			fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
-			fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
-			fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | %.0f W |\n", sp.DeltaW)
 			ratio := sp.ReportingRatio
+			dcmiPartial := detectDCMIPartialCoverage(sp) ||
+				(sp.PSUInputIdleW == 0 && detectIPMISaturationFallback(result.RampSteps))
 			ratioNote := ""
 			switch {
+			case dcmiPartial:
+				ratioNote = "⚠ IPMI DCMI covers partial PSU set; use SDR ratio below for accuracy assessment"
 			case ratio >= 0.9:
 				ratioNote = "✓ GPU telemetry matches server power"
 			case ratio >= 0.75:
@@ -3279,14 +3516,83 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			default:
 				ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
 			}
-			fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
+			fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote)
+			if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
+				psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
+				sdrRatio := psuDelta / sp.GPUReportedSumW
+				sdrNote := ""
+				switch {
+				case sdrRatio >= 0.9:
+					sdrNote = "✓ GPU telemetry matches wall power"
+				case sdrRatio >= 0.75:
+					sdrNote = "⚠ minor discrepancy"
+				default:
+					sdrNote = "✗ significant discrepancy"
+				}
+				fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote)
+			}
 		} else {
-			b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
+			b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n")
 		}
 		for _, note := range sp.Notes {
 			fmt.Fprintf(&b, "\n> %s\n", note)
 		}
 		b.WriteString("\n")
+
+		if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 {
+			b.WriteString("## PSU Load Distribution\n\n")
+			b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n")
+			b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n")
+
+			// collect all slot keys
+			slotSet := map[string]struct{}{}
+			for k := range sp.PSUSlotReadingsIdle {
+				slotSet[k] = struct{}{}
+			}
+			for k := range sp.PSUSlotReadingsLoaded {
+				slotSet[k] = struct{}{}
+			}
+			slots := make([]string, 0, len(slotSet))
+			for k := range slotSet {
+				slots = append(slots, k)
+			}
+			sort.Strings(slots)
+
+			for _, slot := range slots {
+				idle := sp.PSUSlotReadingsIdle[slot]
+				loaded := sp.PSUSlotReadingsLoaded[slot]
+
+				fmtW := func(v *float64) string {
+					if v == nil {
+						return "—"
+					}
+					return fmt.Sprintf("%.0f W", *v)
+				}
+
+				var deltaStr string
+				if idle.InputW != nil && loaded.InputW != nil {
+					deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW)
+				} else {
+					deltaStr = "—"
+				}
+
+				status := loaded.Status
+				if status == "" {
+					status = idle.Status
+				}
+				if status == "" {
+					status = "—"
+				}
+
+				fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n",
+					slot,
+					fmtW(idle.InputW), fmtW(loaded.InputW),
+					fmtW(idle.OutputW), fmtW(loaded.OutputW),
+					deltaStr, status,
+				)
+			}
+			b.WriteString("\n")
+		}
 	}

 	if len(result.Findings) > 0 {
@@ -3318,6 +3624,130 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 		}
 		b.WriteString("\n")
 	}
+	// ── PSU Issues ────────────────────────────────────────────────────────────
+	if len(result.PSUIssues) > 0 {
+		b.WriteString("## PSU Issues\n\n")
+		b.WriteString("The following power supply anomalies were detected during the test:\n\n")
+		for _, issue := range result.PSUIssues {
+			fmt.Fprintf(&b, "- ⛔ %s\n", issue)
+		}
+		b.WriteString("\n")
+	}
+
+	// ── Power Distribution Summary ────────────────────────────────────────────
+	b.WriteString("## Power Distribution Summary\n\n")
+	{
+		var totalDefault, totalStable float64
+		for _, gpu := range result.GPUs {
+			stable := gpu.StablePowerLimitW
+			if stable <= 0 {
+				stable = gpu.AppliedPowerLimitW
+			}
+			totalDefault += gpu.DefaultPowerLimitW
+			totalStable += stable
+		}
+		b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n")
+		b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n")
+		for _, gpu := range result.GPUs {
+			stable := gpu.StablePowerLimitW
+			if stable <= 0 {
+				stable = gpu.AppliedPowerLimitW
+			}
+			realization := "-"
+			if gpu.DefaultPowerLimitW > 0 && stable > 0 {
+				realization = fmt.Sprintf("%.1f%%", stable/gpu.DefaultPowerLimitW*100)
+			}
+			derated := "-"
+			if gpu.Derated {
+				derated = "⚠ yes"
+			}
+			fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n",
+				gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated)
+		}
+		platformReal := "-"
+		if totalDefault > 0 && totalStable > 0 {
+			platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100)
+		}
+		fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n",
+			totalDefault, totalStable, platformReal)
+
+		// Balance across GPUs — only meaningful with 2+ GPUs.
+		if len(result.GPUs) > 1 {
+			var minS, maxS, sumS float64
+			var cnt int
+			for _, gpu := range result.GPUs {
+				s := gpu.StablePowerLimitW
+				if s <= 0 {
+					s = gpu.AppliedPowerLimitW
+				}
+				if s <= 0 {
+					continue
+				}
+				sumS += s
+				cnt++
+				if cnt == 1 || s < minS {
+					minS = s
+				}
+				if s > maxS {
+					maxS = s
+				}
+			}
+			if cnt > 0 {
+				avg := sumS / float64(cnt)
+				spread := (maxS - minS) / avg * 100
+				balanceNote := "✓ balanced"
+				switch {
+				case spread > 20:
+					balanceNote = "⚠ significant imbalance — check slot thermals"
+				case spread > 10:
+					balanceNote = "— minor imbalance"
+				}
+				fmt.Fprintf(&b, "**GPU power balance:** avg %.0f W · min %.0f W · max %.0f W · spread %.1f%% — %s\n\n",
+					avg, minS, maxS, spread, balanceNote)
+			}
+		}
+
+		// Ramp scalability table — power efficiency of adding each GPU.
+		if len(result.RampSteps) > 1 {
+			b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n")
+			b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n")
+			b.WriteString("|------|------|-----------------------|-------------|---------------------|\n")
+			// First GPU stable TDP as the reference unit for efficiency.
+			var firstStable float64
+			if len(result.GPUs) > 0 {
+				firstStable = result.GPUs[0].StablePowerLimitW
+				if firstStable <= 0 {
+					firstStable = result.GPUs[0].AppliedPowerLimitW
+				}
+			}
+			var prevCumulative float64
+			for _, step := range result.RampSteps {
+				var cumulative float64
+				for _, gpuIdx := range step.GPUIndices {
+					for _, g := range result.GPUs {
+						if g.Index != gpuIdx {
+							continue
+						}
+						s := g.StablePowerLimitW
+						if s <= 0 {
+							s = g.AppliedPowerLimitW
+						}
+						cumulative += s
+					}
+				}
+				incremental := cumulative - prevCumulative
+				efficiency := "—"
+				if step.StepIndex > 1 && firstStable > 0 {
+					efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100)
+				}
+				fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n",
+					step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency)
+				prevCumulative = cumulative
+			}
+			b.WriteString("\n")
+		}
+	}
+
 	b.WriteString("## Per-Slot Results\n\n")
 	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
 	b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
@@ -3440,6 +3870,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		serverIdleOK = true
 		logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
 	}
+	sdrIdle := sampleIPMISDRPowerSensors()
 	psuBefore := psuStatusSnapshot()

 	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
@@ -3753,11 +4184,52 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	}

 	// Characterize server power from IPMI idle/loaded samples.
-	// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
-	// ReportingRatio = IPMI_delta / GPU_reported_sum:
-	//   ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
+	// gpuActualSumW = sum of p95 GPU power from the last ramp step — actual
+	// measured consumption, not the stable limit cap. This is the correct
+	// denominator for the reporting ratio: limit caps (PlatformMaxTDPW) inflate
+	// the denominator and make the ratio appear artificially low.
+	var gpuActualSumW float64
+	if n := len(result.RampSteps); n > 0 {
+		gpuActualSumW = result.RampSteps[n-1].TotalObservedPowerW
+	}
+	if gpuActualSumW <= 0 {
+		gpuActualSumW = result.PlatformMaxTDPW
+	}
 	_ = serverIdleOK // used implicitly via characterizeServerPower
-	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
+	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK)
+	// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
+	// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
+	if result.ServerPower != nil {
+		sdrLoaded := sampleIPMISDRPowerSensors()
+		result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
+		result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
+		result.ServerPower.PSUOutputIdleW = sdrIdle.PSUOutW
+		result.ServerPower.PSUOutputLoadedW = sdrLoaded.PSUOutW
+		result.ServerPower.GPUSlotTotalW = sdrLoaded.GPUSlotW
+		if len(sdrIdle.PSUSlots) > 0 {
+			result.ServerPower.PSUSlotReadingsIdle = sdrIdle.PSUSlots
+		}
+		if len(sdrLoaded.PSUSlots) > 0 {
+			result.ServerPower.PSUSlotReadingsLoaded = sdrLoaded.PSUSlots
+		}
+		if sdrIdle.PSUInW > 0 && result.ServerPower.IdleW > 0 {
+			result.ServerPower.DCMICoverageRatio = result.ServerPower.IdleW / sdrIdle.PSUInW
+		}
+		if len(sdrLoaded.SkippedSensors) > 0 {
+			result.ServerPower.Notes = append(result.ServerPower.Notes,
+				"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
+		}
+		// Detect DCMI partial coverage: direct SDR comparison first,
+		// ramp heuristic as fallback when SDR PSU sensors are absent.
+		dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||
+			(sdrIdle.PSUInW == 0 && detectIPMISaturationFallback(result.RampSteps))
+		if dcmiUnreliable {
+			result.ServerPower.Notes = append(result.ServerPower.Notes,
+				fmt.Sprintf("IPMI DCMI covers only a subset of installed PSUs (coverage %.0f%%). "+
+					"Use SDR PSU Δ ratio for GPU accuracy assessment; DCMI ratio is not reliable.",
+					result.ServerPower.DCMICoverageRatio*100))
+		}
+	}
 	result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot())
 	// Write top-level gpu-metrics.csv/.html aggregating all phases.
 	writeBenchmarkMetricsFiles(runDir, allPowerRows)
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -275,18 +275,55 @@ type BenchmarkScorecard struct {
 	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }

-// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
-// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
-// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
-// over-reporting its power consumption.
+// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
+// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
+// so benchmark and audit data can be correlated by slot.
+type BenchmarkPSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`  // AC wall input (PSUx_POWER_IN)
+	OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
+	Status  string   `json:"status,omitempty"`
+}
+
+// BenchmarkServerPower captures server-side power from multiple independent
+// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
+// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
+// covers only a subset of installed PSUs (partial coverage).
+//
+// Source legend:
+//   - DCMI      — `ipmitool dcmi power reading`; fast but may miss PSUs
+//   - SDR       — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
+//   - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
 type BenchmarkServerPower struct {
-	Available       bool     `json:"available"`
-	IdleW           float64  `json:"idle_w,omitempty"`
-	LoadedW         float64  `json:"loaded_w,omitempty"`
-	DeltaW          float64  `json:"delta_w,omitempty"`
-	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
-	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
-	Notes           []string `json:"notes,omitempty"`
+	Available       bool    `json:"available"`
+	IdleW           float64 `json:"idle_w,omitempty"`   // DCMI at idle
+	LoadedW         float64 `json:"loaded_w,omitempty"` // DCMI at peak load
+	DeltaW          float64 `json:"delta_w,omitempty"`  // DCMI loaded − idle
+	GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
+	ReportingRatio  float64 `json:"reporting_ratio,omitempty"`
+
+	// PSU AC input sum — sampled at idle and at peak load using collector's
+	// slot patterns and input keywords (PSU1_POWER_IN, PSU1_PIN, PS1 Input Power, Power1…).
+	PSUInputIdleW   float64 `json:"psu_input_idle_w,omitempty"`
+	PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
+
+	// PSU DC output sum — power delivered to server internals after conversion.
+	PSUOutputIdleW   float64 `json:"psu_output_idle_w,omitempty"`
+	PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
+
+	// Per-slot PSU readings at idle and at peak load.
+	// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
+	PSUSlotReadingsIdle   map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
+	PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
+
+	// GPUSlotTotalW is the sum of GPU power SDR sensors (GPU_POWER_SLOTx, GPU1 Power, GPU_PWR_x) at peak load.
+	// NOTE(review): presumably PCIe slot delivery only (excludes 16-pin connector power) — vendor-dependent, confirm.
+	GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
+
+	// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
+	// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
+	DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
+
+	Notes []string `json:"notes,omitempty"`
 }

 // BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected