Add multi-source PSU power triangulation and per-slot distribution table

- collector/psu.go: export PSUSlotsFromSDR() reusing slot regex patterns;
  add isPSUInputPower/isPSUOutputPower helpers covering MSI/MLT/xFusion/HPE
  naming; add xFusion Power<N> slot pattern; parseBoundedFloat for self-healing
  (rejects zero/negative/out-of-range sensor readings); default fallback treats
  unclassified PSU sensors as AC input
- benchmark_types.go: BenchmarkPSUSlotPower struct; BenchmarkServerPower gains
  PSUInputIdle/Loaded, PSUOutputIdle/Loaded, PSUSlotReadingsIdle/Loaded,
  GPUSlotTotalW, DCMICoverageRatio fields
- benchmark.go: sampleIPMISDRPowerSensors uses collector.PSUSlotsFromSDR instead
  of custom classifier; detectDCMIPartialCoverage replaces ramp heuristic —
  compares DCMI idle vs SDR PSU sum, flags <0.70 ratio as partial coverage;
  detectIPMISaturationFallback kept for servers without SDR PSU sensors;
  report gains PSU Load Distribution table (per-slot AC/DC idle vs loaded, Δ)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 13:07:48 +03:00
parent d60f7758ba
commit 61c7abaa80
3 changed files with 624 additions and 37 deletions

View File

@@ -160,11 +160,54 @@ type psuSDR struct {
}
// psuSlotPatterns lists the case-insensitive regular expressions used to
// recognise a PSU slot number inside a sensor name. Every pattern has exactly
// one capturing group, which holds the decimal slot number.
//
// NOTE(review): the code that walks this slice is outside this view; the
// "must be last" remark below suggests the first matching pattern wins —
// confirm against the matching loop.
//
// NOTE(review): in Go's RE2 syntax `\b` is an ASCII word boundary and `_` is a
// word character, so a name such as "PSU1_POWER_IN" has no boundary after the
// digit and none of these patterns match it as written. Presumably the caller
// normalises underscores before matching — verify.
var psuSlotPatterns = []*regexp.Regexp{
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), // Bay 1
// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
// Must be last: "power supply N" is already caught by the pattern above.
regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
}
// isPSUInputPower reports whether name identifies a PSU AC-input power sensor.
//
// Matching is by plain substring against lower-case keywords, so name must
// already be lower-cased by the caller.
//
// Vendor naming covered:
//
//	MSI: PSU1_POWER_IN, PSU1_PIN
//	MLT: PSU1_PIN
//	HPE: PS1 Input Power, PS1 Input Watts
//
// xFusion sensors such as "Power1" carry no input keyword and are not matched
// here; they are handled by the default fallback of the caller's switch.
func isPSUInputPower(name string) bool {
	// "power_in" also covers "_power_in", so a separate check for the
	// underscore-prefixed form is unnecessary.
	for _, keyword := range [...]string{
		"input power",
		"input watts",
		"_pin",
		" pin",
		"power_in",
	} {
		if strings.Contains(name, keyword) {
			return true
		}
	}
	return false
}
// isPSUOutputPower reports whether name identifies a PSU DC-output power
// sensor. Matching is by plain substring against lower-case keywords, so the
// caller is expected to pass an already lower-cased name.
//
// Vendor naming covered:
//
//	MSI:     PSU1_POWER_OUT
//	MLT:     PSU1_POUT
//	xFusion: PS1 POut
//
// Names containing "power supply bay" or "psu bay" are also classified as
// output power.
func isPSUOutputPower(name string) bool {
	outputKeywords := [...]string{
		"output power",
		"output watts",
		"_pout",
		" pout",
		"_power_out",
		"power_out",
		"power supply bay",
		"psu bay",
	}
	for i := range outputKeywords {
		if strings.Contains(name, outputKeywords[i]) {
			return true
		}
	}
	return false
}
// parseBoundedFloat parses a numeric value from an SDR value field with
// parseFloatPtr and accepts it only when it lies in the half-open range
// (0, max].
//
// It returns nil when parseFloatPtr returns nil, and for zero, negative,
// above-max and NaN values — all of which are treated as a missing, switched
// off or faulty sensor reading. On success the pointer returned by
// parseFloatPtr is passed through unchanged.
func parseBoundedFloat(raw string, max float64) *float64 {
	v := parseFloatPtr(raw)
	if v == nil {
		return nil
	}
	// Positive in-range test on purpose: every comparison with NaN is false,
	// so a rejection test of the form "v <= 0 || v > max" would let NaN
	// through as a valid reading.
	if *v > 0 && *v <= max {
		return v
	}
	return nil
}
func parsePSUSDR(raw string) map[int]psuSDR {
@@ -194,24 +237,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {
lowerName := strings.ToLower(name)
switch {
case strings.Contains(lowerName, "input power"):
entry.inputPowerW = parseFloatPtr(value)
case strings.Contains(lowerName, "output power"):
entry.outputPowerW = parseFloatPtr(value)
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
entry.outputPowerW = parseFloatPtr(value)
case isPSUInputPower(lowerName):
entry.inputPowerW = parseBoundedFloat(value, 6000)
case isPSUOutputPower(lowerName):
entry.outputPowerW = parseBoundedFloat(value, 6000)
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
entry.inputVoltage = parseFloatPtr(value)
case strings.Contains(lowerName, "temp"):
entry.temperatureC = parseFloatPtr(value)
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
entry.healthPct = parsePercentPtr(value)
default:
// Generic PSU power reading: sensor matched a slot pattern but carries
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
// AC input if the value looks like wattage and no better data is set yet.
if entry.inputPowerW == nil {
entry.inputPowerW = parseBoundedFloat(value, 6000)
}
}
out[slot] = entry
}
return out
}
// PSUSlotPower holds SDR power readings for one PSU slot.
// Slot key used by PSUSlotsFromSDR is the 0-based index string,
// matching HardwarePowerSupply.Slot in the audit schema.
//
// The pointer fields are nil when the parsed SDR entry for the slot carries no
// value for that direction; PSUSlotsFromSDR copies them from the entry as-is.
type PSUSlotPower struct {
// InputW is the slot's input power reading taken from the SDR entry.
InputW *float64 `json:"input_w,omitempty"`
// OutputW is the slot's output power reading taken from the SDR entry.
OutputW *float64 `json:"output_w,omitempty"`
// Status is the status string of the parsed SDR entry, copied verbatim.
Status string `json:"status,omitempty"`
}
// PSUSlotsFromSDR converts raw `ipmitool sdr` output into per-slot PSU power
// readings. Parsing is delegated to parsePSUSDR, so slot detection is shared
// with the hardware audit collector (MSI PSU1_POWER_IN, xFusion Power1 /
// PS1 POut, MLT PSU1_PIN).
//
// The slot number reported by parsePSUSDR is decremented by one and formatted
// as a decimal string to form the map key, giving the 0-based keys used by
// HardwarePowerSupply.Slot. The result is nil when no PSU entry was parsed.
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
	parsed := parsePSUSDR(sdrOutput)
	count := len(parsed)
	if count == 0 {
		return nil
	}

	bySlot := make(map[string]PSUSlotPower, count)
	for number, reading := range parsed {
		var power PSUSlotPower
		power.InputW = reading.inputPowerW
		power.OutputW = reading.outputPowerW
		power.Status = reading.status
		// The audit schema numbers slots from zero.
		bySlot[strconv.Itoa(number-1)] = power
	}
	return bySlot
}
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
if len(sdr) == 0 {
return nil

View File

@@ -1,6 +1,7 @@
package platform
import (
"bee/audit/internal/collector"
"context"
"encoding/csv"
"encoding/json"
@@ -2025,11 +2026,17 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
}
}
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
if sp.ReportingRatio < 0.75 {
dcmiPartial := detectDCMIPartialCoverage(sp)
if sp.ReportingRatio < 0.75 && !dcmiPartial {
findings = append(findings, fmt.Sprintf(
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
))
} else if sp.ReportingRatio < 0.75 && dcmiPartial {
findings = append(findings, fmt.Sprintf(
"IPMI DCMI covers partial PSU set (DCMI/SDR coverage %.0f%%): ratio %.2f reflects DCMI under-reporting, not GPU inaccuracy. GPU telemetry is the reliable power source; use SDR-based ratio for server-side accuracy.",
sp.DCMICoverageRatio*100, sp.ReportingRatio,
))
} else if sp.ReportingRatio > 1.25 {
findings = append(findings, fmt.Sprintf(
"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
@@ -2216,6 +2223,66 @@ func maxInt(a, b int) int {
return b
}
// detectDCMIPartialCoverage reports whether the IPMI DCMI reading accounts for
// less than 70% of the PSU AC input summed from SDR sensors, i.e. whether DCMI
// appears to see only part of the installed PSUs.
//
// The idle pair (IdleW vs PSUInputIdleW) is preferred: with no GPU load both
// sources describe the same server state, so a ratio of e.g. 0.50 reads as
// "DCMI sees 2 of 4 PSUs". Only when either idle value is not positive does
// the function fall back to the loaded pair (LoadedW vs PSUInputLoadedW),
// which is less precise because load may be spread unevenly across PSUs.
//
// It returns false for a nil or unavailable sp, and when neither pair has two
// positive values (no SDR PSU input sensors); callers use
// detectIPMISaturationFallback for that situation.
func detectDCMIPartialCoverage(sp *BenchmarkServerPower) bool {
	// Coverage below this fraction is treated as partial.
	const minCoverage = 0.7

	if sp == nil || !sp.Available {
		return false
	}
	switch {
	case sp.PSUInputIdleW > 0 && sp.IdleW > 0:
		return sp.IdleW/sp.PSUInputIdleW < minCoverage
	case sp.PSUInputLoadedW > 0 && sp.LoadedW > 0:
		return sp.LoadedW/sp.PSUInputLoadedW < minCoverage
	default:
		return false
	}
}
// detectIPMISaturationFallback is the ramp-based heuristic for servers that
// expose no SDR PSU sensors; detectDCMIPartialCoverage is preferred whenever
// SDR data exists.
//
// For each pair of consecutive steps whose ServerDeltaW values are both
// positive and whose TotalObservedPowerW grew, it computes the incremental
// ratio (server delta increase) / (GPU power increase). The first such ratio
// is the reference. The function returns true when at least two of the last
// three ratios are below 25% of the reference, i.e. the server-side reading
// has flattened while GPU power kept rising.
//
// It returns false when fewer than three usable ratios exist or when the
// reference ratio is not positive.
func detectIPMISaturationFallback(steps []NvidiaPowerBenchStep) bool {
	var ratios []float64
	for i := 1; i < len(steps); i++ {
		prev, cur := &steps[i-1], &steps[i]
		if cur.ServerDeltaW <= 0 || prev.ServerDeltaW <= 0 {
			continue
		}
		gpuGrowth := cur.TotalObservedPowerW - prev.TotalObservedPowerW
		if gpuGrowth <= 0 {
			continue
		}
		serverGrowth := cur.ServerDeltaW - prev.ServerDeltaW
		ratios = append(ratios, serverGrowth/gpuGrowth)
	}
	if len(ratios) < 3 {
		return false
	}

	reference := ratios[0]
	if reference <= 0 {
		return false
	}

	flat := 0
	for _, r := range ratios[len(ratios)-3:] {
		if r < reference*0.25 {
			flat++
		}
	}
	return flat >= 2
}
// psuStatusSnapshot samples PSU health sensor states via
// `ipmitool sdr type "Power Supply"`. Returns a map of sensor name → reading
// string (e.g. "Presence detected", "Failure detected"). Returns nil when IPMI
@@ -2276,6 +2343,148 @@ func diffPSUStatus(before, after map[string]string) []string {
return issues
}
// sdrPowerSnapshot holds per-source power sums from a single `ipmitool sdr` read.
// It is the return type of sampleIPMISDRPowerSensors, which returns the zero
// value when the ipmitool call fails or produces no output.
type sdrPowerSnapshot struct {
PSUInW float64 // sum of PSU AC input (W) over the slot readings that passed filterSensorGroup
PSUOutW float64 // sum of PSU DC output (W) over the slot readings that passed filterSensorGroup
GPUSlotW float64 // sum of GPU slot/GPU power sensors (W) that passed filterSensorGroup
// Per-slot PSU data from collector.PSUSlotsFromSDR — same slot keys as
// audit HardwarePowerSupply.Slot (0-based strings).
// Every slot returned by the collector is stored here, including slots whose
// reading was subsequently rejected by the cross-slot outlier filter.
PSUSlots map[string]BenchmarkPSUSlotPower
SkippedSensors []string // human-readable descriptions of readings rejected during self-healing
}
// sdrSensor is a name+watts pair fed to filterSensorGroup for self-healing
// filtering. It is used for PSU input, PSU output and GPU slot sensor groups.
type sdrSensor struct {
name string // label used in skip messages: the SDR sensor name, or a synthetic "PSU-slot-<key>" label
watts float64 // power reading in watts
}
// filterSensorGroup removes physically implausible readings from one group of
// like sensors and describes each reading it removed.
//
// Two checks are applied in order:
//
//  1. Hard bounds: a reading must satisfy 0 < watts ≤ maxPerSensorW. NaN
//     readings fail this check as well.
//  2. Outlier rejection: when two or more readings survive step 1, any reading
//     greater than 5× the median of the survivors is dropped as a stuck or
//     faulty sensor. With exactly two survivors this can never trigger, because
//     neither value can exceed 5× their mean.
//
// valid holds the surviving sensors in input order. skipped holds one
// human-readable message per rejected sensor: bounds rejections first, then
// outlier rejections, each in input order.
func filterSensorGroup(sensors []sdrSensor, maxPerSensorW float64) (valid []sdrSensor, skipped []string) {
	var inBounds []sdrSensor
	for _, s := range sensors {
		// Positive in-range test, negated: comparisons with NaN are always
		// false, so this form rejects NaN whereas "w <= 0 || w > max" would not.
		if !(s.watts > 0 && s.watts <= maxPerSensorW) {
			skipped = append(skipped, fmt.Sprintf("%s (%.0f W: out of range 0–%.0f W)", s.name, s.watts, maxPerSensorW))
			continue
		}
		inBounds = append(inBounds, s)
	}
	// A median of fewer than two readings carries no information.
	if len(inBounds) < 2 {
		return inBounds, skipped
	}

	vals := make([]float64, len(inBounds))
	for i, s := range inBounds {
		vals[i] = s.watts
	}
	sort.Float64s(vals)
	mid := len(vals) / 2
	median := vals[mid]
	if len(vals)%2 == 0 {
		median = (vals[mid-1] + vals[mid]) / 2
	}

	for _, s := range inBounds {
		if median > 0 && s.watts > median*5 {
			skipped = append(skipped, fmt.Sprintf("%s (%.0f W: >5× median %.0f W, likely sensor fault)", s.name, s.watts, median))
			continue
		}
		valid = append(valid, s)
	}
	return valid, skipped
}
// sampleIPMISDRPowerSensors runs `ipmitool sdr` once and returns self-healed,
// grouped power sums together with the per-slot PSU readings.
//
// PSU identification is delegated to collector.PSUSlotsFromSDR, which shares
// its slot detection with the hardware audit (PSU1_POWER_IN, PSU1_PIN,
// PS1 POut, Power1…). GPU slot sensors (GPU_POWER_SLOTx, GPU1 Power, …) are
// classified here because the audit collector does not track GPU PCIe slot
// power.
//
// Self-healing is done by filterSensorGroup: hard bounds per sensor plus
// rejection of readings above 5× the group median. Rejected readings are
// listed in SkippedSensors and excluded from the sums; PSUSlots still contains
// every slot the collector returned.
//
// The zero snapshot is returned when ipmitool fails or prints nothing.
func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
	raw, err := exec.Command("ipmitool", "sdr").Output()
	if err != nil || len(raw) == 0 {
		return sdrPowerSnapshot{}
	}
	sdrStr := string(raw)
	var snap sdrPowerSnapshot

	// ── PSU data via audit collector ─────────────────────────────────────────
	// collector.PSUSlotsFromSDR handles all vendor naming variants and applies
	// bounds checking inside parseBoundedFloat (0 < w ≤ 6000 W).
	collectorSlots := collector.PSUSlotsFromSDR(sdrStr)

	// Visit slots in sorted key order. Ranging over the map directly would make
	// the order of SkippedSensors (and therefore the note written into the
	// report) differ from run to run.
	slotKeys := make([]string, 0, len(collectorSlots))
	for slotKey := range collectorSlots {
		slotKeys = append(slotKeys, slotKey)
	}
	sort.Strings(slotKeys)

	// Convert to benchmark type and collect inputs for cross-slot filtering.
	var psuInSensors, psuOutSensors []sdrSensor
	for _, slotKey := range slotKeys {
		sp := collectorSlots[slotKey]
		if sp.InputW != nil {
			psuInSensors = append(psuInSensors, sdrSensor{name: "PSU-slot-" + slotKey, watts: *sp.InputW})
		}
		if sp.OutputW != nil {
			psuOutSensors = append(psuOutSensors, sdrSensor{name: "PSU-slot-" + slotKey + "-out", watts: *sp.OutputW})
		}
		if snap.PSUSlots == nil {
			snap.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(slotKeys))
		}
		snap.PSUSlots[slotKey] = BenchmarkPSUSlotPower{
			InputW:  sp.InputW,
			OutputW: sp.OutputW,
			Status:  sp.Status,
		}
	}

	// Apply cross-slot outlier filter and sum.
	validIn, skIn := filterSensorGroup(psuInSensors, 6000)
	for _, s := range validIn {
		snap.PSUInW += s.watts
	}
	snap.SkippedSensors = append(snap.SkippedSensors, skIn...)

	validOut, skOut := filterSensorGroup(psuOutSensors, 6000)
	for _, s := range validOut {
		snap.PSUOutW += s.watts
	}
	snap.SkippedSensors = append(snap.SkippedSensors, skOut...)

	// ── GPU slot sensors ─────────────────────────────────────────────────────
	// collector does not track GPU PCIe slot power; classify here.
	// Matches: GPU_POWER_SLOTx (MSI), GPU1 Power (xFusion), GPU_PWR_x (generic).
	var gpuSensors []sdrSensor
	for _, line := range strings.Split(sdrStr, "\n") {
		parts := strings.Split(line, "|")
		if len(parts) < 2 {
			continue
		}
		name := strings.TrimSpace(parts[0])
		nameLower := strings.ToLower(name)
		if !strings.Contains(nameLower, "gpu") {
			continue
		}
		if !strings.Contains(nameLower, "slot") && !strings.Contains(nameLower, "power") &&
			!strings.Contains(nameLower, "pwr") {
			continue
		}
		// Only readings expressed in watts are accepted; anything else (for
		// example a temperature on a GPU slot sensor) fails the scan.
		var w float64
		if n, _ := fmt.Sscanf(strings.TrimSpace(parts[1]), "%f Watts", &w); n != 1 {
			continue
		}
		gpuSensors = append(gpuSensors, sdrSensor{name: name, watts: w})
	}
	validGPU, skGPU := filterSensorGroup(gpuSensors, 2000)
	for _, s := range validGPU {
		snap.GPUSlotW += s.watts
	}
	snap.SkippedSensors = append(snap.SkippedSensors, skGPU...)

	return snap
}
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) {
@@ -3255,23 +3464,51 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW)
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio)
fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW)
if sp.PSUInputLoadedW > 0 {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta)
}
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio)
}
b.WriteString("\n")
// Server power comparison table.
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n")
b.WriteString("| Metric | Value |\n")
b.WriteString("|--------|-------|\n")
fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
b.WriteString("| Metric | Source | Value |\n")
b.WriteString("|--------|--------|-------|\n")
fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW)
fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW)
if sp.GPUSlotTotalW > 0 {
fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW)
}
if sp.Available {
fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW)
fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW)
fmt.Fprintf(&b, "| Server Δ power (loaded idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW)
}
if sp.PSUInputLoadedW > 0 {
fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW)
fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW)
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
fmt.Fprintf(&b, "| PSU AC input Δ (loaded idle) | IPMI SDR | %.0f W |\n", psuDelta)
}
if sp.PSUOutputLoadedW > 0 {
fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW)
fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW)
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff)
}
}
if sp.Available {
fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
fmt.Fprintf(&b, "| Server Δ power (loaded idle) | %.0f W |\n", sp.DeltaW)
ratio := sp.ReportingRatio
dcmiPartial := detectDCMIPartialCoverage(sp) ||
(sp.PSUInputIdleW == 0 && detectIPMISaturationFallback(result.RampSteps))
ratioNote := ""
switch {
case dcmiPartial:
ratioNote = "⚠ IPMI DCMI covers partial PSU set; use SDR ratio below for accuracy assessment"
case ratio >= 0.9:
ratioNote = "✓ GPU telemetry matches server power"
case ratio >= 0.75:
@@ -3279,14 +3516,83 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default:
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
}
fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote)
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
sdrRatio := psuDelta / sp.GPUReportedSumW
sdrNote := ""
switch {
case sdrRatio >= 0.9:
sdrNote = "✓ GPU telemetry matches wall power"
case sdrRatio >= 0.75:
sdrNote = "⚠ minor discrepancy"
default:
sdrNote = "✗ significant discrepancy"
}
fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote)
}
} else {
b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n")
}
for _, note := range sp.Notes {
fmt.Fprintf(&b, "\n> %s\n", note)
}
b.WriteString("\n")
if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 {
b.WriteString("## PSU Load Distribution\n\n")
b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n")
b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n")
// collect all slot keys
slotSet := map[string]struct{}{}
for k := range sp.PSUSlotReadingsIdle {
slotSet[k] = struct{}{}
}
for k := range sp.PSUSlotReadingsLoaded {
slotSet[k] = struct{}{}
}
slots := make([]string, 0, len(slotSet))
for k := range slotSet {
slots = append(slots, k)
}
sort.Strings(slots)
for _, slot := range slots {
idle := sp.PSUSlotReadingsIdle[slot]
loaded := sp.PSUSlotReadingsLoaded[slot]
fmtW := func(v *float64) string {
if v == nil {
return "—"
}
return fmt.Sprintf("%.0f W", *v)
}
var deltaStr string
if idle.InputW != nil && loaded.InputW != nil {
deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW)
} else {
deltaStr = "—"
}
status := loaded.Status
if status == "" {
status = idle.Status
}
if status == "" {
status = "—"
}
fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n",
slot,
fmtW(idle.InputW), fmtW(loaded.InputW),
fmtW(idle.OutputW), fmtW(loaded.OutputW),
deltaStr, status,
)
}
b.WriteString("\n")
}
}
if len(result.Findings) > 0 {
@@ -3318,6 +3624,130 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
b.WriteString("\n")
}
// ── PSU Issues ────────────────────────────────────────────────────────────
if len(result.PSUIssues) > 0 {
b.WriteString("## PSU Issues\n\n")
b.WriteString("The following power supply anomalies were detected during the test:\n\n")
for _, issue := range result.PSUIssues {
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
}
b.WriteString("\n")
}
// ── Power Distribution Summary ────────────────────────────────────────────
b.WriteString("## Power Distribution Summary\n\n")
{
var totalDefault, totalStable float64
for _, gpu := range result.GPUs {
stable := gpu.StablePowerLimitW
if stable <= 0 {
stable = gpu.AppliedPowerLimitW
}
totalDefault += gpu.DefaultPowerLimitW
totalStable += stable
}
b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n")
b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n")
for _, gpu := range result.GPUs {
stable := gpu.StablePowerLimitW
if stable <= 0 {
stable = gpu.AppliedPowerLimitW
}
realization := "-"
if gpu.DefaultPowerLimitW > 0 && stable > 0 {
realization = fmt.Sprintf("%.1f%%", stable/gpu.DefaultPowerLimitW*100)
}
derated := "-"
if gpu.Derated {
derated = "⚠ yes"
}
fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n",
gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated)
}
platformReal := "-"
if totalDefault > 0 && totalStable > 0 {
platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100)
}
fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n",
totalDefault, totalStable, platformReal)
// Balance across GPUs — only meaningful with 2+ GPUs.
if len(result.GPUs) > 1 {
var minS, maxS, sumS float64
var cnt int
for _, gpu := range result.GPUs {
s := gpu.StablePowerLimitW
if s <= 0 {
s = gpu.AppliedPowerLimitW
}
if s <= 0 {
continue
}
sumS += s
cnt++
if cnt == 1 || s < minS {
minS = s
}
if s > maxS {
maxS = s
}
}
if cnt > 0 {
avg := sumS / float64(cnt)
spread := (maxS - minS) / avg * 100
balanceNote := "✓ balanced"
switch {
case spread > 20:
balanceNote = "⚠ significant imbalance — check slot thermals"
case spread > 10:
balanceNote = "— minor imbalance"
}
fmt.Fprintf(&b, "**GPU power balance:** avg %.0f W · min %.0f W · max %.0f W · spread %.1f%% — %s\n\n",
avg, minS, maxS, spread, balanceNote)
}
}
// Ramp scalability table — power efficiency of adding each GPU.
if len(result.RampSteps) > 1 {
b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n")
b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n")
b.WriteString("|------|------|-----------------------|-------------|---------------------|\n")
// First GPU stable TDP as the reference unit for efficiency.
var firstStable float64
if len(result.GPUs) > 0 {
firstStable = result.GPUs[0].StablePowerLimitW
if firstStable <= 0 {
firstStable = result.GPUs[0].AppliedPowerLimitW
}
}
var prevCumulative float64
for _, step := range result.RampSteps {
var cumulative float64
for _, gpuIdx := range step.GPUIndices {
for _, g := range result.GPUs {
if g.Index != gpuIdx {
continue
}
s := g.StablePowerLimitW
if s <= 0 {
s = g.AppliedPowerLimitW
}
cumulative += s
}
}
incremental := cumulative - prevCumulative
efficiency := "—"
if step.StepIndex > 1 && firstStable > 0 {
efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100)
}
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n",
step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency)
prevCumulative = cumulative
}
b.WriteString("\n")
}
}
b.WriteString("## Per-Slot Results\n\n")
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
@@ -3440,6 +3870,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
sdrIdle := sampleIPMISDRPowerSensors()
psuBefore := psuStatusSnapshot()
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
@@ -3753,11 +4184,52 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
// Characterize server power from IPMI idle/loaded samples.
// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
// ReportingRatio = IPMI_delta / GPU_reported_sum:
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
// gpuActualSumW = sum of p95 GPU power from the last ramp step — actual
// measured consumption, not the stable limit cap. This is the correct
// denominator for the reporting ratio: limit caps (PlatformMaxTDPW) inflate
// the denominator and make the ratio appear artificially low.
var gpuActualSumW float64
if n := len(result.RampSteps); n > 0 {
gpuActualSumW = result.RampSteps[n-1].TotalObservedPowerW
}
if gpuActualSumW <= 0 {
gpuActualSumW = result.PlatformMaxTDPW
}
_ = serverIdleOK // used implicitly via characterizeServerPower
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK)
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
if result.ServerPower != nil {
sdrLoaded := sampleIPMISDRPowerSensors()
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
result.ServerPower.PSUOutputIdleW = sdrIdle.PSUOutW
result.ServerPower.PSUOutputLoadedW = sdrLoaded.PSUOutW
result.ServerPower.GPUSlotTotalW = sdrLoaded.GPUSlotW
if len(sdrIdle.PSUSlots) > 0 {
result.ServerPower.PSUSlotReadingsIdle = sdrIdle.PSUSlots
}
if len(sdrLoaded.PSUSlots) > 0 {
result.ServerPower.PSUSlotReadingsLoaded = sdrLoaded.PSUSlots
}
if sdrIdle.PSUInW > 0 && result.ServerPower.IdleW > 0 {
result.ServerPower.DCMICoverageRatio = result.ServerPower.IdleW / sdrIdle.PSUInW
}
if len(sdrLoaded.SkippedSensors) > 0 {
result.ServerPower.Notes = append(result.ServerPower.Notes,
"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
}
// Detect DCMI partial coverage: direct SDR comparison first,
// ramp heuristic as fallback when SDR PSU sensors are absent.
dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||
(sdrIdle.PSUInW == 0 && detectIPMISaturationFallback(result.RampSteps))
if dcmiUnreliable {
result.ServerPower.Notes = append(result.ServerPower.Notes,
fmt.Sprintf("IPMI DCMI covers only a subset of installed PSUs (coverage %.0f%%). "+
"Use SDR PSU Δ ratio for GPU accuracy assessment; DCMI ratio is not reliable.",
result.ServerPower.DCMICoverageRatio*100))
}
}
result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot())
// Write top-level gpu-metrics.csv/.html aggregating all phases.
writeBenchmarkMetricsFiles(runDir, allPowerRows)

View File

@@ -275,18 +275,55 @@ type BenchmarkScorecard struct {
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
}
// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
// over-reporting its power consumption.
// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
// so benchmark and audit data can be correlated by slot.
//
// Pointer fields are nil when no reading was available for that direction, in
// which case the field is omitted from the JSON output.
type BenchmarkPSUSlotPower struct {
InputW *float64 `json:"input_w,omitempty"` // AC wall input in watts (PSUx_POWER_IN)
OutputW *float64 `json:"output_w,omitempty"` // DC output in watts (PSUx_POWER_OUT)
// Status is the PSU status string copied from the collector's per-slot data.
Status string `json:"status,omitempty"`
}
// BenchmarkServerPower captures server-side power from multiple independent
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
// covers only a subset of installed PSUs (partial coverage).
//
// Source legend:
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
//
// All power values are in watts.
type BenchmarkServerPower struct {
Available bool `json:"available"`
IdleW float64 `json:"idle_w,omitempty"`
LoadedW float64 `json:"loaded_w,omitempty"`
DeltaW float64 `json:"delta_w,omitempty"`
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
Notes []string `json:"notes,omitempty"`
Available bool `json:"available"`
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
DeltaW float64 `json:"delta_w,omitempty"` // DCMI delta: loaded minus idle
// NOTE(review): GPUReportedSumW and ReportingRatio are filled in by
// characterizeServerPower, which is not visible here; the report labels the
// ratio "DCMI Δ / GPU actual" — confirm the exact definition there.
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
// PSU AC input sum — sampled at idle and at peak load using collector's
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
// PSU DC output sum — power delivered to server internals after conversion.
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
// Per-slot PSU readings at idle and at peak load.
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
// PCIe slot delivery only (excludes 16-pin connector power).
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
// Left at zero (and omitted from JSON) when either idle value is not positive.
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
// Notes collects free-form remarks that the report prints as block quotes.
Notes []string `json:"notes,omitempty"`
}
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected