Add multi-source PSU power triangulation and per-slot distribution table
- collector/psu.go: export PSUSlotsFromSDR() reusing slot regex patterns; add isPSUInputPower/isPSUOutputPower helpers covering MSI/MLT/xFusion/HPE naming; add xFusion Power<N> slot pattern; parseBoundedFloat for self-healing (rejects zero/negative/out-of-range sensor readings); default fallback treats unclassified PSU sensors as AC input - benchmark_types.go: BenchmarkPSUSlotPower struct; BenchmarkServerPower gains PSUInputIdle/Loaded, PSUOutputIdle/Loaded, PSUSlotReadingsIdle/Loaded, GPUSlotTotalW, DCMICoverageRatio fields - benchmark.go: sampleIPMISDRPowerSensors uses collector.PSUSlotsFromSDR instead of custom classifier; detectDCMIPartialCoverage replaces ramp heuristic — compares DCMI idle vs SDR PSU sum, flags <0.70 ratio as partial coverage; detectIPMISaturationFallback kept for servers without SDR PSU sensors; report gains PSU Load Distribution table (per-slot AC/DC idle vs loaded, Δ) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -160,11 +160,54 @@ type psuSDR struct {
|
||||
}
|
||||
|
||||
// psuSlotPatterns extracts the 1-based PSU slot number (capture group 1)
// from vendor-specific sensor names. Patterns are tried in order, so the
// generic Power<N> fallback must stay last: otherwise it would claim names
// like "Power Supply 1" before the dedicated pattern sees them.
// Duplicated entries from the previous revision have been removed — each
// pattern appears exactly once.
var psuSlotPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),                      // PSU1, PS1, ps 2
	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),                        // PS 6, PS6
	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),                       // PWS1
	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),                       // Bay 1
	// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
	// Must be last: "power supply N" is already caught by the pattern above.
	regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
}
|
||||
|
||||
// isPSUInputPower reports whether a sensor name denotes a PSU AC-input
// power reading. The caller must pass a strings.ToLower'd name (callers in
// parsePSUSDR do). Vendor naming covered:
//
//	MSI:     PSU1_POWER_IN, PSU1_PIN
//	MLT:     PSU1_PIN
//	HPE:     PS1 Input Power, PS1 Input Watts
//	xFusion: no explicit keyword — handled by the caller's default fallback
func isPSUInputPower(name string) bool {
	// Note: "power_in" also matches "_power_in", so the previous separate
	// "_power_in" check was redundant and has been folded away.
	return strings.Contains(name, "input power") ||
		strings.Contains(name, "input watts") ||
		strings.Contains(name, "_pin") ||
		strings.Contains(name, " pin") ||
		strings.Contains(name, "power_in")
}
|
||||
|
||||
// isPSUOutputPower reports whether a sensor name denotes a PSU DC-output
// power reading. The caller must pass a strings.ToLower'd name. Vendor
// naming covered:
//
//	MSI:     PSU1_POWER_OUT
//	MLT:     PSU1_POUT
//	xFusion: PS1 POut
//
// "power supply bay" / "psu bay" wattage sensors are classified as DC
// output, preserving the pre-refactor classifier's behavior for those names.
func isPSUOutputPower(name string) bool {
	// Note: "power_out" also matches "_power_out", so the previous separate
	// "_power_out" check was redundant and has been folded away.
	return strings.Contains(name, "output power") ||
		strings.Contains(name, "output watts") ||
		strings.Contains(name, "_pout") ||
		strings.Contains(name, " pout") ||
		strings.Contains(name, "power_out") ||
		strings.Contains(name, "power supply bay") ||
		strings.Contains(name, "psu bay")
}
|
||||
|
||||
// parseBoundedFloat parses a numeric value from an SDR value field and
|
||||
// validates it is within (0, max]. Returns nil for zero, negative, or
|
||||
// out-of-range values — these indicate missing/off/fault sensor readings.
|
||||
func parseBoundedFloat(raw string, max float64) *float64 {
|
||||
v := parseFloatPtr(raw)
|
||||
if v == nil || *v <= 0 || *v > max {
|
||||
return nil
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
@@ -194,24 +237,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
|
||||
lowerName := strings.ToLower(name)
|
||||
switch {
|
||||
case strings.Contains(lowerName, "input power"):
|
||||
entry.inputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "output power"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case isPSUInputPower(lowerName):
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
case isPSUOutputPower(lowerName):
|
||||
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||
entry.inputVoltage = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "temp"):
|
||||
entry.temperatureC = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||
entry.healthPct = parsePercentPtr(value)
|
||||
default:
|
||||
// Generic PSU power reading: sensor matched a slot pattern but carries
|
||||
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
||||
// AC input if the value looks like wattage and no better data is set yet.
|
||||
if entry.inputPowerW == nil {
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
}
|
||||
}
|
||||
out[slot] = entry
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// PSUSlotPower holds SDR power readings for one PSU slot.
// Slot key used by PSUSlotsFromSDR is the 0-based index string,
// matching HardwarePowerSupply.Slot in the audit schema.
type PSUSlotPower struct {
	// InputW is the AC input wattage; nil when the slot exposed no usable
	// input-power sensor (missing, off, or rejected by bounds checks).
	InputW *float64 `json:"input_w,omitempty"`
	// OutputW is the DC output wattage; nil when unavailable.
	OutputW *float64 `json:"output_w,omitempty"`
	// Status is the status text copied from the parsed SDR entry, if any.
	Status string `json:"status,omitempty"`
}
|
||||
|
||||
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
||||
// using the same battle-tested slot patterns as the hardware audit collector.
|
||||
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
||||
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
||||
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
||||
sdr := parsePSUSDR(sdrOutput)
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]PSUSlotPower, len(sdr))
|
||||
for slot, entry := range sdr {
|
||||
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
||||
out[key] = PSUSlotPower{
|
||||
InputW: entry.inputPowerW,
|
||||
OutputW: entry.outputPowerW,
|
||||
Status: entry.status,
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bee/audit/internal/collector"
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"encoding/json"
|
||||
@@ -2025,11 +2026,17 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||
}
|
||||
}
|
||||
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||||
if sp.ReportingRatio < 0.75 {
|
||||
dcmiPartial := detectDCMIPartialCoverage(sp)
|
||||
if sp.ReportingRatio < 0.75 && !dcmiPartial {
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
|
||||
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
|
||||
))
|
||||
} else if sp.ReportingRatio < 0.75 && dcmiPartial {
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"IPMI DCMI covers partial PSU set (DCMI/SDR coverage %.0f%%): ratio %.2f reflects DCMI under-reporting, not GPU inaccuracy. GPU telemetry is the reliable power source; use SDR-based ratio for server-side accuracy.",
|
||||
sp.DCMICoverageRatio*100, sp.ReportingRatio,
|
||||
))
|
||||
} else if sp.ReportingRatio > 1.25 {
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
|
||||
@@ -2216,6 +2223,66 @@ func maxInt(a, b int) int {
|
||||
return b
|
||||
}
|
||||
|
||||
// detectDCMIPartialCoverage returns true when IPMI DCMI under-reports actual
|
||||
// server power by comparing DCMI readings against SDR PSUx_POWER_IN sensor sums.
|
||||
//
|
||||
// Primary check: DCMI_idle / SDR_PSU_IN_idle — most reliable because GPU load
|
||||
// is zero, so both sources measure the same server state. A ratio below 0.7
|
||||
// means DCMI misses ≥30% of installed PSUs (e.g. 0.50 = sees 2 of 4 PSUs).
|
||||
//
|
||||
// Fallback: DCMI_loaded / SDR_PSU_IN_loaded — less precise (GPU load may
|
||||
// affect different PSUs differently) but still useful when idle SDR is absent.
|
||||
//
|
||||
// Returns false when SDR data is unavailable (server has no PSUx_POWER_IN
|
||||
// sensors); the heuristic detectIPMISaturationFallback is used in that case.
|
||||
func detectDCMIPartialCoverage(sp *BenchmarkServerPower) bool {
|
||||
if sp == nil || !sp.Available {
|
||||
return false
|
||||
}
|
||||
if sp.PSUInputIdleW > 0 && sp.IdleW > 0 {
|
||||
return sp.IdleW/sp.PSUInputIdleW < 0.7
|
||||
}
|
||||
if sp.PSUInputLoadedW > 0 && sp.LoadedW > 0 {
|
||||
return sp.LoadedW/sp.PSUInputLoadedW < 0.7
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// detectIPMISaturationFallback is the heuristic used when SDR PSU sensors are
|
||||
// absent. It analyses the power ramp: if 2+ of the last 3 per-step incremental
|
||||
// DCMI/GPU ratios fall below 25% of the first-step ratio, DCMI has likely
|
||||
// plateaued while GPU load continued to grow (saturation proxy).
|
||||
// Prefer detectDCMIPartialCoverage when SDR data is available.
|
||||
func detectIPMISaturationFallback(steps []NvidiaPowerBenchStep) bool {
|
||||
type pt struct{ incIPMI, incGPU float64 }
|
||||
var pts []pt
|
||||
for i := 1; i < len(steps); i++ {
|
||||
if steps[i].ServerDeltaW <= 0 || steps[i-1].ServerDeltaW <= 0 {
|
||||
continue
|
||||
}
|
||||
incIPMI := steps[i].ServerDeltaW - steps[i-1].ServerDeltaW
|
||||
incGPU := steps[i].TotalObservedPowerW - steps[i-1].TotalObservedPowerW
|
||||
if incGPU <= 0 {
|
||||
continue
|
||||
}
|
||||
pts = append(pts, pt{incIPMI, incGPU})
|
||||
}
|
||||
if len(pts) < 3 {
|
||||
return false
|
||||
}
|
||||
refRatio := pts[0].incIPMI / pts[0].incGPU
|
||||
if refRatio <= 0 {
|
||||
return false
|
||||
}
|
||||
saturated := 0
|
||||
for _, p := range pts[len(pts)-3:] {
|
||||
if p.incIPMI/p.incGPU < refRatio*0.25 {
|
||||
saturated++
|
||||
}
|
||||
}
|
||||
return saturated >= 2
|
||||
}
|
||||
|
||||
// psuStatusSnapshot samples PSU health sensor states via
|
||||
// `ipmitool sdr type "Power Supply"`. Returns a map of sensor name → reading
|
||||
// string (e.g. "Presence detected", "Failure detected"). Returns nil when IPMI
|
||||
@@ -2276,6 +2343,148 @@ func diffPSUStatus(before, after map[string]string) []string {
|
||||
return issues
|
||||
}
|
||||
|
||||
// sdrPowerSnapshot holds per-source power sums from a single `ipmitool sdr` read.
// The zero value means "no SDR data available".
type sdrPowerSnapshot struct {
	PSUInW  float64 // sum of PSU AC input across all slots
	PSUOutW float64 // sum of PSU DC output across all slots
	GPUSlotW float64 // sum of GPU slot/GPU power sensors

	// Per-slot PSU data from collector.PSUSlotsFromSDR — same slot keys as
	// audit HardwarePowerSupply.Slot (0-based strings).
	PSUSlots map[string]BenchmarkPSUSlotPower

	// SkippedSensors lists sensors rejected during self-healing
	// (out-of-bounds or >5× median outliers), for surfacing in report notes.
	SkippedSensors []string
}
|
||||
|
||||
// sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
type sdrSensor struct {
	name  string
	watts float64
}

// filterSensorGroup removes physically implausible readings from a group.
// Hard bounds: 0 < watts ≤ maxPerSensorW. Within groups of 2+ sensors,
// values more than 5× the group median are rejected as stuck/fault sensors.
// Rejected sensors are returned as human-readable strings in skipped.
func filterSensorGroup(sensors []sdrSensor, maxPerSensorW float64) (valid []sdrSensor, skipped []string) {
	// Pass 1: hard bounds — drop zero/negative and absurdly large readings.
	inRange := make([]sdrSensor, 0, len(sensors))
	for _, sensor := range sensors {
		if sensor.watts <= 0 || sensor.watts > maxPerSensorW {
			skipped = append(skipped, fmt.Sprintf("%s (%.0f W: out of range 0–%.0f W)", sensor.name, sensor.watts, maxPerSensorW))
			continue
		}
		inRange = append(inRange, sensor)
	}
	// A lone reading has no peers to compare against — return it as-is.
	if len(inRange) < 2 {
		return inRange, skipped
	}
	// Pass 2: compute the group median over the in-bounds values.
	watts := make([]float64, len(inRange))
	for i, sensor := range inRange {
		watts[i] = sensor.watts
	}
	sort.Float64s(watts)
	half := len(watts) / 2
	median := watts[half]
	if len(watts)%2 == 0 {
		median = (watts[half-1] + watts[half]) / 2
	}
	// Pass 3: reject anything more than 5× the median as a stuck/fault sensor.
	for _, sensor := range inRange {
		if median > 0 && sensor.watts > median*5 {
			skipped = append(skipped, fmt.Sprintf("%s (%.0f W: >5× median %.0f W, likely sensor fault)", sensor.name, sensor.watts, median))
			continue
		}
		valid = append(valid, sensor)
	}
	return valid, skipped
}
|
||||
|
||||
// sampleIPMISDRPowerSensors reads power sensors from `ipmitool sdr` in a single
// invocation and returns self-healed grouped sums.
//
// PSU identification delegates to collector.PSUSlotsFromSDR which uses the same
// slot-detection regexes as the hardware audit (PSU1_POWER_IN, PSU1_PIN, PS1 POut,
// Power1…). Self-healing: bounds checking + 5× median outlier rejection.
//
// GPU slot sensors (GPU_POWER_SLOTx, GPU1 Power, …) are classified separately
// since the audit collector does not track GPU PCIe slot power.
//
// Returns the zero snapshot when ipmitool is missing, fails, or prints nothing.
func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
	raw, err := exec.Command("ipmitool", "sdr").Output()
	if err != nil || len(raw) == 0 {
		// No IPMI / no sensors: zero-value snapshot signals "no SDR data".
		return sdrPowerSnapshot{}
	}
	sdrStr := string(raw)
	var snap sdrPowerSnapshot

	// ── PSU data via audit collector ─────────────────────────────────────────
	// collector.PSUSlotsFromSDR handles all vendor naming variants and applies
	// bounds checking inside parseBoundedFloat (0 < w ≤ 6000 W).
	collectorSlots := collector.PSUSlotsFromSDR(sdrStr)

	// Convert to benchmark type and collect sensor lists for cross-slot
	// median filtering. Synthetic names ("PSU-slot-<key>") are used so a
	// rejected reading can be traced back to its slot in SkippedSensors.
	var psuInSensors, psuOutSensors []sdrSensor
	for slotKey, sp := range collectorSlots {
		bsp := BenchmarkPSUSlotPower{Status: sp.Status}
		if sp.InputW != nil {
			bsp.InputW = sp.InputW
			psuInSensors = append(psuInSensors, sdrSensor{name: "PSU-slot-" + slotKey, watts: *sp.InputW})
		}
		if sp.OutputW != nil {
			bsp.OutputW = sp.OutputW
			psuOutSensors = append(psuOutSensors, sdrSensor{name: "PSU-slot-" + slotKey + "-out", watts: *sp.OutputW})
		}
		// Lazily allocate the map so a no-PSU server keeps PSUSlots nil.
		if snap.PSUSlots == nil {
			snap.PSUSlots = make(map[string]BenchmarkPSUSlotPower)
		}
		snap.PSUSlots[slotKey] = bsp
	}

	// Apply cross-slot outlier filter and sum. Note: per-slot readings in
	// snap.PSUSlots are kept even when a sensor is excluded from the sums.
	validIn, skIn := filterSensorGroup(psuInSensors, 6000)
	for _, s := range validIn {
		snap.PSUInW += s.watts
	}
	snap.SkippedSensors = append(snap.SkippedSensors, skIn...)

	validOut, skOut := filterSensorGroup(psuOutSensors, 6000)
	for _, s := range validOut {
		snap.PSUOutW += s.watts
	}
	snap.SkippedSensors = append(snap.SkippedSensors, skOut...)

	// ── GPU slot sensors ─────────────────────────────────────────────────────
	// collector does not track GPU PCIe slot power; classify here.
	// Matches: GPU_POWER_SLOTx (MSI), GPU1 Power (xFusion), GPU_PWR_x (generic).
	var gpuSensors []sdrSensor
	for _, line := range strings.Split(sdrStr, "\n") {
		// `ipmitool sdr` rows are pipe-separated: name | value | status.
		parts := strings.Split(line, "|")
		if len(parts) < 2 {
			continue
		}
		name := strings.TrimSpace(parts[0])
		nameLower := strings.ToLower(name)
		if !strings.Contains(nameLower, "gpu") {
			continue
		}
		// Require a power-ish keyword so GPU temp/fan sensors are excluded.
		if !strings.Contains(nameLower, "slot") && !strings.Contains(nameLower, "power") &&
			!strings.Contains(nameLower, "pwr") {
			continue
		}
		// Only wattage readings parse here; "no reading"/voltage rows skip.
		var w float64
		if n, _ := fmt.Sscanf(strings.TrimSpace(parts[1]), "%f Watts", &w); n != 1 {
			continue
		}
		gpuSensors = append(gpuSensors, sdrSensor{name: name, watts: w})
	}
	// Per-sensor cap is 2000 W (a single GPU slot), vs 6000 W for PSUs.
	validGPU, skGPU := filterSensorGroup(gpuSensors, 2000)
	for _, s := range validGPU {
		snap.GPUSlotW += s.watts
	}
	snap.SkippedSensors = append(snap.SkippedSensors, skGPU...)

	return snap
}
|
||||
|
||||
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||
func queryIPMIServerPowerW() (float64, error) {
|
||||
@@ -3255,23 +3464,51 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
|
||||
if sp := result.ServerPower; sp != nil && sp.Available {
|
||||
fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio)
|
||||
fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW)
|
||||
if sp.PSUInputLoadedW > 0 {
|
||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||
fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta)
|
||||
}
|
||||
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
// Server power comparison table.
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
||||
b.WriteString("| Metric | Value |\n")
|
||||
b.WriteString("|--------|-------|\n")
|
||||
fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
|
||||
b.WriteString("| Metric | Source | Value |\n")
|
||||
b.WriteString("|--------|--------|-------|\n")
|
||||
fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW)
|
||||
fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW)
|
||||
if sp.GPUSlotTotalW > 0 {
|
||||
fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW)
|
||||
}
|
||||
if sp.Available {
|
||||
fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW)
|
||||
}
|
||||
if sp.PSUInputLoadedW > 0 {
|
||||
fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW)
|
||||
fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW)
|
||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||
fmt.Fprintf(&b, "| PSU AC input Δ (loaded − idle) | IPMI SDR | %.0f W |\n", psuDelta)
|
||||
}
|
||||
if sp.PSUOutputLoadedW > 0 {
|
||||
fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW)
|
||||
fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW)
|
||||
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
|
||||
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
|
||||
fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff)
|
||||
}
|
||||
}
|
||||
if sp.Available {
|
||||
fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | %.0f W |\n", sp.DeltaW)
|
||||
ratio := sp.ReportingRatio
|
||||
dcmiPartial := detectDCMIPartialCoverage(sp) ||
|
||||
(sp.PSUInputIdleW == 0 && detectIPMISaturationFallback(result.RampSteps))
|
||||
ratioNote := ""
|
||||
switch {
|
||||
case dcmiPartial:
|
||||
ratioNote = "⚠ IPMI DCMI covers partial PSU set; use SDR ratio below for accuracy assessment"
|
||||
case ratio >= 0.9:
|
||||
ratioNote = "✓ GPU telemetry matches server power"
|
||||
case ratio >= 0.75:
|
||||
@@ -3279,14 +3516,83 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
default:
|
||||
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
||||
}
|
||||
fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
|
||||
fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote)
|
||||
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||
sdrRatio := psuDelta / sp.GPUReportedSumW
|
||||
sdrNote := ""
|
||||
switch {
|
||||
case sdrRatio >= 0.9:
|
||||
sdrNote = "✓ GPU telemetry matches wall power"
|
||||
case sdrRatio >= 0.75:
|
||||
sdrNote = "⚠ minor discrepancy"
|
||||
default:
|
||||
sdrNote = "✗ significant discrepancy"
|
||||
}
|
||||
fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote)
|
||||
}
|
||||
} else {
|
||||
b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
|
||||
b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, "\n> %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 {
|
||||
b.WriteString("## PSU Load Distribution\n\n")
|
||||
b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n")
|
||||
b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n")
|
||||
|
||||
// collect all slot keys
|
||||
slotSet := map[string]struct{}{}
|
||||
for k := range sp.PSUSlotReadingsIdle {
|
||||
slotSet[k] = struct{}{}
|
||||
}
|
||||
for k := range sp.PSUSlotReadingsLoaded {
|
||||
slotSet[k] = struct{}{}
|
||||
}
|
||||
slots := make([]string, 0, len(slotSet))
|
||||
for k := range slotSet {
|
||||
slots = append(slots, k)
|
||||
}
|
||||
sort.Strings(slots)
|
||||
|
||||
for _, slot := range slots {
|
||||
idle := sp.PSUSlotReadingsIdle[slot]
|
||||
loaded := sp.PSUSlotReadingsLoaded[slot]
|
||||
|
||||
fmtW := func(v *float64) string {
|
||||
if v == nil {
|
||||
return "—"
|
||||
}
|
||||
return fmt.Sprintf("%.0f W", *v)
|
||||
}
|
||||
|
||||
var deltaStr string
|
||||
if idle.InputW != nil && loaded.InputW != nil {
|
||||
deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW)
|
||||
} else {
|
||||
deltaStr = "—"
|
||||
}
|
||||
|
||||
status := loaded.Status
|
||||
if status == "" {
|
||||
status = idle.Status
|
||||
}
|
||||
if status == "" {
|
||||
status = "—"
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n",
|
||||
slot,
|
||||
fmtW(idle.InputW), fmtW(loaded.InputW),
|
||||
fmtW(idle.OutputW), fmtW(loaded.OutputW),
|
||||
deltaStr, status,
|
||||
)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
if len(result.Findings) > 0 {
|
||||
@@ -3318,6 +3624,130 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||
if len(result.PSUIssues) > 0 {
|
||||
b.WriteString("## PSU Issues\n\n")
|
||||
b.WriteString("The following power supply anomalies were detected during the test:\n\n")
|
||||
for _, issue := range result.PSUIssues {
|
||||
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Power Distribution Summary ────────────────────────────────────────────
|
||||
b.WriteString("## Power Distribution Summary\n\n")
|
||||
{
|
||||
var totalDefault, totalStable float64
|
||||
for _, gpu := range result.GPUs {
|
||||
stable := gpu.StablePowerLimitW
|
||||
if stable <= 0 {
|
||||
stable = gpu.AppliedPowerLimitW
|
||||
}
|
||||
totalDefault += gpu.DefaultPowerLimitW
|
||||
totalStable += stable
|
||||
}
|
||||
b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n")
|
||||
b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
stable := gpu.StablePowerLimitW
|
||||
if stable <= 0 {
|
||||
stable = gpu.AppliedPowerLimitW
|
||||
}
|
||||
realization := "-"
|
||||
if gpu.DefaultPowerLimitW > 0 && stable > 0 {
|
||||
realization = fmt.Sprintf("%.1f%%", stable/gpu.DefaultPowerLimitW*100)
|
||||
}
|
||||
derated := "-"
|
||||
if gpu.Derated {
|
||||
derated = "⚠ yes"
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n",
|
||||
gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated)
|
||||
}
|
||||
platformReal := "-"
|
||||
if totalDefault > 0 && totalStable > 0 {
|
||||
platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100)
|
||||
}
|
||||
fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n",
|
||||
totalDefault, totalStable, platformReal)
|
||||
|
||||
// Balance across GPUs — only meaningful with 2+ GPUs.
|
||||
if len(result.GPUs) > 1 {
|
||||
var minS, maxS, sumS float64
|
||||
var cnt int
|
||||
for _, gpu := range result.GPUs {
|
||||
s := gpu.StablePowerLimitW
|
||||
if s <= 0 {
|
||||
s = gpu.AppliedPowerLimitW
|
||||
}
|
||||
if s <= 0 {
|
||||
continue
|
||||
}
|
||||
sumS += s
|
||||
cnt++
|
||||
if cnt == 1 || s < minS {
|
||||
minS = s
|
||||
}
|
||||
if s > maxS {
|
||||
maxS = s
|
||||
}
|
||||
}
|
||||
if cnt > 0 {
|
||||
avg := sumS / float64(cnt)
|
||||
spread := (maxS - minS) / avg * 100
|
||||
balanceNote := "✓ balanced"
|
||||
switch {
|
||||
case spread > 20:
|
||||
balanceNote = "⚠ significant imbalance — check slot thermals"
|
||||
case spread > 10:
|
||||
balanceNote = "— minor imbalance"
|
||||
}
|
||||
fmt.Fprintf(&b, "**GPU power balance:** avg %.0f W · min %.0f W · max %.0f W · spread %.1f%% — %s\n\n",
|
||||
avg, minS, maxS, spread, balanceNote)
|
||||
}
|
||||
}
|
||||
|
||||
// Ramp scalability table — power efficiency of adding each GPU.
|
||||
if len(result.RampSteps) > 1 {
|
||||
b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n")
|
||||
b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n")
|
||||
b.WriteString("|------|------|-----------------------|-------------|---------------------|\n")
|
||||
// First GPU stable TDP as the reference unit for efficiency.
|
||||
var firstStable float64
|
||||
if len(result.GPUs) > 0 {
|
||||
firstStable = result.GPUs[0].StablePowerLimitW
|
||||
if firstStable <= 0 {
|
||||
firstStable = result.GPUs[0].AppliedPowerLimitW
|
||||
}
|
||||
}
|
||||
var prevCumulative float64
|
||||
for _, step := range result.RampSteps {
|
||||
var cumulative float64
|
||||
for _, gpuIdx := range step.GPUIndices {
|
||||
for _, g := range result.GPUs {
|
||||
if g.Index != gpuIdx {
|
||||
continue
|
||||
}
|
||||
s := g.StablePowerLimitW
|
||||
if s <= 0 {
|
||||
s = g.AppliedPowerLimitW
|
||||
}
|
||||
cumulative += s
|
||||
}
|
||||
}
|
||||
incremental := cumulative - prevCumulative
|
||||
efficiency := "—"
|
||||
if step.StepIndex > 1 && firstStable > 0 {
|
||||
efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100)
|
||||
}
|
||||
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n",
|
||||
step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency)
|
||||
prevCumulative = cumulative
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
b.WriteString("## Per-Slot Results\n\n")
|
||||
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
|
||||
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
|
||||
@@ -3440,6 +3870,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
serverIdleOK = true
|
||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||
}
|
||||
sdrIdle := sampleIPMISDRPowerSensors()
|
||||
psuBefore := psuStatusSnapshot()
|
||||
|
||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||
@@ -3753,11 +4184,52 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
}
|
||||
|
||||
// Characterize server power from IPMI idle/loaded samples.
|
||||
// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
|
||||
// ReportingRatio = IPMI_delta / GPU_reported_sum:
|
||||
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
||||
// gpuActualSumW = sum of p95 GPU power from the last ramp step — actual
|
||||
// measured consumption, not the stable limit cap. This is the correct
|
||||
// denominator for the reporting ratio: limit caps (PlatformMaxTDPW) inflate
|
||||
// the denominator and make the ratio appear artificially low.
|
||||
var gpuActualSumW float64
|
||||
if n := len(result.RampSteps); n > 0 {
|
||||
gpuActualSumW = result.RampSteps[n-1].TotalObservedPowerW
|
||||
}
|
||||
if gpuActualSumW <= 0 {
|
||||
gpuActualSumW = result.PlatformMaxTDPW
|
||||
}
|
||||
_ = serverIdleOK // used implicitly via characterizeServerPower
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK)
|
||||
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
|
||||
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
|
||||
if result.ServerPower != nil {
|
||||
sdrLoaded := sampleIPMISDRPowerSensors()
|
||||
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
|
||||
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
|
||||
result.ServerPower.PSUOutputIdleW = sdrIdle.PSUOutW
|
||||
result.ServerPower.PSUOutputLoadedW = sdrLoaded.PSUOutW
|
||||
result.ServerPower.GPUSlotTotalW = sdrLoaded.GPUSlotW
|
||||
if len(sdrIdle.PSUSlots) > 0 {
|
||||
result.ServerPower.PSUSlotReadingsIdle = sdrIdle.PSUSlots
|
||||
}
|
||||
if len(sdrLoaded.PSUSlots) > 0 {
|
||||
result.ServerPower.PSUSlotReadingsLoaded = sdrLoaded.PSUSlots
|
||||
}
|
||||
if sdrIdle.PSUInW > 0 && result.ServerPower.IdleW > 0 {
|
||||
result.ServerPower.DCMICoverageRatio = result.ServerPower.IdleW / sdrIdle.PSUInW
|
||||
}
|
||||
if len(sdrLoaded.SkippedSensors) > 0 {
|
||||
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
||||
"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
|
||||
}
|
||||
// Detect DCMI partial coverage: direct SDR comparison first,
|
||||
// ramp heuristic as fallback when SDR PSU sensors are absent.
|
||||
dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||
|
||||
(sdrIdle.PSUInW == 0 && detectIPMISaturationFallback(result.RampSteps))
|
||||
if dcmiUnreliable {
|
||||
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
||||
fmt.Sprintf("IPMI DCMI covers only a subset of installed PSUs (coverage %.0f%%). "+
|
||||
"Use SDR PSU Δ ratio for GPU accuracy assessment; DCMI ratio is not reliable.",
|
||||
result.ServerPower.DCMICoverageRatio*100))
|
||||
}
|
||||
}
|
||||
result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot())
|
||||
// Write top-level gpu-metrics.csv/.html aggregating all phases.
|
||||
writeBenchmarkMetricsFiles(runDir, allPowerRows)
|
||||
|
||||
@@ -275,18 +275,55 @@ type BenchmarkScorecard struct {
|
||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
|
||||
// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
|
||||
// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
|
||||
// over-reporting its power consumption.
|
||||
// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
|
||||
// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
|
||||
// so benchmark and audit data can be correlated by slot.
|
||||
type BenchmarkPSUSlotPower struct {
|
||||
InputW *float64 `json:"input_w,omitempty"` // AC wall input (PSUx_POWER_IN)
|
||||
OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
|
||||
Status string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkServerPower captures server-side power from multiple independent
|
||||
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
|
||||
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
|
||||
// covers only a subset of installed PSUs (partial coverage).
|
||||
//
|
||||
// Source legend:
|
||||
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
|
||||
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||
type BenchmarkServerPower struct {
|
||||
Available bool `json:"available"`
|
||||
IdleW float64 `json:"idle_w,omitempty"`
|
||||
LoadedW float64 `json:"loaded_w,omitempty"`
|
||||
DeltaW float64 `json:"delta_w,omitempty"`
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
Available bool `json:"available"`
|
||||
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
|
||||
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
|
||||
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
|
||||
|
||||
// PSU DC output sum — power delivered to server internals after conversion.
|
||||
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
|
||||
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
|
||||
|
||||
// Per-slot PSU readings at idle and at peak load.
|
||||
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
|
||||
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
|
||||
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
|
||||
|
||||
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
|
||||
// PCIe slot delivery only (excludes 16-pin connector power).
|
||||
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
|
||||
|
||||
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
|
||||
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
|
||||
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
|
||||
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||
|
||||
Reference in New Issue
Block a user