Detect PSU faults during perf and power benchmarks

Snapshot IPMI "Power Supply" sensor states before and after each benchmark
run. Compare before/after to surface only *new* anomalies (pre-existing faults
are excluded). Results land in NvidiaBenchmarkResult.PSUIssues and
NvidiaPowerBenchResult.PSUIssues (JSON: psu_issues) and are printed in the
text benchmark report under a "PSU Issues" section.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 15:08:41 +03:00
parent 7d64e5d215
commit 028bb30333
3 changed files with 82 additions and 0 deletions

View File

@@ -286,6 +286,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
psuBefore := psuStatusSnapshot()
var metricRows []GPUMetricRow
metricTimelineSec := 0.0
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
@@ -669,6 +670,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
result.Findings = buildBenchmarkFindings(result)
result.OverallStatus = benchmarkOverallStatus(result)
result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot())
writeBenchmarkMetricsFiles(runDir, metricRows)
resultJSON, err := json.MarshalIndent(result, "", " ")
@@ -2214,6 +2216,66 @@ func maxInt(a, b int) int {
return b
}
// psuStatusSnapshot samples PSU health sensor states by running
// `ipmitool sdr type "Power Supply"` and parsing its pipe-separated output.
//
// Every output line with at least five `|`-separated columns contributes one
// entry: the trimmed first column is the sensor name and the trimmed fifth
// column is the reading string (e.g. "Presence detected", "Failure detected").
// Lines with fewer columns or an empty name are ignored; if a sensor name
// repeats, the last line wins.
//
// Returns nil when ipmitool fails or prints nothing, and also when no line
// yields a sensor entry, so callers can uniformly treat nil as "no PSU data".
func psuStatusSnapshot() map[string]string {
	out, err := exec.Command("ipmitool", "sdr", "type", "Power Supply").Output()
	if err != nil || len(out) == 0 {
		// Best-effort probe: a missing binary or unreachable BMC is not an error
		// for the benchmark, it just means there is nothing to compare.
		return nil
	}
	result := make(map[string]string)
	for _, line := range strings.Split(string(out), "\n") {
		parts := strings.Split(line, "|")
		if len(parts) < 5 {
			continue
		}
		name := strings.TrimSpace(parts[0])
		if name == "" {
			continue
		}
		result[name] = strings.TrimSpace(parts[4])
	}
	if len(result) == 0 {
		// Output was present but contained no parsable sensor rows; honour the
		// documented contract instead of handing back an empty non-nil map.
		return nil
	}
	return result
}
// diffPSUStatus compares PSU sensor snapshots taken before and after a test.
//
// Both arguments map sensor name → reading string. A reading may list several
// asserted states separated by commas (e.g. "Presence detected, Failure
// detected"); each state is evaluated on its own. A state counts as a fault
// when, case-insensitively, it contains one of: "failure", "fault", "warning",
// "predictive", "absent", "ac lost".
//
// Returns sorted, human-readable strings for sensors that asserted at least
// one fault state after the test that was not asserted before it:
//   - sensor present in both snapshots: `<name>: changed from "<before>" to "<after>" during test`
//   - sensor missing from the before snapshot: `<name>: <after> (appeared after test start)`
//
// Fault states present in both snapshots are treated as pre-existing and are
// excluded, so only new anomalies are reported. A sensor that was already
// faulty but gained an additional fault state is still reported. Returns nil
// when after is empty or no new faults are found.
func diffPSUStatus(before, after map[string]string) []string {
	if len(after) == 0 {
		return nil
	}

	faultKeywords := []string{"failure", "fault", "warning", "predictive", "absent", "ac lost"}

	// faultStates returns the normalised (trimmed, lower-cased) fault states
	// asserted in a reading. None of the keywords contains a comma, so a reading
	// has a fault state exactly when the whole reading contains a keyword.
	faultStates := func(reading string) []string {
		var states []string
		for _, state := range strings.Split(reading, ",") {
			state = strings.ToLower(strings.TrimSpace(state))
			for _, kw := range faultKeywords {
				if strings.Contains(state, kw) {
					states = append(states, state)
					break
				}
			}
		}
		return states
	}

	var issues []string
	for name, afterReading := range after {
		afterFaults := faultStates(afterReading)
		if len(afterFaults) == 0 {
			continue
		}

		beforeReading, had := before[name]
		if !had {
			issues = append(issues, fmt.Sprintf("%s: %s (appeared after test start)", name, afterReading))
			continue
		}

		// Only fault states that were not already asserted before the test are
		// attributed to it; identical pre-existing faults stay silent.
		known := make(map[string]bool)
		for _, state := range faultStates(beforeReading) {
			known[state] = true
		}
		hasNew := false
		for _, state := range afterFaults {
			if !known[state] {
				hasNew = true
				break
			}
		}
		if !hasNew {
			continue
		}
		issues = append(issues, fmt.Sprintf("%s: changed from %q to %q during test", name, beforeReading, afterReading))
	}

	// Map iteration order is random; sort for deterministic reports.
	sort.Strings(issues)
	return issues
}
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) {
@@ -3378,6 +3440,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
psuBefore := psuStatusSnapshot()
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
@@ -3695,6 +3758,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
_ = serverIdleOK // used implicitly via characterizeServerPower
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot())
// Write top-level gpu-metrics.csv/.html aggregating all phases.
writeBenchmarkMetricsFiles(runDir, allPowerRows)
resultJSON, err := json.MarshalIndent(result, "", " ")

View File

@@ -383,6 +383,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
}
}
// ── PSU Issues ────────────────────────────────────────────────────────────
if len(result.PSUIssues) > 0 {
b.WriteString("## PSU Issues\n\n")
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
for _, issue := range result.PSUIssues {
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
}
b.WriteString("\n")
}
// ── Cooling ───────────────────────────────────────────────────────────────
if cooling := result.Cooling; cooling != nil {
b.WriteString("## Cooling\n\n")

View File

@@ -107,6 +107,10 @@ type NvidiaBenchmarkResult struct {
GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
// sensor states before and after the benchmark run. Empty when IPMI is
// unavailable or no PSU faults occurred during the test.
PSUIssues []string `json:"psu_issues,omitempty"`
}
type BenchmarkNormalization struct {
@@ -333,6 +337,10 @@ type NvidiaPowerBenchResult struct {
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
// sensor states before and after the power benchmark run. Empty when IPMI is
// unavailable or no PSU faults occurred during the test.
PSUIssues []string `json:"psu_issues,omitempty"`
}
type NvidiaPowerBenchGPU struct {