From 028bb30333d986f72c1d5dbe819b1af412120259 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sat, 18 Apr 2026 15:08:41 +0300 Subject: [PATCH] Detect PSU faults during perf and power benchmarks Snapshot IPMI "Power Supply" sensor states before and after each benchmark run. Compare before/after to surface only *new* anomalies (pre-existing faults are excluded). Results land in NvidiaBenchmarkResult.PSUIssues and NvidiaPowerBenchResult.PSUIssues (JSON: psu_issues) and are printed in the text benchmark report under a "PSU Issues" section. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 64 +++++++++++++++++++++ audit/internal/platform/benchmark_report.go | 10 ++++ audit/internal/platform/benchmark_types.go | 8 +++ 3 files changed, 82 insertions(+) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 48b863f..96a9ede 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -286,6 +286,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) + psuBefore := psuStatusSnapshot() var metricRows []GPUMetricRow metricTimelineSec := 0.0 gpuBurnLog := filepath.Join(runDir, "gpu-burn.log") @@ -669,6 +670,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv result.Findings = buildBenchmarkFindings(result) result.OverallStatus = benchmarkOverallStatus(result) + result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot()) writeBenchmarkMetricsFiles(runDir, metricRows) resultJSON, err := json.MarshalIndent(result, "", " ") @@ -2214,6 +2216,66 @@ func maxInt(a, b int) int { return b } +// psuStatusSnapshot samples PSU health sensor states via +// `ipmitool sdr type "Power Supply"`. Returns a map of sensor name → reading +// string (e.g. "Presence detected", "Failure detected"). 
// Returns nil when IPMI is unavailable, when the command fails or prints
// nothing, or when no Power Supply entity sensors are present.
//
// Only rows with at least five '|'-separated columns are used: column 0 is
// taken as the sensor name and column 4 as the reading, both trimmed of
// surrounding whitespace. Rows with an empty name are skipped, and a later row
// with the same name overwrites an earlier one.
//
// NOTE(review): ipmitool is invoked without a context or timeout, so this call
// blocks until the command exits — consider exec.CommandContext with a deadline.
func psuStatusSnapshot() map[string]string {
	out, err := exec.Command("ipmitool", "sdr", "type", "Power Supply").Output()
	if err != nil || len(out) == 0 {
		return nil
	}
	result := make(map[string]string)
	for _, line := range strings.Split(string(out), "\n") {
		parts := strings.Split(line, "|")
		if len(parts) < 5 {
			continue
		}
		name := strings.TrimSpace(parts[0])
		if name == "" {
			continue
		}
		result[name] = strings.TrimSpace(parts[4])
	}
	if len(result) == 0 {
		// Honour the documented contract: no usable sensor rows means nil,
		// not an empty map.
		return nil
	}
	return result
}

// diffPSUStatus compares PSU sensor snapshots taken before and after a test
// and returns sorted, human-readable messages for sensors whose reading
// entered a fault state during the test.
//
// A reading counts as a fault when it contains, case-insensitively, one of
// "failure", "fault", "warning", "predictive", "absent" or "ac lost".
//
// Nothing is reported when:
//   - after is empty (no post-test data);
//   - before is empty (there is no baseline, so new faults cannot be told
//     apart from pre-existing ones);
//   - the sensor was already in a fault state in before.
//
// A faulted sensor that is missing from a non-empty baseline is reported as
// having appeared after the test started.
func diffPSUStatus(before, after map[string]string) []string {
	if len(before) == 0 || len(after) == 0 {
		return nil
	}
	faultKeywords := []string{"failure", "fault", "warning", "predictive", "absent", "ac lost"}
	isFault := func(reading string) bool {
		lower := strings.ToLower(reading)
		for _, kw := range faultKeywords {
			if strings.Contains(lower, kw) {
				return true
			}
		}
		return false
	}
	var issues []string
	for name, afterReading := range after {
		if !isFault(afterReading) {
			continue
		}
		prev, had := before[name]
		switch {
		case !had:
			issues = append(issues, fmt.Sprintf("%s: %s (appeared after test start)", name, afterReading))
		case isFault(prev):
			// Pre-existing fault, not caused by this test.
		default:
			issues = append(issues, fmt.Sprintf("%s: changed from %q to %q during test", name, prev, afterReading))
		}
	}
	// Map iteration order is random; sort so the report is deterministic.
	sort.Strings(issues)
	return issues
}

// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed. func queryIPMIServerPowerW() (float64, error) { @@ -3378,6 +3440,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N serverIdleOK = true logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) } + psuBefore := psuStatusSnapshot() // Phase 1: calibrate each GPU individually (sequentially, one at a time) to // establish a true single-card power baseline unaffected by neighbour heat. @@ -3695,6 +3758,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N // ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP. _ = serverIdleOK // used implicitly via characterizeServerPower result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK) + result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot()) // Write top-level gpu-metrics.csv/.html aggregating all phases. 
writeBenchmarkMetricsFiles(runDir, allPowerRows) resultJSON, err := json.MarshalIndent(result, "", " ") diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index c285234..646d3b1 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -383,6 +383,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { } } + // ── PSU Issues ──────────────────────────────────────────────────────────── + if len(result.PSUIssues) > 0 { + b.WriteString("## PSU Issues\n\n") + b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n") + for _, issue := range result.PSUIssues { + fmt.Fprintf(&b, "- ⛔ %s\n", issue) + } + b.WriteString("\n") + } + // ── Cooling ─────────────────────────────────────────────────────────────── if cooling := result.Cooling; cooling != nil { b.WriteString("## Cooling\n\n") diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 6896589..a1caaf4 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -107,6 +107,10 @@ type NvidiaBenchmarkResult struct { GPUs []BenchmarkGPUResult `json:"gpus"` Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"` ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` + // PSUIssues holds power supply fault events detected by comparing IPMI PSU + // sensor states before and after the benchmark run. Empty when IPMI is + // unavailable or no PSU faults occurred during the test. 
+ PSUIssues []string `json:"psu_issues,omitempty"` } type BenchmarkNormalization struct { @@ -333,6 +337,10 @@ type NvidiaPowerBenchResult struct { ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` Findings []string `json:"findings,omitempty"` GPUs []NvidiaPowerBenchGPU `json:"gpus"` + // PSUIssues holds power supply fault events detected by comparing IPMI PSU + // sensor states before and after the power benchmark run. Empty when IPMI is + // unavailable or no PSU faults occurred during the test. + PSUIssues []string `json:"psu_issues,omitempty"` } type NvidiaPowerBenchGPU struct {