Detect PSU faults during perf and power benchmarks
Snapshot IPMI "Power Supply" sensor states before and after each benchmark run. Compare before/after to surface only *new* anomalies (pre-existing faults are excluded). Results land in NvidiaBenchmarkResult.PSUIssues and NvidiaPowerBenchResult.PSUIssues (JSON: psu_issues) and are printed in the text benchmark report under a "PSU Issues" section.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -286,6 +286,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
|
||||
psuBefore := psuStatusSnapshot()
|
||||
var metricRows []GPUMetricRow
|
||||
metricTimelineSec := 0.0
|
||||
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
|
||||
@@ -669,6 +670,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
|
||||
result.Findings = buildBenchmarkFindings(result)
|
||||
result.OverallStatus = benchmarkOverallStatus(result)
|
||||
result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot())
|
||||
writeBenchmarkMetricsFiles(runDir, metricRows)
|
||||
|
||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||
@@ -2214,6 +2216,66 @@ func maxInt(a, b int) int {
|
||||
return b
|
||||
}
|
||||
|
||||
// psuStatusSnapshot samples PSU health sensor states by running
// `ipmitool sdr type "Power Supply"` and parsing its pipe-delimited output.
//
// Each output row must carry at least five "|"-separated columns; the first
// column is taken as the sensor name and the fifth as the reading string
// (e.g. "Presence detected", "Failure detected"). Rows with fewer columns or
// an empty name are skipped. If a sensor name occurs more than once, the last
// row wins.
//
// Returns nil when the command fails, produces no output, or yields no
// parseable sensor rows, so callers can treat a nil/empty result as "no PSU
// data available".
func psuStatusSnapshot() map[string]string {
	out, err := exec.Command("ipmitool", "sdr", "type", "Power Supply").Output()
	if err != nil || len(out) == 0 {
		return nil
	}

	// Allocated lazily so that output without any usable sensor row yields
	// nil, matching the documented contract, rather than an empty map.
	var result map[string]string
	for _, line := range strings.Split(string(out), "\n") {
		parts := strings.Split(line, "|")
		if len(parts) < 5 {
			continue
		}
		name := strings.TrimSpace(parts[0])
		if name == "" {
			continue
		}
		if result == nil {
			result = make(map[string]string)
		}
		result[name] = strings.TrimSpace(parts[4])
	}
	return result
}
|
||||
|
||||
// diffPSUStatus compares PSU sensor snapshots (sensor name → reading string)
// taken before and after a test and returns human-readable descriptions of
// sensors that entered a fault state during the test.
//
// A reading counts as a fault when it contains, case-insensitively, any of
// "failure", "fault", "warning", "predictive", "absent" or "ac lost".
//
// Reporting rules:
//   - A sensor faulted in both snapshots is pre-existing and is excluded.
//   - A sensor that was healthy before and faulted after is reported as
//     "changed from … to … during test".
//   - A sensor missing from a non-empty before snapshot but faulted after is
//     reported as "appeared after test start".
//
// Returns nil when either snapshot is nil or empty. Without a baseline there
// is no way to tell pre-existing faults from new ones, and reporting them all
// as "appeared after test start" would blame the test for faults it did not
// cause. The returned slice is sorted for deterministic output.
func diffPSUStatus(before, after map[string]string) []string {
	if len(before) == 0 || len(after) == 0 {
		return nil
	}

	isFault := func(s string) bool {
		lower := strings.ToLower(s)
		return strings.Contains(lower, "failure") ||
			strings.Contains(lower, "fault") ||
			strings.Contains(lower, "warning") ||
			strings.Contains(lower, "predictive") ||
			strings.Contains(lower, "absent") ||
			strings.Contains(lower, "ac lost")
	}

	var issues []string
	for name, afterReading := range after {
		if !isFault(afterReading) {
			continue
		}
		prev, had := before[name]
		switch {
		case !had:
			issues = append(issues, fmt.Sprintf("%s: %s (appeared after test start)", name, afterReading))
		case isFault(prev):
			// Pre-existing fault, not caused by this test.
		default:
			issues = append(issues, fmt.Sprintf("%s: changed from %q to %q during test", name, prev, afterReading))
		}
	}

	// Map iteration order is random; sort so reports are stable across runs.
	sort.Strings(issues)
	return issues
}
|
||||
|
||||
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||
func queryIPMIServerPowerW() (float64, error) {
|
||||
@@ -3378,6 +3440,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
serverIdleOK = true
|
||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||
}
|
||||
psuBefore := psuStatusSnapshot()
|
||||
|
||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||
@@ -3695,6 +3758,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
||||
_ = serverIdleOK // used implicitly via characterizeServerPower
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
||||
result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot())
|
||||
// Write top-level gpu-metrics.csv/.html aggregating all phases.
|
||||
writeBenchmarkMetricsFiles(runDir, allPowerRows)
|
||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||
|
||||
@@ -383,6 +383,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
}
|
||||
}
|
||||
|
||||
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||
if len(result.PSUIssues) > 0 {
|
||||
b.WriteString("## PSU Issues\n\n")
|
||||
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
|
||||
for _, issue := range result.PSUIssues {
|
||||
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||
if cooling := result.Cooling; cooling != nil {
|
||||
b.WriteString("## Cooling\n\n")
|
||||
|
||||
@@ -107,6 +107,10 @@ type NvidiaBenchmarkResult struct {
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalization struct {
|
||||
@@ -333,6 +337,10 @@ type NvidiaPowerBenchResult struct {
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchGPU struct {
|
||||
|
||||
Reference in New Issue
Block a user