Detect PSU faults during perf and power benchmarks
Snapshot IPMI "Power Supply" sensor states before and after each benchmark run, then compare the two snapshots to surface only *new* anomalies (pre-existing faults are excluded). Results land in NvidiaBenchmarkResult.PSUIssues and NvidiaPowerBenchResult.PSUIssues (JSON: psu_issues) and are printed in the text benchmark report under a "PSU Issues" section.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -286,6 +286,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
|
|
||||||
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
|
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
|
||||||
|
psuBefore := psuStatusSnapshot()
|
||||||
var metricRows []GPUMetricRow
|
var metricRows []GPUMetricRow
|
||||||
metricTimelineSec := 0.0
|
metricTimelineSec := 0.0
|
||||||
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
|
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
|
||||||
@@ -669,6 +670,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
|
|
||||||
result.Findings = buildBenchmarkFindings(result)
|
result.Findings = buildBenchmarkFindings(result)
|
||||||
result.OverallStatus = benchmarkOverallStatus(result)
|
result.OverallStatus = benchmarkOverallStatus(result)
|
||||||
|
result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot())
|
||||||
writeBenchmarkMetricsFiles(runDir, metricRows)
|
writeBenchmarkMetricsFiles(runDir, metricRows)
|
||||||
|
|
||||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
@@ -2214,6 +2216,66 @@ func maxInt(a, b int) int {
|
|||||||
return b
|
return b
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// psuStatusSnapshot samples PSU health sensor states by running
// `ipmitool sdr type "Power Supply"` and parsing its pipe-delimited output.
//
// It returns a map of sensor name → reading string (e.g. "Presence detected",
// "Failure detected"). It returns nil when the ipmitool command fails, prints
// nothing, or prints no parsable sensor rows, so callers can treat a nil
// result uniformly as "no PSU data available".
func psuStatusSnapshot() map[string]string {
	out, err := exec.Command("ipmitool", "sdr", "type", "Power Supply").Output()
	if err != nil || len(out) == 0 {
		return nil
	}
	readings := parsePSUSensorRows(string(out))
	if len(readings) == 0 {
		// Output was produced but contained no usable rows; report "no data"
		// as nil rather than as an empty non-nil map.
		return nil
	}
	return readings
}

// parsePSUSensorRows extracts sensor name → reading pairs from pipe-delimited
// sensor listing text. A row is used only when it has at least five
// '|'-separated columns; the first column is taken as the sensor name and the
// fifth as its reading, both with surrounding whitespace trimmed. Rows with
// fewer columns or an empty name are skipped. If the same name appears more
// than once, the last occurrence wins.
func parsePSUSensorRows(out string) map[string]string {
	readings := make(map[string]string)
	for _, line := range strings.Split(out, "\n") {
		fields := strings.Split(line, "|")
		if len(fields) < 5 {
			continue // blank, truncated, or otherwise non-sensor line
		}
		name := strings.TrimSpace(fields[0])
		if name == "" {
			continue
		}
		readings[name] = strings.TrimSpace(fields[4])
	}
	return readings
}
|
||||||
|
|
||||||
|
// diffPSUStatus reports PSU sensors that are in a fault state in the "after"
// snapshot but were not already in a fault state in the "before" snapshot.
//
// A reading counts as a fault when, compared case-insensitively, it contains
// one of: "failure", "fault", "warning", "predictive", "absent", "ac lost".
//
// For every faulted sensor in after:
//   - if before has no entry for it, the issue is reported as having appeared
//     after test start;
//   - if before had it in a fault state too, it is treated as pre-existing and
//     omitted;
//   - otherwise the transition from the previous reading to the current one
//     is reported.
//
// The returned messages are sorted lexicographically. The result is nil when
// after is empty or when no new faults were found.
func diffPSUStatus(before, after map[string]string) []string {
	if len(after) == 0 {
		return nil
	}

	faultMarkers := [...]string{"failure", "fault", "warning", "predictive", "absent", "ac lost"}
	faulty := func(reading string) bool {
		normalized := strings.ToLower(reading)
		for _, marker := range faultMarkers {
			if strings.Contains(normalized, marker) {
				return true
			}
		}
		return false
	}

	var issues []string
	for sensor, current := range after {
		if !faulty(current) {
			continue
		}
		previous, known := before[sensor]
		switch {
		case !known:
			issues = append(issues, fmt.Sprintf("%s: %s (appeared after test start)", sensor, current))
		case faulty(previous):
			// Fault was already present before the test; not attributed to it.
		default:
			issues = append(issues, fmt.Sprintf("%s: changed from %q to %q during test", sensor, previous, current))
		}
	}

	// Map iteration order is random; sort so the report is deterministic.
	sort.Strings(issues)
	return issues
}
|
||||||
|
|
||||||
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||||
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||||
func queryIPMIServerPowerW() (float64, error) {
|
func queryIPMIServerPowerW() (float64, error) {
|
||||||
@@ -3378,6 +3440,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
serverIdleOK = true
|
serverIdleOK = true
|
||||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||||
}
|
}
|
||||||
|
psuBefore := psuStatusSnapshot()
|
||||||
|
|
||||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||||
@@ -3695,6 +3758,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
||||||
_ = serverIdleOK // used implicitly via characterizeServerPower
|
_ = serverIdleOK // used implicitly via characterizeServerPower
|
||||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
||||||
|
result.PSUIssues = diffPSUStatus(psuBefore, psuStatusSnapshot())
|
||||||
// Write top-level gpu-metrics.csv/.html aggregating all phases.
|
// Write top-level gpu-metrics.csv/.html aggregating all phases.
|
||||||
writeBenchmarkMetricsFiles(runDir, allPowerRows)
|
writeBenchmarkMetricsFiles(runDir, allPowerRows)
|
||||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
|
|||||||
@@ -383,6 +383,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||||
|
if len(result.PSUIssues) > 0 {
|
||||||
|
b.WriteString("## PSU Issues\n\n")
|
||||||
|
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
|
||||||
|
for _, issue := range result.PSUIssues {
|
||||||
|
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||||
if cooling := result.Cooling; cooling != nil {
|
if cooling := result.Cooling; cooling != nil {
|
||||||
b.WriteString("## Cooling\n\n")
|
b.WriteString("## Cooling\n\n")
|
||||||
|
|||||||
@@ -107,6 +107,10 @@ type NvidiaBenchmarkResult struct {
|
|||||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkNormalization struct {
|
type BenchmarkNormalization struct {
|
||||||
@@ -333,6 +337,10 @@ type NvidiaPowerBenchResult struct {
|
|||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchGPU struct {
|
type NvidiaPowerBenchGPU struct {
|
||||||
|
|||||||
Reference in New Issue
Block a user