diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index c10a242..bddf377 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -401,6 +401,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples) } result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK) + result.Cooling = summarizeBenchmarkCooling(metricRows) // Apply server-power penalty when IPMI reports the server delta is much // lower than GPU-reported sum: GPU power telemetry is over-stated, making @@ -739,7 +740,7 @@ func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices [] if ctx.Err() != nil { return rows, ctx.Err() } - samples, err := sampleGPUMetrics(gpuIndices) + samples, err := sampleBenchmarkTelemetry(gpuIndices) if err == nil { elapsed := time.Since(start).Seconds() for i := range samples { @@ -774,7 +775,7 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string case <-stopCh: return case <-ticker.C: - samples, err := sampleGPUMetrics(gpuIndices) + samples, err := sampleBenchmarkTelemetry(gpuIndices) if err != nil { continue } @@ -794,6 +795,37 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string return out, metricRows, err } +type benchmarkCoolingSample struct { + AvgFanRPM float64 + AvgFanDutyCyclePct float64 + FanDutyCycleAvailable bool +} + +func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { + samples, err := sampleGPUMetrics(gpuIndices) + if err != nil { + return nil, err + } + fanSample := sampleBenchmarkCoolingSample() + for i := range samples { + samples[i].FanAvgRPM = fanSample.AvgFanRPM + samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct + samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable + } + return samples, nil +} + +func sampleBenchmarkCoolingSample() benchmarkCoolingSample { + fans, _ := sampleFanSpeeds() + avgRPM, _, _ := fanRPMStats(fans) + dutyPct, dutyAvailable := sampleFanDutyCyclePct() + return benchmarkCoolingSample{ + AvgFanRPM: avgRPM, + AvgFanDutyCyclePct: dutyPct, + FanDutyCycleAvailable: dutyAvailable, + } +} + func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow { if len(rows) == 0 { return nil @@ -1022,6 +1054,37 @@ func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary return summary } +func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary { + if len(rows) == 0 { + return nil + } + var rpmValues []float64 + var dutyValues []float64 + for _, row := range rows { + if row.FanAvgRPM > 0 { + rpmValues = append(rpmValues, row.FanAvgRPM) + } + if row.FanDutyCycleAvailable { + dutyValues = append(dutyValues, row.FanDutyCyclePct) + } + } + if len(rpmValues) == 0 && len(dutyValues) == 0 { + return nil + } + summary := &BenchmarkCoolingSummary{ + Available: true, + AvgFanRPM: benchmarkMean(rpmValues), + } + if len(dutyValues) > 0 { + summary.FanDutyCycleAvailable = true + summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues) + summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95) + } else { + summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected") + } + return summary +} + func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { score := BenchmarkScorecard{} diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index e07a8d6..0b66d92 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -290,6 +290,31 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { } } + // ── Cooling ─────────────────────────────────────────────────────────────── + if cooling := result.Cooling; cooling != nil { + b.WriteString("## Cooling\n\n") + if cooling.Available { + b.WriteString("| Metric | Value |\n|--------|-------|\n") + fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM) + if cooling.FanDutyCycleAvailable { + fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct) + fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct) + } else { + b.WriteString("| Average fan duty cycle | N/A |\n") + b.WriteString("| P95 fan duty cycle | N/A |\n") + } + b.WriteString("\n") + } else { + b.WriteString("Cooling telemetry unavailable.\n\n") + } + for _, note := range cooling.Notes { + fmt.Fprintf(&b, "- %s\n", note) + } + if len(cooling.Notes) > 0 { + b.WriteString("\n") + } + } + // ── Raw files ───────────────────────────────────────────────────────────── b.WriteString("## Raw Files\n\n") b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n") diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go index c57058f..1b2f08a 100644 --- a/audit/internal/platform/benchmark_test.go +++ b/audit/internal/platform/benchmark_test.go @@ -131,6 +131,13 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) { DegradationReasons: []string{"power_capped"}, }, }, + Cooling: &BenchmarkCoolingSummary{ + Available: true, + AvgFanRPM: 9200, + FanDutyCycleAvailable: true, + AvgFanDutyCyclePct: 47.5, + P95FanDutyCyclePct: 62.0, + }, } report := renderBenchmarkReport(result) @@ -140,6 +147,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) { "1176.00", "fp16_tensor", "700.00", + "Cooling", + "Average fan duty cycle", + "47.5%", } { if !strings.Contains(report, needle) { t.Fatalf("report missing %q\n%s", needle, report) diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index b716fcc..bb0690b 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -25,6 +25,17 @@ type BenchmarkCPULoad struct { Note string `json:"note,omitempty"` } +// BenchmarkCoolingSummary captures fan telemetry averaged across the full +// benchmark run. +type BenchmarkCoolingSummary struct { + Available bool `json:"available"` + AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"` + FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"` + AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"` + P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"` + Notes []string `json:"notes,omitempty"` +} + const ( NvidiaBenchmarkProfileStandard = "standard" NvidiaBenchmarkProfileStability = "stability" @@ -61,6 +72,7 @@ type NvidiaBenchmarkResult struct { Normalization BenchmarkNormalization `json:"normalization"` HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"` CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"` + Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"` GPUs []BenchmarkGPUResult `json:"gpus"` Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"` ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` diff --git a/audit/internal/platform/gpu_metrics.go b/audit/internal/platform/gpu_metrics.go index 5bc0f4b..6d16393 100644 --- a/audit/internal/platform/gpu_metrics.go +++ b/audit/internal/platform/gpu_metrics.go @@ -13,15 +13,18 @@ import ( // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. type GPUMetricRow struct { - Stage string `json:"stage,omitempty"` - ElapsedSec float64 `json:"elapsed_sec"` - GPUIndex int `json:"index"` - TempC float64 `json:"temp_c"` - UsagePct float64 `json:"usage_pct"` - MemUsagePct float64 `json:"mem_usage_pct"` - PowerW float64 `json:"power_w"` - ClockMHz float64 `json:"clock_mhz"` - MemClockMHz float64 `json:"mem_clock_mhz"` + Stage string `json:"stage,omitempty"` + ElapsedSec float64 `json:"elapsed_sec"` + GPUIndex int `json:"index"` + TempC float64 `json:"temp_c"` + UsagePct float64 `json:"usage_pct"` + MemUsagePct float64 `json:"mem_usage_pct"` + PowerW float64 `json:"power_w"` + ClockMHz float64 `json:"clock_mhz"` + MemClockMHz float64 `json:"mem_clock_mhz"` + FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"` + FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"` + FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"` } // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. @@ -142,10 +145,14 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) { // WriteGPUMetricsCSV writes collected rows as a CSV file. func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { var b bytes.Buffer - b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n") + b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n") for _, r := range rows { - fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n", - strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz) + dutyAvail := 0 + if r.FanDutyCycleAvailable { + dutyAvail = 1 + } + fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n", + strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail) } return os.WriteFile(path, b.Bytes(), 0644) } diff --git a/audit/internal/platform/sat_fan_stress.go b/audit/internal/platform/sat_fan_stress.go index 6ec181a..ab5c3a0 100644 --- a/audit/internal/platform/sat_fan_stress.go +++ b/audit/internal/platform/sat_fan_stress.go @@ -426,6 +426,101 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) { return fans, nil } +// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors. +// Returns the average duty cycle across all exposed PWM controls. +func sampleFanDutyCyclePct() (float64, bool) { + out, err := exec.Command("sensors", "-j").Output() + if err != nil || len(out) == 0 { + return 0, false + } + return parseFanDutyCyclePctSensorsJSON(out) +} + +func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) { + var doc map[string]map[string]any + if err := json.Unmarshal(raw, &doc); err != nil { + return 0, false + } + var samples []float64 + for _, features := range doc { + for name, feature := range features { + if strings.EqualFold(name, "Adapter") { + continue + } + featureMap, ok := feature.(map[string]any) + if !ok { + continue + } + if duty, ok := firstFanDutyValue(name, featureMap); ok { + samples = append(samples, duty) + } + } + } + if len(samples) == 0 { + return 0, false + } + return benchmarkMean(samples), true +} + +func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) { + featureName = strings.ToLower(strings.TrimSpace(featureName)) + if strings.Contains(featureName, "enable") || strings.Contains(featureName, "mode") || strings.Contains(featureName, "alarm") { + return 0, false + } + if strings.Contains(featureName, "pwm") { + for _, key := range []string{"input", "value", "current"} { + if value, ok := feature[key]; ok { + if duty, parsed := parseFanDutyValue(value); parsed { + return duty, true + } + } + } + } + keys := make([]string, 0, len(feature)) + for key := range feature { + keys = append(keys, key) + } + sort.Strings(keys) + for _, key := range keys { + lower := strings.ToLower(key) + if !strings.Contains(lower, "pwm") { + continue + } + if strings.Contains(lower, "enable") || strings.Contains(lower, "mode") || strings.Contains(lower, "alarm") { + continue + } + if duty, parsed := parseFanDutyValue(feature[key]); parsed { + return duty, true + } + } + return 0, false +} + +func parseFanDutyValue(value any) (float64, bool) { + switch v := value.(type) { + case float64: + return normalizePWMAsDutyPct(v) + case string: + if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil { + return normalizePWMAsDutyPct(f) + } + } + return 0, false +} + +func normalizePWMAsDutyPct(raw float64) (float64, bool) { + if raw < 0 { + return 0, false + } + if raw <= 100 { + return raw, true + } + if raw <= 255 { + return raw / 255.0 * 100.0, true + } + return 0, false +} + func firstFanInputValue(feature map[string]any) (float64, bool) { keys := make([]string, 0, len(feature)) for key := range feature { diff --git a/audit/internal/platform/sat_fan_stress_test.go b/audit/internal/platform/sat_fan_stress_test.go index 39de2a6..0439d6f 100644 --- a/audit/internal/platform/sat_fan_stress_test.go +++ b/audit/internal/platform/sat_fan_stress_test.go @@ -29,6 +29,27 @@ func TestFirstFanInputValue(t *testing.T) { } } +func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) { + raw := []byte(`{ + "chip0": { + "fan1": {"input": 9000}, + "pwm1": {"input": 128}, + "pwm1_enable": {"input": 1} + }, + "chip1": { + "pwm2": {"input": 64} + } + }`) + + got, ok := parseFanDutyCyclePctSensorsJSON(raw) + if !ok { + t.Fatalf("expected duty cycle telemetry to be parsed") + } + if got < 57 || got > 58 { + t.Fatalf("got=%v want ~57.1", got) + } +} + func TestParseDCMIPowerReading(t *testing.T) { raw := ` Instantaneous power reading: 512 Watts