diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 0714d77..471b3d7 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -1122,6 +1122,7 @@ type benchmarkCoolingSample struct { AvgFanRPM float64 AvgFanDutyCyclePct float64 FanDutyCycleAvailable bool + FanDutyCycleEstimated bool } func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { @@ -1134,6 +1135,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { samples[i].FanAvgRPM = fanSample.AvgFanRPM samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable + samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated } return samples, nil } @@ -1141,11 +1143,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { func sampleBenchmarkCoolingSample() benchmarkCoolingSample { fans, _ := sampleFanSpeeds() avgRPM, _, _ := fanRPMStats(fans) - dutyPct, dutyAvailable := sampleFanDutyCyclePct() + dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans) return benchmarkCoolingSample{ AvgFanRPM: avgRPM, AvgFanDutyCyclePct: dutyPct, FanDutyCycleAvailable: dutyAvailable, + FanDutyCycleEstimated: dutyEstimated, } } @@ -1387,25 +1390,33 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary { } var rpmValues []float64 var dutyValues []float64 + var dutyEstimated bool for _, row := range rows { if row.FanAvgRPM > 0 { rpmValues = append(rpmValues, row.FanAvgRPM) } if row.FanDutyCycleAvailable { dutyValues = append(dutyValues, row.FanDutyCyclePct) + if row.FanDutyCycleEstimated { + dutyEstimated = true + } } } if len(rpmValues) == 0 && len(dutyValues) == 0 { return nil } summary := &BenchmarkCoolingSummary{ - Available: true, - AvgFanRPM: benchmarkMean(rpmValues), + Available: true, + AvgFanRPM: benchmarkMean(rpmValues), + FanDutyCycleEstimated: dutyEstimated, } if 
len(dutyValues) > 0 { summary.FanDutyCycleAvailable = true summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues) summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95) + if summary.FanDutyCycleEstimated { + summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading") + } } else { summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected") } diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 8a366a9..c78d9d7 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct { Available bool `json:"available"` AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"` FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"` + FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"` AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"` P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"` Notes []string `json:"notes,omitempty"` @@ -55,32 +56,32 @@ type NvidiaBenchmarkOptions struct { } type NvidiaBenchmarkResult struct { - BenchmarkVersion string `json:"benchmark_version"` - GeneratedAt time.Time `json:"generated_at"` - Hostname string `json:"hostname,omitempty"` - ServerModel string `json:"server_model,omitempty"` - BenchmarkProfile string `json:"benchmark_profile"` - ParallelGPUs bool `json:"parallel_gpus,omitempty"` - RampStep int `json:"ramp_step,omitempty"` - RampTotal int `json:"ramp_total,omitempty"` - RampRunID string `json:"ramp_run_id,omitempty"` - ScalabilityScore float64 `json:"scalability_score,omitempty"` + BenchmarkVersion string `json:"benchmark_version"` + GeneratedAt time.Time `json:"generated_at"` + Hostname string `json:"hostname,omitempty"` + ServerModel string 
`json:"server_model,omitempty"` + BenchmarkProfile string `json:"benchmark_profile"` + ParallelGPUs bool `json:"parallel_gpus,omitempty"` + RampStep int `json:"ramp_step,omitempty"` + RampTotal int `json:"ramp_total,omitempty"` + RampRunID string `json:"ramp_run_id,omitempty"` + ScalabilityScore float64 `json:"scalability_score,omitempty"` // PlatformPowerScore is the mean compute scalability across ramp steps 2..N. // 100% = each added GPU contributes exactly its single-card throughput. // < 100% = throughput loss due to thermal throttle, power limits, or contention. - PlatformPowerScore float64 `json:"platform_power_score,omitempty"` - PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"` - OverallStatus string `json:"overall_status"` - SelectedGPUIndices []int `json:"selected_gpu_indices"` - Findings []string `json:"findings,omitempty"` - Warnings []string `json:"warnings,omitempty"` - Normalization BenchmarkNormalization `json:"normalization"` - HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"` - CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"` - Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"` - GPUs []BenchmarkGPUResult `json:"gpus"` - Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"` - ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` + PlatformPowerScore float64 `json:"platform_power_score,omitempty"` + PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"` + OverallStatus string `json:"overall_status"` + SelectedGPUIndices []int `json:"selected_gpu_indices"` + Findings []string `json:"findings,omitempty"` + Warnings []string `json:"warnings,omitempty"` + Normalization BenchmarkNormalization `json:"normalization"` + HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"` + CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"` + Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"` + GPUs 
[]BenchmarkGPUResult `json:"gpus"` + Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"` + ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` } type BenchmarkNormalization struct { @@ -223,8 +224,8 @@ type BenchmarkScorecard struct { // Throttle breakdown — percentage of steady-state time in each throttle type. // Used for diagnosis: tells WHY the GPU throttled, not just whether it did. - ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown - PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap + ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown + PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"` // Temperature headroom: distance to the 100°C destruction threshold. @@ -300,22 +301,22 @@ type NvidiaPowerBenchResult struct { // PlatformMaxTDPW is the sum of per-GPU stable power limits found during the // cumulative thermal ramp. Represents the actual sustained power budget of // this server under full GPU load. Use for rack power planning. - PlatformMaxTDPW float64 `json:"platform_max_tdp_w"` + PlatformMaxTDPW float64 `json:"platform_max_tdp_w"` // ServerPower captures IPMI server power delta (idle→loaded) measured in // parallel with the thermal ramp. Use to compare GPU-reported TDP against // actual wall-power draw as seen by the server's power supply. 
- ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` - Findings []string `json:"findings,omitempty"` - GPUs []NvidiaPowerBenchGPU `json:"gpus"` + ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` + Findings []string `json:"findings,omitempty"` + GPUs []NvidiaPowerBenchGPU `json:"gpus"` } type NvidiaPowerBenchGPU struct { - Index int `json:"index"` - Name string `json:"name,omitempty"` - BusID string `json:"bus_id,omitempty"` - DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` + Index int `json:"index"` + Name string `json:"name,omitempty"` + BusID string `json:"bus_id,omitempty"` + DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` // AppliedPowerLimitW is the stable limit found during single-card calibration. - AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"` + AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"` // StablePowerLimitW is the final fixed limit for this GPU after the // cumulative thermal ramp. This is the limit at which the GPU operated // stably with all other GPUs running simultaneously at their own limits. @@ -333,10 +334,10 @@ type NvidiaPowerBenchGPU struct { } type NvidiaPowerBenchStep struct { - StepIndex int `json:"step_index"` - GPUIndices []int `json:"gpu_indices"` + StepIndex int `json:"step_index"` + GPUIndices []int `json:"gpu_indices"` // NewGPUIndex is the GPU whose stable limit was searched in this step. - NewGPUIndex int `json:"new_gpu_index"` + NewGPUIndex int `json:"new_gpu_index"` // NewGPUStableLimitW is the stable power limit found for the new GPU. NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"` TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"` @@ -349,15 +350,15 @@ type NvidiaPowerBenchStep struct { // NvidiaPerformanceRampStep holds per-step performance data for the // scalability ramp-up phase of the performance benchmark. 
// NvidiaPerformanceRampStep holds per-step performance data for the
// scalability ramp-up phase of the performance benchmark.
type NvidiaPerformanceRampStep struct {
	// StepIndex identifies this step within the ramp sequence.
	StepIndex int `json:"step_index"`
	// GPUIndices lists the GPUs that participated in this step.
	GPUIndices []int `json:"gpu_indices"`
	// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
	// TOPS from dedicated single-precision phases) across all GPUs in this step.
	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
	// TotalMixedTOPS — presumably the mixed-precision analogue of
	// TotalSyntheticTOPS; TODO confirm against the benchmark runner.
	TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
	// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
	ScalabilityPct float64 `json:"scalability_pct"`
	// Status summarizes the step outcome (semantics defined by the producer).
	Status string `json:"status"`
	// Notes carries free-form observations for this step, if any.
	Notes []string `json:"notes,omitempty"`
}
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { var b bytes.Buffer - b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n") + b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n") for _, r := range rows { dutyAvail := 0 if r.FanDutyCycleAvailable { dutyAvail = 1 } - fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n", - strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail) + dutyEstimated := 0 + if r.FanDutyCycleEstimated { + dutyEstimated = 1 + } + fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n", + strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated) } return os.WriteFile(path, b.Bytes(), 0644) } diff --git a/audit/internal/platform/sat_fan_stress.go b/audit/internal/platform/sat_fan_stress.go index ab5c3a0..28c430a 100644 --- a/audit/internal/platform/sat_fan_stress.go +++ b/audit/internal/platform/sat_fan_stress.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "math" "os" "os/exec" "path/filepath" @@ -56,13 +57,37 @@ type cachedPowerReading struct { UpdatedAt time.Time } +type fanObservationState struct { + MaxRPM map[string]float64 `json:"max_rpm"` +} + +type fanPeakCandidate struct { + FirstSeen time.Time + RPM float64 +} + var ( systemPowerCacheMu sync.Mutex systemPowerCache cachedPowerReading + fanObservationMu sync.Mutex + fanObservation fanObservationState + fanObservationInit bool + fanPeakCandidates = make(map[string]fanPeakCandidate) ) const 
// normalizeObservedFanMaxRPM rounds a positive observed fan RPM up to the
// next multiple of 1000, giving a stable, conservative full-speed estimate
// instead of a noisy instantaneous peak. Non-positive readings map to 0.
func normalizeObservedFanMaxRPM(rpm float64) float64 {
	const step = 1000.0
	if rpm <= 0 {
		return 0
	}
	return step * math.Ceil(rpm/step)
}
{ + dir = "/var/log/bee-sat" + } + if err := os.MkdirAll(dir, 0755); err != nil { + return + } + raw, err := json.MarshalIndent(fanObservation, "", " ") + if err != nil { + return + } + _ = os.WriteFile(fanObservationStatePath, raw, 0644) +} + +func updateFanObservation(fans []FanReading, now time.Time) { + if len(fans) == 0 { + return + } + fanObservationMu.Lock() + defer fanObservationMu.Unlock() + loadFanObservationLocked() + changed := false + for _, fan := range fans { + name := strings.TrimSpace(fan.Name) + if name == "" || fan.RPM <= 0 { + continue + } + currentMax := fanObservation.MaxRPM[name] + if fan.RPM <= currentMax { + delete(fanPeakCandidates, name) + continue + } + if cand, ok := fanPeakCandidates[name]; ok { + if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold { + newMax := math.Max(cand.RPM, fan.RPM) + if newMax > currentMax { + fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax) + changed = true + } + delete(fanPeakCandidates, name) + continue + } + if fan.RPM > cand.RPM { + fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM} + } + continue + } + fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM} + } + if changed { + saveFanObservationLocked() + } +} + +func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) { + if len(fans) == 0 { + return 0, false + } + fanObservationMu.Lock() + defer fanObservationMu.Unlock() + loadFanObservationLocked() + var samples []float64 + for _, fan := range fans { + name := strings.TrimSpace(fan.Name) + if name == "" || fan.RPM <= 0 { + continue + } + maxRPM := fanObservation.MaxRPM[name] + if maxRPM <= 0 { + continue + } + pct := fan.RPM / maxRPM * 100.0 + if pct > 100 { + pct = 100 + } + if pct < 0 { + pct = 0 + } + samples = append(samples, pct) + } + if len(samples) == 0 { + return 0, false + } + return benchmarkMean(samples), true +} + // parseFanSpeeds parses "ipmitool sdr type Fan" output. 
// Handles two formats: // @@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) { // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors. // Returns the average duty cycle across all exposed PWM controls. -func sampleFanDutyCyclePct() (float64, bool) { +func sampleFanDutyCyclePct() (float64, bool, bool) { out, err := exec.Command("sensors", "-j").Output() if err != nil || len(out) == 0 { - return 0, false + fans, fanErr := sampleFanSpeeds() + if fanErr != nil { + return 0, false, false + } + return sampleFanDutyCyclePctFromFans(fans) } - return parseFanDutyCyclePctSensorsJSON(out) + pct, ok := parseFanDutyCyclePctSensorsJSON(out) + return pct, ok, false +} + +func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) { + if len(fans) == 0 { + return 0, false, false + } + if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok { + return pct, true, true + } + return 0, false, false } func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) { diff --git a/audit/internal/platform/sat_fan_stress_test.go b/audit/internal/platform/sat_fan_stress_test.go index 0439d6f..7b248b4 100644 --- a/audit/internal/platform/sat_fan_stress_test.go +++ b/audit/internal/platform/sat_fan_stress_test.go @@ -1,6 +1,7 @@ package platform import ( + "path/filepath" "testing" "time" ) @@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) { } } +func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) { + t.Parallel() + + oldPath := fanObservationStatePath + oldState := fanObservation + oldInit := fanObservationInit + oldCandidates := fanPeakCandidates + fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json") + fanObservation = fanObservationState{} + fanObservationInit = false + fanPeakCandidates = make(map[string]fanPeakCandidate) + t.Cleanup(func() { + fanObservationStatePath = oldPath + fanObservation = oldState + fanObservationInit = oldInit + fanPeakCandidates = 
oldCandidates + }) + + start := time.Unix(100, 0) + updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start) + if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok { + t.Fatalf("single-sample spike should not establish observed max") + } + + updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond)) + updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond)) + + got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}}) + if !ok { + t.Fatalf("expected estimated duty cycle from persisted observed max") + } + if got < 43 || got > 44 { + t.Fatalf("got=%v want ~43.3", got) + } + + fanObservation = fanObservationState{} + fanObservationInit = false + fanPeakCandidates = make(map[string]fanPeakCandidate) + got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}}) + if !ok { + t.Fatalf("expected persisted observed max to be reloaded from disk") + } + if got < 43 || got > 44 { + t.Fatalf("reloaded got=%v want ~43.3", got) + } +} + func TestParseDCMIPowerReading(t *testing.T) { raw := ` Instantaneous power reading: 512 Watts