Compare commits

..

1 Commits
v8.18 ... v8.21

Author SHA1 Message Date
Mikhail Chusavitin
18e24a9aa5 Estimate fan duty from observed RPM maxima 2026-04-16 10:10:18 +03:00
5 changed files with 273 additions and 53 deletions

View File

@@ -1122,6 +1122,7 @@ type benchmarkCoolingSample struct {
AvgFanRPM float64 AvgFanRPM float64
AvgFanDutyCyclePct float64 AvgFanDutyCyclePct float64
FanDutyCycleAvailable bool FanDutyCycleAvailable bool
FanDutyCycleEstimated bool
} }
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
@@ -1134,6 +1135,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
samples[i].FanAvgRPM = fanSample.AvgFanRPM samples[i].FanAvgRPM = fanSample.AvgFanRPM
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
} }
return samples, nil return samples, nil
} }
@@ -1141,11 +1143,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
func sampleBenchmarkCoolingSample() benchmarkCoolingSample { func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
fans, _ := sampleFanSpeeds() fans, _ := sampleFanSpeeds()
avgRPM, _, _ := fanRPMStats(fans) avgRPM, _, _ := fanRPMStats(fans)
dutyPct, dutyAvailable := sampleFanDutyCyclePct() dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
return benchmarkCoolingSample{ return benchmarkCoolingSample{
AvgFanRPM: avgRPM, AvgFanRPM: avgRPM,
AvgFanDutyCyclePct: dutyPct, AvgFanDutyCyclePct: dutyPct,
FanDutyCycleAvailable: dutyAvailable, FanDutyCycleAvailable: dutyAvailable,
FanDutyCycleEstimated: dutyEstimated,
} }
} }
@@ -1387,25 +1390,33 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
} }
var rpmValues []float64 var rpmValues []float64
var dutyValues []float64 var dutyValues []float64
var dutyEstimated bool
for _, row := range rows { for _, row := range rows {
if row.FanAvgRPM > 0 { if row.FanAvgRPM > 0 {
rpmValues = append(rpmValues, row.FanAvgRPM) rpmValues = append(rpmValues, row.FanAvgRPM)
} }
if row.FanDutyCycleAvailable { if row.FanDutyCycleAvailable {
dutyValues = append(dutyValues, row.FanDutyCyclePct) dutyValues = append(dutyValues, row.FanDutyCyclePct)
if row.FanDutyCycleEstimated {
dutyEstimated = true
}
} }
} }
if len(rpmValues) == 0 && len(dutyValues) == 0 { if len(rpmValues) == 0 && len(dutyValues) == 0 {
return nil return nil
} }
summary := &BenchmarkCoolingSummary{ summary := &BenchmarkCoolingSummary{
Available: true, Available: true,
AvgFanRPM: benchmarkMean(rpmValues), AvgFanRPM: benchmarkMean(rpmValues),
FanDutyCycleEstimated: dutyEstimated,
} }
if len(dutyValues) > 0 { if len(dutyValues) > 0 {
summary.FanDutyCycleAvailable = true summary.FanDutyCycleAvailable = true
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues) summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95) summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
if summary.FanDutyCycleEstimated {
summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
}
} else { } else {
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected") summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
} }

View File

@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
Available bool `json:"available"` Available bool `json:"available"`
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"` AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"` FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"` AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"` P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
@@ -55,32 +56,32 @@ type NvidiaBenchmarkOptions struct {
} }
type NvidiaBenchmarkResult struct { type NvidiaBenchmarkResult struct {
BenchmarkVersion string `json:"benchmark_version"` BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"` GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"` Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"` ServerModel string `json:"server_model,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"` BenchmarkProfile string `json:"benchmark_profile"`
ParallelGPUs bool `json:"parallel_gpus,omitempty"` ParallelGPUs bool `json:"parallel_gpus,omitempty"`
RampStep int `json:"ramp_step,omitempty"` RampStep int `json:"ramp_step,omitempty"`
RampTotal int `json:"ramp_total,omitempty"` RampTotal int `json:"ramp_total,omitempty"`
RampRunID string `json:"ramp_run_id,omitempty"` RampRunID string `json:"ramp_run_id,omitempty"`
ScalabilityScore float64 `json:"scalability_score,omitempty"` ScalabilityScore float64 `json:"scalability_score,omitempty"`
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N. // PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
// 100% = each added GPU contributes exactly its single-card throughput. // 100% = each added GPU contributes exactly its single-card throughput.
// < 100% = throughput loss due to thermal throttle, power limits, or contention. // < 100% = throughput loss due to thermal throttle, power limits, or contention.
PlatformPowerScore float64 `json:"platform_power_score,omitempty"` PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"` PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
OverallStatus string `json:"overall_status"` OverallStatus string `json:"overall_status"`
SelectedGPUIndices []int `json:"selected_gpu_indices"` SelectedGPUIndices []int `json:"selected_gpu_indices"`
Findings []string `json:"findings,omitempty"` Findings []string `json:"findings,omitempty"`
Warnings []string `json:"warnings,omitempty"` Warnings []string `json:"warnings,omitempty"`
Normalization BenchmarkNormalization `json:"normalization"` Normalization BenchmarkNormalization `json:"normalization"`
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"` HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"` CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"` Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
GPUs []BenchmarkGPUResult `json:"gpus"` GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"` Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
} }
type BenchmarkNormalization struct { type BenchmarkNormalization struct {
@@ -223,8 +224,8 @@ type BenchmarkScorecard struct {
// Throttle breakdown — percentage of steady-state time in each throttle type. // Throttle breakdown — percentage of steady-state time in each throttle type.
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did. // Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"` SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
// Temperature headroom: distance to the 100°C destruction threshold. // Temperature headroom: distance to the 100°C destruction threshold.
@@ -300,22 +301,22 @@ type NvidiaPowerBenchResult struct {
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the // PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
// cumulative thermal ramp. Represents the actual sustained power budget of // cumulative thermal ramp. Represents the actual sustained power budget of
// this server under full GPU load. Use for rack power planning. // this server under full GPU load. Use for rack power planning.
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"` PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
// ServerPower captures IPMI server power delta (idle→loaded) measured in // ServerPower captures IPMI server power delta (idle→loaded) measured in
// parallel with the thermal ramp. Use to compare GPU-reported TDP against // parallel with the thermal ramp. Use to compare GPU-reported TDP against
// actual wall-power draw as seen by the server's power supply. // actual wall-power draw as seen by the server's power supply.
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
Findings []string `json:"findings,omitempty"` Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"` GPUs []NvidiaPowerBenchGPU `json:"gpus"`
} }
type NvidiaPowerBenchGPU struct { type NvidiaPowerBenchGPU struct {
Index int `json:"index"` Index int `json:"index"`
Name string `json:"name,omitempty"` Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"` BusID string `json:"bus_id,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// AppliedPowerLimitW is the stable limit found during single-card calibration. // AppliedPowerLimitW is the stable limit found during single-card calibration.
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"` AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
// StablePowerLimitW is the final fixed limit for this GPU after the // StablePowerLimitW is the final fixed limit for this GPU after the
// cumulative thermal ramp. This is the limit at which the GPU operated // cumulative thermal ramp. This is the limit at which the GPU operated
// stably with all other GPUs running simultaneously at their own limits. // stably with all other GPUs running simultaneously at their own limits.
@@ -333,10 +334,10 @@ type NvidiaPowerBenchGPU struct {
} }
type NvidiaPowerBenchStep struct { type NvidiaPowerBenchStep struct {
StepIndex int `json:"step_index"` StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"` GPUIndices []int `json:"gpu_indices"`
// NewGPUIndex is the GPU whose stable limit was searched in this step. // NewGPUIndex is the GPU whose stable limit was searched in this step.
NewGPUIndex int `json:"new_gpu_index"` NewGPUIndex int `json:"new_gpu_index"`
// NewGPUStableLimitW is the stable power limit found for the new GPU. // NewGPUStableLimitW is the stable power limit found for the new GPU.
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"` NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"` TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
@@ -349,15 +350,15 @@ type NvidiaPowerBenchStep struct {
// NvidiaPerformanceRampStep holds per-step performance data for the // NvidiaPerformanceRampStep holds per-step performance data for the
// scalability ramp-up phase of the performance benchmark. // scalability ramp-up phase of the performance benchmark.
type NvidiaPerformanceRampStep struct { type NvidiaPerformanceRampStep struct {
StepIndex int `json:"step_index"` StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"` GPUIndices []int `json:"gpu_indices"`
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent // TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
// TOPS from dedicated single-precision phases) across all GPUs in this step. // TOPS from dedicated single-precision phases) across all GPUs in this step.
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"` TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"` TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100. // ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss. // 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
ScalabilityPct float64 `json:"scalability_pct"` ScalabilityPct float64 `json:"scalability_pct"`
Status string `json:"status"` Status string `json:"status"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
} }

View File

@@ -27,6 +27,7 @@ type GPUMetricRow struct {
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"` FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"` FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"` FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
} }
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file. // WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer var b bytes.Buffer
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n") b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
for _, r := range rows { for _, r := range rows {
dutyAvail := 0 dutyAvail := 0
if r.FanDutyCycleAvailable { if r.FanDutyCycleAvailable {
dutyAvail = 1 dutyAvail = 1
} }
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n", dutyEstimated := 0
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail) if r.FanDutyCycleEstimated {
dutyEstimated = 1
}
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
} }
return os.WriteFile(path, b.Bytes(), 0644) return os.WriteFile(path, b.Bytes(), 0644)
} }

View File

@@ -4,6 +4,7 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"math"
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
UpdatedAt time.Time UpdatedAt time.Time
} }
type fanObservationState struct {
MaxRPM map[string]float64 `json:"max_rpm"`
}
type fanPeakCandidate struct {
FirstSeen time.Time
RPM float64
}
var ( var (
systemPowerCacheMu sync.Mutex systemPowerCacheMu sync.Mutex
systemPowerCache cachedPowerReading systemPowerCache cachedPowerReading
fanObservationMu sync.Mutex
fanObservation fanObservationState
fanObservationInit bool
fanPeakCandidates = make(map[string]fanPeakCandidate)
) )
const systemPowerHoldTTL = 15 * time.Second const systemPowerHoldTTL = 15 * time.Second
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
const fanObservationMinPeakHold = time.Second
func normalizeObservedFanMaxRPM(rpm float64) float64 {
if rpm <= 0 {
return 0
}
return math.Ceil(rpm/1000.0) * 1000.0
}
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds, // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv. // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling. // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output() out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
if err == nil { if err == nil {
if fans := parseFanSpeeds(string(out)); len(fans) > 0 { if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil return fans, nil
} }
} }
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON() fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
if len(fans) > 0 { if len(fans) > 0 {
updateFanObservation(fans, time.Now())
return fans, nil return fans, nil
} }
if err != nil { if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
return nil, sensorsErr return nil, sensorsErr
} }
func loadFanObservationLocked() {
if fanObservationInit {
return
}
fanObservationInit = true
fanObservation.MaxRPM = make(map[string]float64)
raw, err := os.ReadFile(fanObservationStatePath)
if err != nil || len(raw) == 0 {
return
}
var persisted fanObservationState
if json.Unmarshal(raw, &persisted) != nil {
return
}
for name, rpm := range persisted.MaxRPM {
name = strings.TrimSpace(name)
if name == "" || rpm <= 0 {
continue
}
fanObservation.MaxRPM[name] = rpm
}
}
func saveFanObservationLocked() {
if len(fanObservation.MaxRPM) == 0 {
return
}
dir := filepath.Dir(fanObservationStatePath)
if dir == "" || dir == "." {
dir = "/var/log/bee-sat"
}
if err := os.MkdirAll(dir, 0755); err != nil {
return
}
raw, err := json.MarshalIndent(fanObservation, "", " ")
if err != nil {
return
}
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
}
func updateFanObservation(fans []FanReading, now time.Time) {
if len(fans) == 0 {
return
}
fanObservationMu.Lock()
defer fanObservationMu.Unlock()
loadFanObservationLocked()
changed := false
for _, fan := range fans {
name := strings.TrimSpace(fan.Name)
if name == "" || fan.RPM <= 0 {
continue
}
currentMax := fanObservation.MaxRPM[name]
if fan.RPM <= currentMax {
delete(fanPeakCandidates, name)
continue
}
if cand, ok := fanPeakCandidates[name]; ok {
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
newMax := math.Max(cand.RPM, fan.RPM)
if newMax > currentMax {
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
changed = true
}
delete(fanPeakCandidates, name)
continue
}
if fan.RPM > cand.RPM {
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
}
continue
}
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
}
if changed {
saveFanObservationLocked()
}
}
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
if len(fans) == 0 {
return 0, false
}
fanObservationMu.Lock()
defer fanObservationMu.Unlock()
loadFanObservationLocked()
var samples []float64
for _, fan := range fans {
name := strings.TrimSpace(fan.Name)
if name == "" || fan.RPM <= 0 {
continue
}
maxRPM := fanObservation.MaxRPM[name]
if maxRPM <= 0 {
continue
}
pct := fan.RPM / maxRPM * 100.0
if pct > 100 {
pct = 100
}
if pct < 0 {
pct = 0
}
samples = append(samples, pct)
}
if len(samples) == 0 {
return 0, false
}
return benchmarkMean(samples), true
}
// parseFanSpeeds parses "ipmitool sdr type Fan" output. // parseFanSpeeds parses "ipmitool sdr type Fan" output.
// Handles two formats: // Handles two formats:
// //
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors. // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
// Returns the average duty cycle across all exposed PWM controls. // Returns the average duty cycle across all exposed PWM controls.
func sampleFanDutyCyclePct() (float64, bool) { func sampleFanDutyCyclePct() (float64, bool, bool) {
out, err := exec.Command("sensors", "-j").Output() out, err := exec.Command("sensors", "-j").Output()
if err != nil || len(out) == 0 { if err != nil || len(out) == 0 {
return 0, false fans, fanErr := sampleFanSpeeds()
if fanErr != nil {
return 0, false, false
}
return sampleFanDutyCyclePctFromFans(fans)
} }
return parseFanDutyCyclePctSensorsJSON(out) pct, ok := parseFanDutyCyclePctSensorsJSON(out)
return pct, ok, false
}
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
if len(fans) == 0 {
return 0, false, false
}
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
return pct, true, true
}
return 0, false, false
} }
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) { func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {

View File

@@ -1,6 +1,7 @@
package platform package platform
import ( import (
"path/filepath"
"testing" "testing"
"time" "time"
) )
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
} }
} }
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
t.Parallel()
oldPath := fanObservationStatePath
oldState := fanObservation
oldInit := fanObservationInit
oldCandidates := fanPeakCandidates
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
fanObservation = fanObservationState{}
fanObservationInit = false
fanPeakCandidates = make(map[string]fanPeakCandidate)
t.Cleanup(func() {
fanObservationStatePath = oldPath
fanObservation = oldState
fanObservationInit = oldInit
fanPeakCandidates = oldCandidates
})
start := time.Unix(100, 0)
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
t.Fatalf("single-sample spike should not establish observed max")
}
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
if !ok {
t.Fatalf("expected estimated duty cycle from persisted observed max")
}
if got < 43 || got > 44 {
t.Fatalf("got=%v want ~43.3", got)
}
fanObservation = fanObservationState{}
fanObservationInit = false
fanPeakCandidates = make(map[string]fanPeakCandidate)
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
if !ok {
t.Fatalf("expected persisted observed max to be reloaded from disk")
}
if got < 43 || got > 44 {
t.Fatalf("reloaded got=%v want ~43.3", got)
}
}
func TestParseDCMIPowerReading(t *testing.T) { func TestParseDCMIPowerReading(t *testing.T) {
raw := ` raw := `
Instantaneous power reading: 512 Watts Instantaneous power reading: 512 Watts