Add benchmark fan duty cycle summary to report
This commit is contained in:
@@ -401,6 +401,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
|
||||
}
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
|
||||
result.Cooling = summarizeBenchmarkCooling(metricRows)
|
||||
|
||||
// Apply server-power penalty when IPMI reports the server delta is much
|
||||
// lower than GPU-reported sum: GPU power telemetry is over-stated, making
|
||||
@@ -739,7 +740,7 @@ func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []
|
||||
if ctx.Err() != nil {
|
||||
return rows, ctx.Err()
|
||||
}
|
||||
samples, err := sampleGPUMetrics(gpuIndices)
|
||||
samples, err := sampleBenchmarkTelemetry(gpuIndices)
|
||||
if err == nil {
|
||||
elapsed := time.Since(start).Seconds()
|
||||
for i := range samples {
|
||||
@@ -774,7 +775,7 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
samples, err := sampleGPUMetrics(gpuIndices)
|
||||
samples, err := sampleBenchmarkTelemetry(gpuIndices)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
@@ -794,6 +795,37 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
|
||||
return out, metricRows, err
|
||||
}
|
||||
|
||||
// benchmarkCoolingSample is one point-in-time snapshot of host fan telemetry
// captured alongside GPU metric sampling during a benchmark run.
type benchmarkCoolingSample struct {
	// AvgFanRPM is the mean fan speed in RPM across sampled fans.
	AvgFanRPM float64
	// AvgFanDutyCyclePct is the mean fan duty cycle in percent; only
	// meaningful when FanDutyCycleAvailable is true.
	AvgFanDutyCyclePct float64
	// FanDutyCycleAvailable reports whether duty-cycle telemetry could be
	// read on this host (some platforms expose RPM only).
	FanDutyCycleAvailable bool
}
|
||||
|
||||
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
samples, err := sampleGPUMetrics(gpuIndices)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
fanSample := sampleBenchmarkCoolingSample()
|
||||
for i := range samples {
|
||||
samples[i].FanAvgRPM = fanSample.AvgFanRPM
|
||||
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
|
||||
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
|
||||
}
|
||||
return samples, nil
|
||||
}
|
||||
|
||||
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
|
||||
fans, _ := sampleFanSpeeds()
|
||||
avgRPM, _, _ := fanRPMStats(fans)
|
||||
dutyPct, dutyAvailable := sampleFanDutyCyclePct()
|
||||
return benchmarkCoolingSample{
|
||||
AvgFanRPM: avgRPM,
|
||||
AvgFanDutyCyclePct: dutyPct,
|
||||
FanDutyCycleAvailable: dutyAvailable,
|
||||
}
|
||||
}
|
||||
|
||||
func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow {
|
||||
if len(rows) == 0 {
|
||||
return nil
|
||||
@@ -1022,6 +1054,37 @@ func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary
|
||||
return summary
|
||||
}
|
||||
|
||||
func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
|
||||
if len(rows) == 0 {
|
||||
return nil
|
||||
}
|
||||
var rpmValues []float64
|
||||
var dutyValues []float64
|
||||
for _, row := range rows {
|
||||
if row.FanAvgRPM > 0 {
|
||||
rpmValues = append(rpmValues, row.FanAvgRPM)
|
||||
}
|
||||
if row.FanDutyCycleAvailable {
|
||||
dutyValues = append(dutyValues, row.FanDutyCyclePct)
|
||||
}
|
||||
}
|
||||
if len(rpmValues) == 0 && len(dutyValues) == 0 {
|
||||
return nil
|
||||
}
|
||||
summary := &BenchmarkCoolingSummary{
|
||||
Available: true,
|
||||
AvgFanRPM: benchmarkMean(rpmValues),
|
||||
}
|
||||
if len(dutyValues) > 0 {
|
||||
summary.FanDutyCycleAvailable = true
|
||||
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
|
||||
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
|
||||
} else {
|
||||
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
|
||||
}
|
||||
return summary
|
||||
}
|
||||
|
||||
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
||||
score := BenchmarkScorecard{}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user