Fix DCGM cleanup and shorten memory validate
This commit is contained in:
@@ -1399,44 +1399,40 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
||||
case score.MixedScore > 0:
|
||||
score.ComputeScore = score.MixedScore
|
||||
}
|
||||
// PowerSustainScore: measures how close the GPU came to its rated TDP during
|
||||
// steady-state benchmark load. 100 = exactly at rated TDP.
|
||||
// Penalty applied symmetrically for both under- and over-TDP deviations:
|
||||
// score = max(0, 100 − |measured − rated| / rated × 100)
|
||||
// Under-TDP → power delivery / cooling issue.
|
||||
// Over-TDP → power limit not properly enforced / power regulation fault.
|
||||
// Uses CalibratedPeakPowerW when available (from external power calibration),
|
||||
// otherwise falls back to Steady.AvgPowerW observed during the benchmark.
|
||||
{
|
||||
ref := gpu.DefaultPowerLimitW
|
||||
if ref <= 0 {
|
||||
ref = gpu.PowerLimitW
|
||||
}
|
||||
measured := gpu.CalibratedPeakPowerW
|
||||
if measured <= 0 {
|
||||
measured = gpu.Steady.AvgPowerW
|
||||
}
|
||||
if measured > 0 && ref > 0 {
|
||||
deviationPct := math.Abs(measured-ref) / ref * 100
|
||||
score.PowerSustainScore = clampScore(100 - deviationPct)
|
||||
}
|
||||
}
|
||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||
thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
|
||||
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
|
||||
// StabilityScore: prefer per-precision steady phases where each window runs a
|
||||
// single kernel type so PowerCVPct is a genuine stability signal (not a
|
||||
// workload-mix artifact). Fall back to combined steady using clock-only metrics
|
||||
// when per-precision data is absent (older results, short profiles).
|
||||
// PowerSustainScore: how stable is GPU power draw during the benchmark?
|
||||
// High variance means the workload is bursting or the power delivery is
|
||||
// unstable. Score = max(0, 100 − PowerCVPct × 3).
|
||||
// At 10% CV → score 70; at 33%+ CV → score 0.
|
||||
// Uses per-precision windows when available (each runs a single kernel,
|
||||
// so CV reflects genuine power regulation, not workload switching).
|
||||
if len(gpu.PrecisionSteady) > 0 {
|
||||
var sum float64
|
||||
for _, p := range gpu.PrecisionSteady {
|
||||
sum += clampScore(100 - (p.Steady.ClockCVPct*4 + p.Steady.PowerCVPct*2 + p.Steady.ClockDriftPct*2))
|
||||
sum += clampScore(100 - p.Steady.PowerCVPct*3)
|
||||
}
|
||||
score.StabilityScore = sum / float64(len(gpu.PrecisionSteady))
|
||||
} else {
|
||||
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.ClockDriftPct*2))
|
||||
score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady))
|
||||
} else if gpu.Steady.PowerCVPct > 0 {
|
||||
score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3)
|
||||
}
|
||||
|
||||
// ThermalSustainScore: how stable is GPU temperature during the benchmark?
|
||||
// High variance means cooling is inconsistent (fan bursts, liquid flow
|
||||
// instability, or frequent transitions in and out of throttle).
|
||||
// Score = max(0, 100 − TempCVPct × 3).
|
||||
if gpu.Steady.TempCVPct > 0 {
|
||||
score.ThermalSustainScore = clampScore(100 - gpu.Steady.TempCVPct*3)
|
||||
} else {
|
||||
// TempCV not recorded — fall back to 100 (no penalty).
|
||||
score.ThermalSustainScore = 100
|
||||
}
|
||||
|
||||
// StabilityScore: what fraction of the benchmark did the GPU spend throttling?
|
||||
// Counts both thermal (HW+SW) and power-cap throttle events.
|
||||
// Score = max(0, 100 − throttle_ratio × 100).
|
||||
// 1% throttle → score 99; 10% throttle → score 90; 100% → score 0.
|
||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||
throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS)
|
||||
score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100)
|
||||
score.CompositeScore = compositeBenchmarkScore(score)
|
||||
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
|
||||
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
|
||||
@@ -1445,20 +1441,18 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
||||
}
|
||||
|
||||
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
||||
// Weights after introducing calibrated power reference:
|
||||
// base 0.35 — floor so a GPU that fails all sustain checks still scores
|
||||
// thermal 0.25 — heaviest: throttle counters are the most reliable signal
|
||||
// stability 0.25 — clock/power variance matters for reproducibility
|
||||
// power 0.15 — GPU reaches rated TDP under targeted_power? lower weight
|
||||
// because calibration may be absent (dcgmi not installed)
|
||||
// NCCL bonus 0.10 — interconnect health
|
||||
// cap 1.10
|
||||
quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
|
||||
if score.InterconnectScore > 0 {
|
||||
quality += 0.10
|
||||
}
|
||||
if quality > 1.10 {
|
||||
quality = 1.10
|
||||
// quality_factor weights:
|
||||
// base 0.35 — floor so a GPU that fails all sustain checks still scores
|
||||
// StabilityScore 0.35 — throttle time: heaviest, direct signal of GPU not keeping up
|
||||
// PowerSustainScore 0.15 — power variance: unstable draw hints at regulation issues
|
||||
// ThermalSustainScore 0.15 — temp variance: unstable cooling hints at airflow issues
|
||||
// cap 1.00
|
||||
quality := 0.35 +
|
||||
0.35*(score.StabilityScore/100.0) +
|
||||
0.15*(score.PowerSustainScore/100.0) +
|
||||
0.15*(score.ThermalSustainScore/100.0)
|
||||
if quality > 1.00 {
|
||||
quality = 1.00
|
||||
}
|
||||
return score.ComputeScore * quality
|
||||
}
|
||||
@@ -2547,6 +2541,11 @@ func runBenchmarkPowerCalibration(
|
||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil
|
||||
}
|
||||
if killed := KillTestWorkers(); len(killed) > 0 {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("power calibration pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
|
||||
canDerate := os.Geteuid() == 0
|
||||
if !canDerate {
|
||||
|
||||
@@ -426,6 +426,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -443,6 +450,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -460,6 +474,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -552,10 +573,16 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
|
||||
if passes <= 0 {
|
||||
passes = 1
|
||||
}
|
||||
// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
|
||||
// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
|
||||
// controller can cause memtester to spin forever on a single subtest.
|
||||
timeoutSec := sizeMB*passes*150/100 + 120
|
||||
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||||
// intentionally conservative enough for healthy systems while avoiding the
|
||||
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||||
timeoutSec := sizeMB*passes*20/100 + 60
|
||||
if timeoutSec < 180 {
|
||||
timeoutSec = 180
|
||||
}
|
||||
if timeoutSec > 900 {
|
||||
timeoutSec = 900
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
|
||||
Reference in New Issue
Block a user