diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 055178e..12aeba6 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -1399,44 +1399,40 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { case score.MixedScore > 0: score.ComputeScore = score.MixedScore } - // PowerSustainScore: measures how close the GPU came to its rated TDP during - // steady-state benchmark load. 100 = exactly at rated TDP. - // Penalty applied symmetrically for both under- and over-TDP deviations: - // score = max(0, 100 − |measured − rated| / rated × 100) - // Under-TDP → power delivery / cooling issue. - // Over-TDP → power limit not properly enforced / power regulation fault. - // Uses CalibratedPeakPowerW when available (from external power calibration), - // otherwise falls back to Steady.AvgPowerW observed during the benchmark. - { - ref := gpu.DefaultPowerLimitW - if ref <= 0 { - ref = gpu.PowerLimitW - } - measured := gpu.CalibratedPeakPowerW - if measured <= 0 { - measured = gpu.Steady.AvgPowerW - } - if measured > 0 && ref > 0 { - deviationPct := math.Abs(measured-ref) / ref * 100 - score.PowerSustainScore = clampScore(100 - deviationPct) - } - } - runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) - thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS - score.ThermalSustainScore = clampScore(100 - thermalRatio*100) - // StabilityScore: prefer per-precision steady phases where each window runs a - // single kernel type so PowerCVPct is a genuine stability signal (not a - // workload-mix artifact). Fall back to combined steady using clock-only metrics - // when per-precision data is absent (older results, short profiles). + // PowerSustainScore: how stable is GPU power draw during the benchmark? + // High variance means the workload is bursting or the power delivery is + // unstable. Score = max(0, 100 − PowerCVPct × 3). + // At 10% CV → score 70; at 33%+ CV → score 0. + // Uses per-precision windows when available (each runs a single kernel, + // so CV reflects genuine power regulation, not workload switching). if len(gpu.PrecisionSteady) > 0 { var sum float64 for _, p := range gpu.PrecisionSteady { - sum += clampScore(100 - (p.Steady.ClockCVPct*4 + p.Steady.PowerCVPct*2 + p.Steady.ClockDriftPct*2)) + sum += clampScore(100 - p.Steady.PowerCVPct*3) } - score.StabilityScore = sum / float64(len(gpu.PrecisionSteady)) - } else { - score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.ClockDriftPct*2)) + score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady)) + } else if gpu.Steady.PowerCVPct > 0 { + score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3) } + + // ThermalSustainScore: how stable is GPU temperature during the benchmark? + // High variance means cooling is inconsistent (fan bursts, liquid flow + // instability, or frequent transitions in and out of throttle). + // Score = max(0, 100 − TempCVPct × 3). + if gpu.Steady.TempCVPct > 0 { + score.ThermalSustainScore = clampScore(100 - gpu.Steady.TempCVPct*3) + } else { + // TempCV not recorded — fall back to 100 (no penalty). + score.ThermalSustainScore = 100 + } + + // StabilityScore: what fraction of the benchmark did the GPU spend throttling? + // Counts both thermal (HW+SW) and power-cap throttle events. + // Score = max(0, 100 − throttle_ratio × 100). + // 1% throttle → score 99; 10% throttle → score 90; 100% → score 0. + runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) + throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS) + score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100) score.CompositeScore = compositeBenchmarkScore(score) if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 { score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0) @@ -1445,20 +1441,18 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { } func compositeBenchmarkScore(score BenchmarkScorecard) float64 { - // Weights after introducing calibrated power reference: - // base 0.35 — floor so a GPU that fails all sustain checks still scores - // thermal 0.25 — heaviest: throttle counters are the most reliable signal - // stability 0.25 — clock/power variance matters for reproducibility - // power 0.15 — GPU reaches rated TDP under targeted_power? lower weight - // because calibration may be absent (dcgmi not installed) - // NCCL bonus 0.10 — interconnect health - // cap 1.10 - quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0) - if score.InterconnectScore > 0 { - quality += 0.10 - } - if quality > 1.10 { - quality = 1.10 + // quality_factor weights: + // base 0.35 — floor so a GPU that fails all sustain checks still scores + // StabilityScore 0.35 — throttle time: heaviest, direct signal of GPU not keeping up + // PowerSustainScore 0.15 — power variance: unstable draw hints at regulation issues + // ThermalSustainScore 0.15 — temp variance: unstable cooling hints at airflow issues + // cap 1.00 + quality := 0.35 + + 0.35*(score.StabilityScore/100.0) + + 0.15*(score.PowerSustainScore/100.0) + + 0.15*(score.ThermalSustainScore/100.0) + if quality > 1.00 { + quality = 1.00 } return score.ComputeScore * quality } @@ -2547,6 +2541,11 @@ func runBenchmarkPowerCalibration( logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") return map[int]benchmarkPowerCalibrationResult{}, nil } + if killed := KillTestWorkers(); len(killed) > 0 { + for _, p := range killed { + logFunc(fmt.Sprintf("power calibration pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name)) + } + } canDerate := os.Geteuid() == 0 if !canDerate { diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 2be426b..ac8bcc8 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -426,6 +426,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, if err != nil { return "", err } + // Kill any lingering nvvs/dcgmi processes from a previous interrupted run + // before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34). + if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil { + for _, p := range killed { + logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name)) + } + } return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode( satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{ @@ -443,6 +450,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur if err != nil { return "", err } + // Kill any lingering nvvs/dcgmi processes from a previous interrupted run + // before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34). + if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil { + for _, p := range killed { + logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name)) + } + } return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode( satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{ @@ -460,6 +474,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu if err != nil { return "", err } + // Kill any lingering nvvs/dcgmi processes from a previous interrupted run + // before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34). + if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil { + for _, p := range killed { + logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name)) + } + } return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode( satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{ @@ -552,10 +573,16 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si if passes <= 0 { passes = 1 } - // Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per - // pass, plus a fixed 2-minute buffer. Without this, a stuck memory - // controller can cause memtester to spin forever on a single subtest. - timeoutSec := sizeMB*passes*150/100 + 120 + // Keep Validate Memory bounded to a quick diagnostic window. The timeout is + // intentionally conservative enough for healthy systems while avoiding the + // prior 30-80 minute hangs caused by memtester spinning on a bad subtest. + timeoutSec := sizeMB*passes*20/100 + 60 + if timeoutSec < 180 { + timeoutSec = 180 + } + if timeoutSec > 900 { + timeoutSec = 900 + } return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{ {name: "01-free-before.log", cmd: []string{"free", "-h"}}, {name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}}, diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 75bf3ca..3a9f259 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -737,6 +737,9 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) { if t.job != nil { t.job.abort() } + if taskMayLeaveOrphanWorkers(t.Target) { + platform.KillTestWorkers() + } t.Status = TaskCancelled now := time.Now() t.DoneAt = &now diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 173796c..740f036 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1339,7 +1339,7 @@ func renderValidate(opts HandlerOptions) string { inv.Memory, `Runs a RAM validation pass and records memory state around the test.`, `free, memtester`, - `256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`, + `256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`, )) + renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody( inv.Storage, diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go index e8f9928..ff9e32e 100644 --- a/audit/internal/webui/tasks.go +++ b/audit/internal/webui/tasks.go @@ -162,6 +162,32 @@ type nvidiaRampSpec struct { TotalDurationSec int } +func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) { + switch strings.TrimSpace(strings.ToLower(profile)) { + case "overnight": + return 1024, 2 + case "acceptance": + return 1024, 1 + case "smoke": + return 256, 1 + } + if stress { + return 512, 1 + } + return 256, 1 +} + +func taskMayLeaveOrphanWorkers(target string) bool { + switch strings.TrimSpace(strings.ToLower(target)) { + case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse", + "nvidia-bandwidth", "nvidia-stress", "nvidia-compute", "nvidia-bench-perf", + "memory", "memory-stress", "cpu", "sat-stress", "platform-stress": + return true + default: + return false + } +} + func resolveBurnPreset(profile string) burnPreset { switch profile { case "overnight": @@ -751,10 +777,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { err = fmt.Errorf("app not configured") break } - sizeMB, passes := 256, 1 - if t.params.StressMode { - sizeMB, passes = 1024, 3 - } + sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode) + j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes)) archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append) case "storage": if a == nil { @@ -1010,6 +1034,9 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request if t.job != nil { t.job.abort() } + if taskMayLeaveOrphanWorkers(t.Target) { + platform.KillTestWorkers() + } t.Status = TaskCancelled t.DoneAt = &now taskSerialEvent(t, "finished with status="+t.Status) @@ -1037,6 +1064,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque if t.job != nil { t.job.abort() } + if taskMayLeaveOrphanWorkers(t.Target) { + platform.KillTestWorkers() + } t.Status = TaskCancelled t.DoneAt = &now taskSerialEvent(t, "finished with status="+t.Status) @@ -1141,10 +1171,13 @@ func (q *taskQueue) loadLocked() { q.assignTaskLogPathLocked(t) if t.Status == TaskRunning { // The task was interrupted by a bee-web restart. Child processes - // (e.g. bee-gpu-burn-worker) survive the restart in their own - // process groups and cannot be cancelled retroactively. Mark the - // task as failed so the user can decide whether to re-run it - // rather than blindly re-launching duplicate workers. + // (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in + // their own process groups. Kill any matching stale workers before + // marking the task failed so the next GPU test does not inherit a + // busy DCGM slot or duplicate workers. + if taskMayLeaveOrphanWorkers(t.Target) { + _ = platform.KillTestWorkers() + } now := time.Now() t.Status = TaskFailed t.DoneAt = &now diff --git a/audit/internal/webui/tasks_test.go b/audit/internal/webui/tasks_test.go index 5f39830..4ad5c03 100644 --- a/audit/internal/webui/tasks_test.go +++ b/audit/internal/webui/tasks_test.go @@ -672,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) { } } +func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) { + var gotSizeMB, gotPasses int + q := &taskQueue{ + opts: &HandlerOptions{App: &app.App{}}, + } + tk := &Task{ + ID: "mem-validate-1", + Name: "Memory SAT", + Target: "memory", + Status: TaskRunning, + CreatedAt: time.Now(), + params: taskParams{StressMode: true}, + } + j := &jobState{} + + orig := runMemoryAcceptancePackCtx + runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) { + gotSizeMB = sizeMB + gotPasses = passes + return "/tmp/memory-validate.tar.gz", nil + } + defer func() { runMemoryAcceptancePackCtx = orig }() + + q.runTask(tk, j, context.Background()) + + if gotSizeMB != 512 || gotPasses != 1 { + t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses) + } +} + func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) { dir := t.TempDir() q := &taskQueue{