diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go
index 055178e..12aeba6 100644
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -1399,44 +1399,40 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
case score.MixedScore > 0:
score.ComputeScore = score.MixedScore
}
- // PowerSustainScore: measures how close the GPU came to its rated TDP during
- // steady-state benchmark load. 100 = exactly at rated TDP.
- // Penalty applied symmetrically for both under- and over-TDP deviations:
- // score = max(0, 100 − |measured − rated| / rated × 100)
- // Under-TDP → power delivery / cooling issue.
- // Over-TDP → power limit not properly enforced / power regulation fault.
- // Uses CalibratedPeakPowerW when available (from external power calibration),
- // otherwise falls back to Steady.AvgPowerW observed during the benchmark.
- {
- ref := gpu.DefaultPowerLimitW
- if ref <= 0 {
- ref = gpu.PowerLimitW
- }
- measured := gpu.CalibratedPeakPowerW
- if measured <= 0 {
- measured = gpu.Steady.AvgPowerW
- }
- if measured > 0 && ref > 0 {
- deviationPct := math.Abs(measured-ref) / ref * 100
- score.PowerSustainScore = clampScore(100 - deviationPct)
- }
- }
- runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
- thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
- score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
- // StabilityScore: prefer per-precision steady phases where each window runs a
- // single kernel type so PowerCVPct is a genuine stability signal (not a
- // workload-mix artifact). Fall back to combined steady using clock-only metrics
- // when per-precision data is absent (older results, short profiles).
+ // PowerSustainScore: how stable is GPU power draw during the benchmark?
+ // High variance means the workload is bursting or the power delivery is
+ // unstable. Score = max(0, 100 − PowerCVPct × 3).
+ // At 10% CV → score 70; at ≈33.3% CV or more → score 0.
+ // Uses per-precision windows when available (each runs a single kernel,
+ // so CV reflects genuine power regulation, not workload switching).
if len(gpu.PrecisionSteady) > 0 {
var sum float64
for _, p := range gpu.PrecisionSteady {
- sum += clampScore(100 - (p.Steady.ClockCVPct*4 + p.Steady.PowerCVPct*2 + p.Steady.ClockDriftPct*2))
+ sum += clampScore(100 - p.Steady.PowerCVPct*3)
}
- score.StabilityScore = sum / float64(len(gpu.PrecisionSteady))
- } else {
- score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.ClockDriftPct*2))
+ score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady))
+ } else if gpu.Steady.PowerCVPct > 0 {
+ score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3)
}
+
+ // ThermalSustainScore: how stable is GPU temperature during the benchmark?
+ // High variance means cooling is inconsistent (fan bursts, liquid flow
+ // instability, or frequent transitions in and out of throttle).
+ // Score = max(0, 100 − TempCVPct × 3).
+ if gpu.Steady.TempCVPct > 0 {
+ score.ThermalSustainScore = clampScore(100 - gpu.Steady.TempCVPct*3)
+ } else {
+ // TempCV not recorded — fall back to 100 (no penalty).
+ score.ThermalSustainScore = 100
+ }
+
+ // StabilityScore: what fraction of the benchmark did the GPU spend throttling?
+ // Counts both thermal (HW+SW) and power-cap throttle events.
+ // Score = max(0, 100 − throttle_ratio × 100).
+ // 1% throttle → score 99; 10% throttle → score 90; 100% → score 0.
+ runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
+ throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS)
+ score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100)
score.CompositeScore = compositeBenchmarkScore(score)
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
@@ -1445,20 +1441,18 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
}
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
- // Weights after introducing calibrated power reference:
- // base 0.35 — floor so a GPU that fails all sustain checks still scores
- // thermal 0.25 — heaviest: throttle counters are the most reliable signal
- // stability 0.25 — clock/power variance matters for reproducibility
- // power 0.15 — GPU reaches rated TDP under targeted_power? lower weight
- // because calibration may be absent (dcgmi not installed)
- // NCCL bonus 0.10 — interconnect health
- // cap 1.10
- quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
- if score.InterconnectScore > 0 {
- quality += 0.10
- }
- if quality > 1.10 {
- quality = 1.10
+ // quality_factor weights:
+ // base 0.35 — floor so a GPU that fails all sustain checks still scores
+ // StabilityScore 0.35 — throttle time: heaviest, direct signal of GPU not keeping up
+ // PowerSustainScore 0.15 — power variance: unstable draw hints at regulation issues
+ // ThermalSustainScore 0.15 — temp variance: unstable cooling hints at airflow issues
+ // cap 1.00
+ quality := 0.35 +
+ 0.35*(score.StabilityScore/100.0) +
+ 0.15*(score.PowerSustainScore/100.0) +
+ 0.15*(score.ThermalSustainScore/100.0)
+ if quality > 1.00 {
+ quality = 1.00
}
return score.ComputeScore * quality
}
@@ -2547,6 +2541,11 @@ func runBenchmarkPowerCalibration(
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil
}
+ if killed := KillTestWorkers(); len(killed) > 0 {
+ for _, p := range killed {
+ logFunc(fmt.Sprintf("power calibration pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+ }
+ }
canDerate := os.Geteuid() == 0
if !canDerate {
diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go
index 2be426b..ac8bcc8 100644
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -426,6 +426,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
if err != nil {
return "", err
}
+ // Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+ // before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+ if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+ for _, p := range killed {
+ logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+ }
+ }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{
@@ -443,6 +450,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
if err != nil {
return "", err
}
+ // Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+ // before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+ if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+ for _, p := range killed {
+ logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+ }
+ }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{
@@ -460,6 +474,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
if err != nil {
return "", err
}
+ // Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+ // before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+ if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+ for _, p := range killed {
+ logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+ }
+ }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{
@@ -552,10 +573,16 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
if passes <= 0 {
passes = 1
}
- // Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
- // pass, plus a fixed 2-minute buffer. Without this, a stuck memory
- // controller can cause memtester to spin forever on a single subtest.
- timeoutSec := sizeMB*passes*150/100 + 120
+ // Keep Validate Memory bounded to a quick diagnostic window. The budget is
+ // deliberately tight: generous enough for healthy systems to finish, while
+ // avoiding the prior 30-80 minute hangs from memtester spinning on a bad subtest.
+ timeoutSec := sizeMB*passes*20/100 + 60
+ if timeoutSec < 180 {
+ timeoutSec = 180
+ }
+ if timeoutSec > 900 {
+ timeoutSec = 900
+ }
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go
index 75bf3ca..3a9f259 100644
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -737,6 +737,9 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
if t.job != nil {
t.job.abort()
}
+ if taskMayLeaveOrphanWorkers(t.Target) {
+ platform.KillTestWorkers()
+ }
t.Status = TaskCancelled
now := time.Now()
t.DoneAt = &now
diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go
index 173796c..740f036 100644
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1339,7 +1339,7 @@ func renderValidate(opts HandlerOptions) string {
inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`,
`free, memtester`,
- `256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
+ `256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go
index e8f9928..ff9e32e 100644
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -162,6 +162,32 @@ type nvidiaRampSpec struct {
TotalDurationSec int
}
+func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
+ switch strings.TrimSpace(strings.ToLower(profile)) {
+ case "overnight":
+ return 1024, 2
+ case "acceptance":
+ return 1024, 1
+ case "smoke":
+ return 256, 1
+ }
+ if stress {
+ return 512, 1
+ }
+ return 256, 1
+}
+
+func taskMayLeaveOrphanWorkers(target string) bool {
+ switch strings.TrimSpace(strings.ToLower(target)) {
+ case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
+ "nvidia-bandwidth", "nvidia-stress", "nvidia-compute", "nvidia-bench-perf",
+ "memory", "memory-stress", "cpu", "sat-stress", "platform-stress":
+ return true
+ default:
+ return false
+ }
+}
+
func resolveBurnPreset(profile string) burnPreset {
switch profile {
case "overnight":
@@ -751,10 +777,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured")
break
}
- sizeMB, passes := 256, 1
- if t.params.StressMode {
- sizeMB, passes = 1024, 3
- }
+ sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
+ j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
case "storage":
if a == nil {
@@ -1010,6 +1034,9 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
if t.job != nil {
t.job.abort()
}
+ if taskMayLeaveOrphanWorkers(t.Target) {
+ platform.KillTestWorkers()
+ }
t.Status = TaskCancelled
t.DoneAt = &now
taskSerialEvent(t, "finished with status="+t.Status)
@@ -1037,6 +1064,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
if t.job != nil {
t.job.abort()
}
+ if taskMayLeaveOrphanWorkers(t.Target) {
+ platform.KillTestWorkers()
+ }
t.Status = TaskCancelled
t.DoneAt = &now
taskSerialEvent(t, "finished with status="+t.Status)
@@ -1141,10 +1171,13 @@ func (q *taskQueue) loadLocked() {
q.assignTaskLogPathLocked(t)
if t.Status == TaskRunning {
// The task was interrupted by a bee-web restart. Child processes
- // (e.g. bee-gpu-burn-worker) survive the restart in their own
- // process groups and cannot be cancelled retroactively. Mark the
- // task as failed so the user can decide whether to re-run it
- // rather than blindly re-launching duplicate workers.
+ // (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
+ // their own process groups. Kill any matching stale workers before
+ // marking the task failed so the next GPU test does not inherit a
+ // busy DCGM slot or duplicate workers.
+ if taskMayLeaveOrphanWorkers(t.Target) {
+ _ = platform.KillTestWorkers()
+ }
now := time.Now()
t.Status = TaskFailed
t.DoneAt = &now
diff --git a/audit/internal/webui/tasks_test.go b/audit/internal/webui/tasks_test.go
index 5f39830..4ad5c03 100644
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -672,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
}
}
+func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
+ var gotSizeMB, gotPasses int
+ q := &taskQueue{
+ opts: &HandlerOptions{App: &app.App{}},
+ }
+ tk := &Task{
+ ID: "mem-validate-1",
+ Name: "Memory SAT",
+ Target: "memory",
+ Status: TaskRunning,
+ CreatedAt: time.Now(),
+ params: taskParams{StressMode: true},
+ }
+ j := &jobState{}
+
+ orig := runMemoryAcceptancePackCtx
+ runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
+ gotSizeMB = sizeMB
+ gotPasses = passes
+ return "/tmp/memory-validate.tar.gz", nil
+ }
+ defer func() { runMemoryAcceptancePackCtx = orig }()
+
+ q.runTask(tk, j, context.Background())
+
+ if gotSizeMB != 512 || gotPasses != 1 {
+ t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
+ }
+}
+
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
dir := t.TempDir()
q := &taskQueue{