diff --git a/audit/internal/platform/kill_workers.go b/audit/internal/platform/kill_workers.go index ce0f65c..09153f1 100644 --- a/audit/internal/platform/kill_workers.go +++ b/audit/internal/platform/kill_workers.go @@ -15,6 +15,10 @@ var workerPatterns = []string{ "stress-ng", "stressapptest", "memtester", + // DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives + // if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE). + "nvvs", + "dcgmi", } // KilledProcess describes a process that was sent SIGKILL. diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 7aacf3f..29c4803 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -382,6 +382,13 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi if err != nil { return "", err } + // Kill any lingering nvvs/dcgmi processes from a previous interrupted run + // before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34). + if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil { + for _, p := range killed { + logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name)) + } + } return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{ {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, {