From 1bdfb1e9ca38163381f2e49ee0d4cc6259b39a21 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 5 Apr 2026 20:21:36 +0300 Subject: [PATCH] Fix nvidia-targeted-stress failing with DCGM_ST_IN_USE (-34) nvvs (DCGM validation suite) survives when dcgmi is killed mid-run, leaving the GPU occupied. The next dcgmi diag invocation then fails with "affected resource is in use". Two-part fix: - Add nvvs and dcgmi to KillTestWorkers patterns so they are cleaned up by the global cancel handler - Call KillTestWorkers at the start of RunNvidiaTargetedStressValidatePack to clear any stale processes before dcgmi diag runs Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/kill_workers.go | 4 ++++ audit/internal/platform/sat.go | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/audit/internal/platform/kill_workers.go b/audit/internal/platform/kill_workers.go index ce0f65c..09153f1 100644 --- a/audit/internal/platform/kill_workers.go +++ b/audit/internal/platform/kill_workers.go @@ -15,6 +15,10 @@ var workerPatterns = []string{ "stress-ng", "stressapptest", "memtester", + // DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives + // if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE). + "nvvs", + "dcgmi", } // KilledProcess describes a process that was sent SIGKILL. diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 7aacf3f..29c4803 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -382,6 +382,13 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi if err != nil { return "", err } + // Kill any lingering nvvs/dcgmi processes from a previous interrupted run + // before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34). + if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil { + for _, p := range killed { + logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name)) + } + } return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{ {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, {