Fix nvidia-targeted-stress failing with DCGM_ST_IN_USE (-34)
nvvs (DCGM validation suite) survives when dcgmi is killed mid-run, leaving the GPU occupied. The next dcgmi diag invocation then fails with "affected resource is in use". Two-part fix: - Add nvvs and dcgmi to KillTestWorkers patterns so they are cleaned up by the global cancel handler - Call KillTestWorkers at the start of RunNvidiaTargetedStressValidatePack to clear any stale processes before dcgmi diag runs Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,10 @@ var workerPatterns = []string{
|
|||||||
"stress-ng",
|
"stress-ng",
|
||||||
"stressapptest",
|
"stressapptest",
|
||||||
"memtester",
|
"memtester",
|
||||||
|
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||||
|
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||||
|
"nvvs",
|
||||||
|
"dcgmi",
|
||||||
}
|
}
|
||||||
|
|
||||||
// KilledProcess describes a process that was sent SIGKILL.
|
// KilledProcess describes a process that was sent SIGKILL.
|
||||||
|
|||||||
@@ -382,6 +382,13 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user