nvvs (the DCGM validation suite) survives when dcgmi is killed mid-run, leaving the GPU occupied. The next dcgmi diag invocation then fails with "affected resource is in use".

Two-part fix:
- Add nvvs and dcgmi to the KillTestWorkers patterns so they are cleaned up by the global cancel handler.
- Call KillTestWorkers at the start of RunNvidiaTargetedStressValidatePack to clear any stale processes before dcgmi diag runs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
69 lines
1.7 KiB
Go
69 lines
1.7 KiB
Go
package platform
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
)
|
|
|
|
// workerPatterns lists the substrings that mark a process as a bee test
// worker. KillTestWorkers looks for each pattern in the executable taken from
// /proc/<pid>/cmdline and sends SIGKILL to the process on the first hit.
var workerPatterns = []string{
	// Stress and burn-in tools.
	"bee-gpu-burn",
	"stress-ng",
	"stressapptest",
	"memtester",

	// DCGM diagnostics: dcgmi diag spawns nvvs, which keeps running when
	// dcgmi is killed mid-run and holds on to the GPU (DCGM_ST_IN_USE).
	"nvvs",
	"dcgmi",
}
|
|
|
|
// KilledProcess records one process that KillTestWorkers targeted with
// SIGKILL. It serializes to JSON with the keys "pid" and "name".
type KilledProcess struct {
	// PID is the process ID, parsed from the numeric /proc entry name.
	PID int `json:"pid"`
	// Name is the basename of the executable found in the process's cmdline.
	Name string `json:"name"`
}
|
|
|
|
// KillTestWorkers scans /proc for running test worker processes and sends
|
|
// SIGKILL to each one found. It returns a list of killed processes.
|
|
// Errors for individual processes (e.g. already exited) are silently ignored.
|
|
func KillTestWorkers() []KilledProcess {
|
|
entries, err := os.ReadDir("/proc")
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
var killed []KilledProcess
|
|
for _, e := range entries {
|
|
if !e.IsDir() {
|
|
continue
|
|
}
|
|
pid, err := strconv.Atoi(e.Name())
|
|
if err != nil {
|
|
continue
|
|
}
|
|
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
// /proc/*/cmdline uses NUL bytes as argument separators.
|
|
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
|
|
exe := strings.TrimSpace(args[0])
|
|
base := exe
|
|
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
|
base = exe[idx+1:]
|
|
}
|
|
for _, pat := range workerPatterns {
|
|
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
|
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
|
break
|
|
}
|
|
}
|
|
}
|
|
return killed
|
|
}
|