Stabilize DCGM GPU discovery

Mikhail Chusavitin
2026-04-03 09:50:33 +03:00
parent 7f6386dccc
commit 7a843be6b0
3 changed files with 76 additions and 6 deletions


@@ -286,7 +286,25 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
 // gpuIndices: specific GPU indices to test (empty = all GPUs).
 // ctx cancellation kills the running job.
 func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
-    return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
+    resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
+    if err != nil {
+        return "", err
+    }
+    return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
 }
+
+func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
+    if len(gpuIndices) > 0 {
+        return dedupeSortedIndices(gpuIndices), nil
+    }
+    all, err := listNvidiaGPUIndices()
+    if err != nil {
+        return nil, err
+    }
+    if len(all) == 0 {
+        return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
+    }
+    return all, nil
+}

 func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {

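Two helpers that resolveDCGMGPUIndices calls, dedupeSortedIndices and listNvidiaGPUIndices, are defined outside this hunk. A minimal sketch of plausible shapes for them, assuming listNvidiaGPUIndices shells out to nvidia-smi --query-gpu=index --format=csv,noheader through the satExecCommand hook that the tests below stub (imports assumed: fmt, sort, strconv, strings):

// Sketch only, not the committed implementation.
func listNvidiaGPUIndices() ([]int, error) {
    out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
    if err != nil {
        return nil, fmt.Errorf("query nvidia-smi GPU indices: %w", err)
    }
    var indices []int
    for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
        line = strings.TrimSpace(line)
        if line == "" {
            continue
        }
        idx, err := strconv.Atoi(line)
        if err != nil {
            return nil, fmt.Errorf("unexpected nvidia-smi index %q: %w", line, err)
        }
        indices = append(indices, idx)
    }
    // Sorting here is what lets the test stub print "2\n0\n1" and still expect "0,1,2".
    return dedupeSortedIndices(indices), nil
}

// dedupeSortedIndices returns the unique indices in ascending order.
func dedupeSortedIndices(indices []int) []int {
    seen := make(map[int]bool, len(indices))
    unique := make([]int, 0, len(indices))
    for _, idx := range indices {
        if !seen[idx] {
            seen[idx] = true
            unique = append(unique, idx)
        }
    }
    sort.Ints(unique)
    return unique
}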

@@ -162,6 +162,39 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
     }
 }
+
+func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
+    t.Parallel()
+    oldExecCommand := satExecCommand
+    satExecCommand = func(name string, args ...string) *exec.Cmd {
+        if name == "nvidia-smi" {
+            return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
+        }
+        return exec.Command(name, args...)
+    }
+    t.Cleanup(func() { satExecCommand = oldExecCommand })
+    got, err := resolveDCGMGPUIndices(nil)
+    if err != nil {
+        t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+    }
+    if want := "0,1,2"; joinIndexList(got) != want {
+        t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
+    }
+}
+
+func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
+    t.Parallel()
+    got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
+    if err != nil {
+        t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+    }
+    if want := "1,3"; joinIndexList(got) != want {
+        t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
+    }
+}

 func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
     t.Parallel()

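The tests above also lean on two pieces that are not in the hunk: a package-level satExecCommand variable that can be swapped for a stub, and a joinIndexList helper that renders the result for comparison. Plausible minimal shapes, assuming exec.Command is the production default (imports assumed: os/exec, strconv, strings):

// Sketch only, not the committed code.
// satExecCommand is the seam the tests replace; production keeps exec.Command.
var satExecCommand = exec.Command

// joinIndexList renders GPU indices as a comma-separated list, e.g. "0,1,2".
func joinIndexList(indices []int) string {
    parts := make([]string, 0, len(indices))
    for _, idx := range indices {
        parts = append(parts, strconv.Itoa(idx))
    }
    return strings.Join(parts, ",")
}

A function-typed variable like this keeps nvidia-smi out of unit tests without introducing an interface, at the cost of mutable package state that the tests restore via t.Cleanup.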

@@ -128,13 +128,32 @@ ldconfig 2>/dev/null || true
log "ldconfig refreshed"
# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
# "group is empty" even when GPUs and modules are present.
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can
# keep a stale empty inventory and dcgmi diag later reports no testable entities.
if command -v nv-hostengine >/dev/null 2>&1; then
if pgrep -x nv-hostengine >/dev/null 2>&1; then
log "nv-hostengine already running — skipping"
else
if command -v pkill >/dev/null 2>&1; then
pkill -x nv-hostengine >/dev/null 2>&1 || true
tries=0
while pgrep -x nv-hostengine >/dev/null 2>&1; do
tries=$((tries + 1))
if [ "${tries}" -ge 10 ]; then
log "WARN: nv-hostengine is still running after restart request"
break
fi
sleep 1
done
if pgrep -x nv-hostengine >/dev/null 2>&1; then
log "WARN: keeping existing nv-hostengine process"
else
log "nv-hostengine restarted"
fi
else
log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
fi
fi
if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
nv-hostengine
log "nv-hostengine started"
fi
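
Not part of the commit: a quick manual check that the restart leaves the host engine with a populated inventory, assuming dcgmi is installed on the node.

# A healthy host engine lists every GPU; an empty list reproduces the
# "no testable entities" symptom the comments above describe.
dcgmi discovery -l
# Optional: a short level-1 diagnostic pass to confirm dcgmi diag can run against the GPUs.
dcgmi diag -r 1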