Stabilize DCGM GPU discovery

Mikhail Chusavitin
2026-04-03 09:50:33 +03:00
parent 7f6386dccc
commit 7a843be6b0
3 changed files with 76 additions and 6 deletions


@@ -286,7 +286,25 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
 // gpuIndices: specific GPU indices to test (empty = all GPUs).
 // ctx cancellation kills the running job.
 func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
-    return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
+    resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
+    if err != nil {
+        return "", err
+    }
+    return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
 }
+
+func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
+    if len(gpuIndices) > 0 {
+        return dedupeSortedIndices(gpuIndices), nil
+    }
+    all, err := listNvidiaGPUIndices()
+    if err != nil {
+        return nil, err
+    }
+    if len(all) == 0 {
+        return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
+    }
+    return all, nil
+}

 func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {

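Two helpers that resolveDCGMGPUIndices calls, dedupeSortedIndices and listNvidiaGPUIndices, are defined outside this hunk. A minimal sketch of plausible shapes for them, assuming listNvidiaGPUIndices shells out to nvidia-smi --query-gpu=index --format=csv,noheader through the satExecCommand hook that the tests below stub (imports assumed: fmt, sort, strconv, strings):

// Sketch only, not the committed implementation.
func listNvidiaGPUIndices() ([]int, error) {
    out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
    if err != nil {
        return nil, fmt.Errorf("query nvidia-smi GPU indices: %w", err)
    }
    var indices []int
    for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
        line = strings.TrimSpace(line)
        if line == "" {
            continue
        }
        idx, err := strconv.Atoi(line)
        if err != nil {
            return nil, fmt.Errorf("unexpected nvidia-smi index %q: %w", line, err)
        }
        indices = append(indices, idx)
    }
    // Sorting here is what lets the test stub print "2\n0\n1" and still expect "0,1,2".
    return dedupeSortedIndices(indices), nil
}

// dedupeSortedIndices returns the unique indices in ascending order.
func dedupeSortedIndices(indices []int) []int {
    seen := make(map[int]bool, len(indices))
    unique := make([]int, 0, len(indices))
    for _, idx := range indices {
        if !seen[idx] {
            seen[idx] = true
            unique = append(unique, idx)
        }
    }
    sort.Ints(unique)
    return unique
}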

@@ -162,6 +162,39 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
     }
 }
+
+func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
+    t.Parallel()
+    oldExecCommand := satExecCommand
+    satExecCommand = func(name string, args ...string) *exec.Cmd {
+        if name == "nvidia-smi" {
+            return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
+        }
+        return exec.Command(name, args...)
+    }
+    t.Cleanup(func() { satExecCommand = oldExecCommand })
+    got, err := resolveDCGMGPUIndices(nil)
+    if err != nil {
+        t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+    }
+    if want := "0,1,2"; joinIndexList(got) != want {
+        t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
+    }
+}
+
+func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
+    t.Parallel()
+    got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
+    if err != nil {
+        t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+    }
+    if want := "1,3"; joinIndexList(got) != want {
+        t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
+    }
+}

 func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
     t.Parallel()

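The tests above also lean on two pieces that are not in the hunk: a package-level satExecCommand variable that can be swapped for a stub, and a joinIndexList helper that renders the result for comparison. Plausible minimal shapes, assuming exec.Command is the production default (imports assumed: os/exec, strconv, strings):

// Sketch only, not the committed code.
// satExecCommand is the seam the tests replace; production keeps exec.Command.
var satExecCommand = exec.Command

// joinIndexList renders GPU indices as a comma-separated list, e.g. "0,1,2".
func joinIndexList(indices []int) string {
    parts := make([]string, 0, len(indices))
    for _, idx := range indices {
        parts = append(parts, strconv.Itoa(idx))
    }
    return strings.Join(parts, ",")
}

A function-typed variable like this keeps nvidia-smi out of unit tests without introducing an interface, at the cost of mutable package state that the tests restore via t.Cleanup.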

@@ -128,13 +128,32 @@ ldconfig 2>/dev/null || true
log "ldconfig refreshed"
# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
# "group is empty" even when GPUs and modules are present.
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can
# keep a stale empty inventory and dcgmi diag later reports no testable entities.
if command -v nv-hostengine >/dev/null 2>&1; then
if pgrep -x nv-hostengine >/dev/null 2>&1; then
log "nv-hostengine already running — skipping"
else
if command -v pkill >/dev/null 2>&1; then
pkill -x nv-hostengine >/dev/null 2>&1 || true
tries=0
while pgrep -x nv-hostengine >/dev/null 2>&1; do
tries=$((tries + 1))
if [ "${tries}" -ge 10 ]; then
log "WARN: nv-hostengine is still running after restart request"
break
fi
sleep 1
done
if pgrep -x nv-hostengine >/dev/null 2>&1; then
log "WARN: keeping existing nv-hostengine process"
else
log "nv-hostengine restarted"
fi
else
log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
fi
fi
if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
nv-hostengine
log "nv-hostengine started"
fi
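
Not part of the commit: a quick manual check that the restart leaves the host engine with a populated inventory, assuming dcgmi is installed on the node.

# A healthy host engine lists every GPU; an empty list reproduces the
# "no testable entities" symptom the comments above describe.
dcgmi discovery -l
# Optional: a short level-1 diagnostic pass to confirm dcgmi diag can run against the GPUs.
dcgmi diag -r 1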