diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go
index 986b7f5..a11466a 100644
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -286,7 +286,25 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
 // gpuIndices: specific GPU indices to test (empty = all GPUs).
 // ctx cancellation kills the running job.
 func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
+	resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
+}
+
+func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
+	if len(gpuIndices) > 0 {
+		return dedupeSortedIndices(gpuIndices), nil
+	}
+	all, err := listNvidiaGPUIndices()
+	if err != nil {
+		return nil, err
+	}
+	if len(all) == 0 {
+		return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
+	}
+	return all, nil
 }
 
 func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go
index e2bf7d2..5fb033f 100644
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -162,6 +162,39 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
 	}
 }
 
+func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
+	t.Parallel()
+
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	got, err := resolveDCGMGPUIndices(nil)
+	if err != nil {
+		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+	}
+	if want := "0,1,2"; joinIndexList(got) != want {
+		t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
+	}
+}
+
+func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
+	t.Parallel()
+
+	got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
+	if err != nil {
+		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
+	}
+	if want := "1,3"; joinIndexList(got) != want {
+		t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
+	}
+}
+
 func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
 	t.Parallel()
 
diff --git a/iso/overlay/usr/local/bin/bee-nvidia-load b/iso/overlay/usr/local/bin/bee-nvidia-load
index bf32303..022123f 100755
--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -128,13 +128,32 @@ ldconfig 2>/dev/null || true
 log "ldconfig refreshed"
 
 # Start DCGM host engine so dcgmi can discover GPUs.
-# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
-# "group is empty" even when GPUs and modules are present.
-# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
+# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
+# If it started too early (for example via systemd before bee-nvidia-load), it can
+# keep a stale empty inventory and dcgmi diag later reports no testable entities.
 if command -v nv-hostengine >/dev/null 2>&1; then
   if pgrep -x nv-hostengine >/dev/null 2>&1; then
-    log "nv-hostengine already running — skipping"
-  else
+    if command -v pkill >/dev/null 2>&1; then
+      pkill -x nv-hostengine >/dev/null 2>&1 || true
+      tries=0
+      while pgrep -x nv-hostengine >/dev/null 2>&1; do
+        tries=$((tries + 1))
+        if [ "${tries}" -ge 10 ]; then
+          log "WARN: nv-hostengine is still running after restart request"
+          break
+        fi
+        sleep 1
+      done
+      if pgrep -x nv-hostengine >/dev/null 2>&1; then
+        log "WARN: keeping existing nv-hostengine process"
+      else
+        log "nv-hostengine restarted"
+      fi
+    else
+      log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
+    fi
+  fi
+  if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
     nv-hostengine
     log "nv-hostengine started"
   fi