From 749fc8a94d6f6fa459a70c970b0130b13e83a7f5 Mon Sep 17 00:00:00 2001
From: Michael Chus
Date: Thu, 23 Apr 2026 20:31:41 +0300
Subject: [PATCH] Unify NVIDIA GPU recovery paths

Route the power-benchmark pre-flight GPU reset through the shared
bee-nvidia-recover helper instead of invoking nvidia-smi -r directly,
so every caller uses the same recovery path. A failed pre-flight reset
now aborts the benchmark instead of recording a finding, so throttle
counters are never measured against stale GPU state. Teach
bee-nvidia-recover reset-gpu to drain only the PIDs that actually hold
the target GPU, escalating through nv-hostengine, the fabric manager,
and the display stack only as needed, and to restore on exit only what
it stopped.
---
 audit/internal/platform/benchmark.go         | 36 ++-
 audit/internal/platform/benchmark_test.go    | 70 +++---
 audit/internal/platform/nvidia_recover.go    | 21 ++
 audit/internal/platform/sat.go               |  9 +-
 audit/internal/platform/services.go          |  2 +-
 iso/overlay/usr/local/bin/bee-nvidia-recover | 222 +++++++++++++++----
 6 files changed, 278 insertions(+), 82 deletions(-)

diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go
index e13cd8a..840e22c 100644
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -105,6 +105,7 @@ var (
 	benchmarkSkippedPattern    = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
 	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
 	benchmarkGeteuid           = os.Geteuid
+	benchmarkResetNvidiaGPU    = resetNvidiaGPU
 	benchmarkSleep             = time.Sleep
 )
 
@@ -249,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
 	return nil
 }
 
+func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
+	if logFunc != nil {
+		logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
+	}
+	appendSATVerboseLog(verboseLog,
+		fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
+		"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
+	)
+	out, err := benchmarkResetNvidiaGPU(gpuIndex)
+	if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
+		for _, line := range strings.Split(trimmed, "\n") {
+			line = strings.TrimSpace(line)
+			if line != "" {
+				logFunc(line)
+			}
+		}
+	}
+	rc := 0
+	if err != nil {
+		rc = 1
+	}
+	appendSATVerboseLog(verboseLog,
+		fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
+		fmt.Sprintf("rc: %d", rc),
+		"",
+	)
+	return err
+}
+
 func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
 	if len(gpuIndices) == 0 {
 		return nil
@@ -266,8 +296,7 @@
 	}
 	var failed []int
 	for _, idx := range gpuIndices {
-		name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
-		if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
+		if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
 			failed = append(failed, idx)
 			if logFunc != nil {
 				logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
@@ -4440,8 +4469,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		_ = os.MkdirAll(singleDir, 0755)
 		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
-			result.Findings = append(result.Findings,
-				fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
+			return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
 		}
 		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
 		singlePowerStopCh := make(chan struct{})
diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go
index 07f17c2..389a38c 100644
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -2,7 +2,7 @@ package platform
 
 import (
 	"context"
-	"os"
+	"fmt"
 	"os/exec"
 	"path/filepath"
 	"strings"
@@ -188,18 +188,16 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
 }
 
 func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
-	t.Parallel()
-
 	oldGeteuid := benchmarkGeteuid
-	oldExec := satExecCommand
+	oldReset := benchmarkResetNvidiaGPU
 	benchmarkGeteuid = func() int { return 1000 }
-	satExecCommand = func(name string, args ...string) *exec.Cmd {
-		t.Fatalf("unexpected command: %s %v", name, args)
-		return nil
+	benchmarkResetNvidiaGPU = func(int) (string, error) {
+		t.Fatal("unexpected reset call")
+		return "", nil
 	}
 	t.Cleanup(func() {
 		benchmarkGeteuid = oldGeteuid
-		satExecCommand = oldExec
+		benchmarkResetNvidiaGPU = oldReset
 	})
 
 	var logs []string
@@ -215,44 +213,52 @@
 }
 
 func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
-	t.Parallel()
-
-	dir := t.TempDir()
-	script := filepath.Join(dir, "nvidia-smi")
-	argsLog := filepath.Join(dir, "args.log")
-	if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
-		t.Fatalf("write script: %v", err)
-	}
-
 	oldGeteuid := benchmarkGeteuid
 	oldSleep := benchmarkSleep
-	oldLookPath := satLookPath
+	oldReset := benchmarkResetNvidiaGPU
 	benchmarkGeteuid = func() int { return 0 }
 	benchmarkSleep = func(time.Duration) {}
-	satLookPath = func(file string) (string, error) {
-		if file == "nvidia-smi" {
-			return script, nil
-		}
-		return exec.LookPath(file)
+	var calls []int
+	benchmarkResetNvidiaGPU = func(index int) (string, error) {
+		calls = append(calls, index)
+		return "ok\n", nil
 	}
 	t.Cleanup(func() {
 		benchmarkGeteuid = oldGeteuid
 		benchmarkSleep = oldSleep
-		satLookPath = oldLookPath
+		benchmarkResetNvidiaGPU = oldReset
 	})
 
-	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
+	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
 	if len(failed) != 0 {
 		t.Fatalf("failed=%v want no failures", failed)
 	}
-	raw, err := os.ReadFile(argsLog)
-	if err != nil {
-		t.Fatalf("read args log: %v", err)
+	if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
+		t.Fatalf("calls=%v want %s", calls, want)
 	}
-	got := strings.Fields(string(raw))
-	want := []string{"-i", "2", "-r", "-i", "5", "-r"}
-	if strings.Join(got, " ") != strings.Join(want, " ") {
-		t.Fatalf("args=%v want %v", got, want)
+}
+
+func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
+	oldGeteuid := benchmarkGeteuid
+	oldSleep := benchmarkSleep
+	oldReset := benchmarkResetNvidiaGPU
+	benchmarkGeteuid = func() int { return 0 }
+	benchmarkSleep = func(time.Duration) {}
+	benchmarkResetNvidiaGPU = func(index int) (string, error) {
+		if index == 5 {
+			return "busy\n", exec.ErrNotFound
+		}
+		return "ok\n", nil
+	}
+	t.Cleanup(func() {
+		benchmarkGeteuid = oldGeteuid
+		benchmarkSleep = oldSleep
+		benchmarkResetNvidiaGPU = oldReset
+	})
+
+	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
+	if got, want := fmt.Sprint(failed), "[5]"; got != want {
+		t.Fatalf("failed=%v want %s", failed, want)
 	}
 }
 
diff --git a/audit/internal/platform/nvidia_recover.go b/audit/internal/platform/nvidia_recover.go
index 50cad93..b922130 100644
--- a/audit/internal/platform/nvidia_recover.go
+++ b/audit/internal/platform/nvidia_recover.go
@@ -3,6 +3,8 @@ package platform
 import (
 	"fmt"
 	"os/exec"
+	"strconv"
+	"strings"
 	"time"
 )
 
@@ -28,3 +30,22 @@ func runNvidiaRecover(args ...string) (string, error) {
 	raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
 	return string(raw), err
 }
+
+func resetNvidiaGPU(index int) (string, error) {
+	if index < 0 {
+		return "", fmt.Errorf("gpu index must be >= 0")
+	}
+	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
+	if strings.TrimSpace(out) == "" && err == nil {
+		out = "GPU reset completed.\n"
+	}
+	return out, err
+}
+
+func restartNvidiaDrivers() (string, error) {
+	out, err := runNvidiaRecover("restart-drivers")
+	if strings.TrimSpace(out) == "" && err == nil {
+		out = "NVIDIA drivers restarted.\n"
+	}
+	return out, err
+}
diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go
index 88e844a..35a8594 100644
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
 }
 
 func (s *System) ResetNvidiaGPU(index int) (string, error) {
-	if index < 0 {
-		return "", fmt.Errorf("gpu index must be >= 0")
-	}
-	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
-	if strings.TrimSpace(out) == "" && err == nil {
-		out = "GPU reset completed.\n"
-	}
-	return out, err
+	return resetNvidiaGPU(index)
 }
 
 // RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
diff --git a/audit/internal/platform/services.go b/audit/internal/platform/services.go
index a234e17..d89e9a9 100644
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -62,7 +62,7 @@ func (s *System) ServiceState(name string) string {
 
 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
 	if name == "bee-nvidia" && action == ServiceRestart {
-		return runNvidiaRecover("restart-drivers")
+		return restartNvidiaDrivers()
 	}
 	// bee-web runs as the bee user; sudo is required to control system services.
 	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
diff --git a/iso/overlay/usr/local/bin/bee-nvidia-recover b/iso/overlay/usr/local/bin/bee-nvidia-recover
index 73cd5db..74596c3 100755
--- a/iso/overlay/usr/local/bin/bee-nvidia-recover
+++ b/iso/overlay/usr/local/bin/bee-nvidia-recover
@@ -60,35 +60,129 @@ wait_for_process_exit() {
     return 0
 }
 
-kill_pattern() {
-    pattern="$1"
-    if pgrep -f "$pattern" >/dev/null 2>&1; then
-        pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
+log_pid_details() {
+    pid="$1"
+    line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
+    if [ -n "$line" ]; then
+        log_blocker "$line"
+    else
+        log_blocker "pid $pid"
+    fi
+}
+
+collect_gpu_compute_pids() {
+    index="$1"
+    if ! command -v nvidia-smi >/dev/null 2>&1; then
+        return 0
+    fi
+    nvidia-smi --id="$index" \
+        --query-compute-apps=pid \
+        --format=csv,noheader,nounits 2>/dev/null \
+        | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
+        | grep -E '^[0-9]+$' || true
+}
+
+collect_gpu_device_pids() {
+    index="$1"
+    dev="/dev/nvidia$index"
+    [ -e "$dev" ] || return 0
+    if command -v fuser >/dev/null 2>&1; then
+        fuser "$dev" 2>/dev/null \
+            | tr ' ' '\n' \
+            | sed 's/[^0-9].*$//' \
+            | grep -E '^[0-9]+$' || true
+    fi
+}
+
+collect_gpu_holder_pids() {
+    index="$1"
+    {
+        collect_gpu_compute_pids "$index"
+        collect_gpu_device_pids "$index"
+    } | awk 'NF' | sort -u
+}
+
+kill_pid_list() {
+    pids="$1"
+    [ -n "$pids" ] || return 0
+
+    for pid in $pids; do
+        log_pid_details "$pid"
+    done
+    log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"
+    for pid in $pids; do
+        kill -TERM "$pid" >/dev/null 2>&1 || true
+    done
+    sleep 1
+    for pid in $pids; do
+        if kill -0 "$pid" >/dev/null 2>&1; then
+            log "forcing GPU holder PID $pid to exit"
+            kill -KILL "$pid" >/dev/null 2>&1 || true
+        fi
+    done
+}
+
+gpu_has_display_holders() {
+    index="$1"
+    holders=$(collect_gpu_device_pids "$index")
+    [ -n "$holders" ] || return 1
+    for pid in $holders; do
+        comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
+        case "$comm" in
+            Xorg|Xwayland|X|gnome-shell)
+                return 0
+                ;;
+        esac
+    done
+    return 1
+}
+
+stop_nv_hostengine_if_running() {
+    if pgrep -x nv-hostengine >/dev/null 2>&1; then
+        pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
             [ -n "$line" ] || continue
             log_blocker "$line"
         done
-        log "killing processes matching: $pattern"
-        pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
-        sleep 1
-        pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
+        log "stopping nv-hostengine"
+        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
+        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
+        hostengine_was_active=1
+        return 0
     fi
+    return 1
+}
+
+stop_fabricmanager_if_active() {
+    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
+        log_blocker "service nvidia-fabricmanager.service"
+        fabric_was_active=1
+        return 0
+    fi
+    return 1
+}
+
+stop_display_stack_if_active() {
+    stopped=1
+    for unit in display-manager.service lightdm.service; do
+        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
+            log_blocker "service $unit"
+            display_was_active=1
+            stopped=0
+        fi
+    done
+    return "$stopped"
+}
+
+try_gpu_reset() {
+    index="$1"
+    log "resetting GPU $index"
+    nvidia-smi -r -i "$index"
 }
 
 drain_gpu_clients() {
     display_was_active=0
     fabric_was_active=0
-
-    for unit in display-manager.service lightdm.service; do
-        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
-            log_blocker "service $unit"
-            display_was_active=1
-        fi
-    done
-
-    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
-        log_blocker "service nvidia-fabricmanager.service"
-        fabric_was_active=1
-    fi
+    hostengine_was_active=0
 
     if pgrep -x nv-hostengine >/dev/null 2>&1; then
         pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
@@ -98,21 +192,25 @@ drain_gpu_clients() {
         log "stopping nv-hostengine"
         pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
+        hostengine_was_active=1
     fi
 
-    for pattern in \
-        "nvidia-smi" \
-        "dcgmi" \
-        "nvvs" \
\ - "all_reduce_perf" \ - "nvtop" \ - "bee-gpu-burn" \ - "bee-john-gpu-stress" \ - "bee-nccl-gpu-stress" \ - "Xorg" \ - "Xwayland"; do - kill_pattern "$pattern" + if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then + log_blocker "service nvidia-fabricmanager.service" + fabric_was_active=1 + fi + + for unit in display-manager.service lightdm.service; do + if unit_exists "$unit" && stop_unit_if_active "$unit"; then + log_blocker "service $unit" + display_was_active=1 + fi + done + + for dev in /dev/nvidia[0-9]*; do + [ -e "$dev" ] || continue + holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}") + kill_pid_list "$holders" done } @@ -125,7 +223,7 @@ restore_gpu_clients() { fi fi - if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then + if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then log "starting nv-hostengine" nv-hostengine fi @@ -153,10 +251,60 @@ restart_drivers() { reset_gpu() { index="$1" - drain_gpu_clients - log "resetting GPU $index" - nvidia-smi -r -i "$index" + display_was_active=0 + fabric_was_active=0 + hostengine_was_active=0 + + holders=$(collect_gpu_holder_pids "$index") + if [ -n "$holders" ]; then + kill_pid_list "$holders" + fi + if try_gpu_reset "$index"; then + restore_gpu_clients + return 0 + fi + + stop_nv_hostengine_if_running || true + holders=$(collect_gpu_holder_pids "$index") + if [ -n "$holders" ]; then + kill_pid_list "$holders" + fi + if try_gpu_reset "$index"; then + restore_gpu_clients + return 0 + fi + + stop_fabricmanager_if_active || true + holders=$(collect_gpu_holder_pids "$index") + if [ -n "$holders" ]; then + kill_pid_list "$holders" + fi + if try_gpu_reset "$index"; then + restore_gpu_clients + return 0 + fi + + if gpu_has_display_holders "$index"; then + stop_display_stack_if_active || true + holders=$(collect_gpu_holder_pids "$index") + if [ -n "$holders" ]; then + kill_pid_list "$holders" + fi + if try_gpu_reset "$index"; then + restore_gpu_clients + return 0 + fi + fi + + holders=$(collect_gpu_holder_pids "$index") + if [ -n "$holders" ]; then + log "GPU $index still has holders after targeted drain" + kill_pid_list "$holders" + fi + try_gpu_reset "$index" + rc=$? restore_gpu_clients + return "$rc" } cmd="${1:-}"