Unify NVIDIA GPU recovery paths

2026-04-23 20:31:41 +03:00
parent 6112094d45
commit 749fc8a94d
6 changed files with 278 additions and 82 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -105,6 +105,7 @@ var (
 	benchmarkSkippedPattern    = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
 	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
 	benchmarkGeteuid           = os.Geteuid
+	benchmarkResetNvidiaGPU    = resetNvidiaGPU
 	benchmarkSleep             = time.Sleep
 )

@@ -249,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
 	return nil
 }

+func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
+	if logFunc != nil {
+		logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
+	}
+	out, err := benchmarkResetNvidiaGPU(gpuIndex)
+	appendSATVerboseLog(verboseLog,
+		fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
+		"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
+	)
+	if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
+		for _, line := range strings.Split(trimmed, "\n") {
+			line = strings.TrimSpace(line)
+			if line != "" {
+				logFunc(line)
+			}
+		}
+	}
+	rc := 0
+	if err != nil {
+		rc = 1
+	}
+	appendSATVerboseLog(verboseLog,
+		fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
+		fmt.Sprintf("rc: %d", rc),
+		"",
+	)
+	return err
+}
+
 func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
 	if len(gpuIndices) == 0 {
 		return nil
@@ -266,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
 	}
 	var failed []int
 	for _, idx := range gpuIndices {
-		name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
-		if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
+		if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
 			failed = append(failed, idx)
 			if logFunc != nil {
 				logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
@@ -4440,8 +4469,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		_ = os.MkdirAll(singleDir, 0755)
 		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
-			result.Findings = append(result.Findings,
-				fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
+			return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
 		}
 		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
 		singlePowerStopCh := make(chan struct{})
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -2,7 +2,7 @@ package platform

 import (
 	"context"
-	"os"
+	"fmt"
 	"os/exec"
 	"path/filepath"
 	"strings"
@@ -188,18 +188,16 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
 }

 func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
-	t.Parallel()
-
 	oldGeteuid := benchmarkGeteuid
-	oldExec := satExecCommand
+	oldReset := benchmarkResetNvidiaGPU
 	benchmarkGeteuid = func() int { return 1000 }
-	satExecCommand = func(name string, args ...string) *exec.Cmd {
-		t.Fatalf("unexpected command: %s %v", name, args)
-		return nil
+	benchmarkResetNvidiaGPU = func(int) (string, error) {
+		t.Fatal("unexpected reset call")
+		return "", nil
 	}
 	t.Cleanup(func() {
 		benchmarkGeteuid = oldGeteuid
-		satExecCommand = oldExec
+		benchmarkResetNvidiaGPU = oldReset
 	})

 	var logs []string
@@ -215,44 +213,52 @@ func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
 }

 func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
-	t.Parallel()
-
-	dir := t.TempDir()
-	script := filepath.Join(dir, "nvidia-smi")
-	argsLog := filepath.Join(dir, "args.log")
-	if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
-		t.Fatalf("write script: %v", err)
-	}
-
 	oldGeteuid := benchmarkGeteuid
 	oldSleep := benchmarkSleep
-	oldLookPath := satLookPath
+	oldReset := benchmarkResetNvidiaGPU
 	benchmarkGeteuid = func() int { return 0 }
 	benchmarkSleep = func(time.Duration) {}
-	satLookPath = func(file string) (string, error) {
-		if file == "nvidia-smi" {
-			return script, nil
-		}
-		return exec.LookPath(file)
+	var calls []int
+	benchmarkResetNvidiaGPU = func(index int) (string, error) {
+		calls = append(calls, index)
+		return "ok\n", nil
 	}
 	t.Cleanup(func() {
 		benchmarkGeteuid = oldGeteuid
 		benchmarkSleep = oldSleep
-		satLookPath = oldLookPath
+		benchmarkResetNvidiaGPU = oldReset
 	})

-	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
+	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
 	if len(failed) != 0 {
 		t.Fatalf("failed=%v want no failures", failed)
 	}
-	raw, err := os.ReadFile(argsLog)
-	if err != nil {
-		t.Fatalf("read args log: %v", err)
+	if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
+		t.Fatalf("calls=%v want %s", calls, want)
 	}
-	got := strings.Fields(string(raw))
-	want := []string{"-i", "2", "-r", "-i", "5", "-r"}
-	if strings.Join(got, " ") != strings.Join(want, " ") {
-		t.Fatalf("args=%v want %v", got, want)
+}
+
+func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
+	oldGeteuid := benchmarkGeteuid
+	oldSleep := benchmarkSleep
+	oldReset := benchmarkResetNvidiaGPU
+	benchmarkGeteuid = func() int { return 0 }
+	benchmarkSleep = func(time.Duration) {}
+	benchmarkResetNvidiaGPU = func(index int) (string, error) {
+		if index == 5 {
+			return "busy\n", exec.ErrNotFound
+		}
+		return "ok\n", nil
+	}
+	t.Cleanup(func() {
+		benchmarkGeteuid = oldGeteuid
+		benchmarkSleep = oldSleep
+		benchmarkResetNvidiaGPU = oldReset
+	})
+
+	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
+	if got, want := fmt.Sprint(failed), "[5]"; got != want {
+		t.Fatalf("failed=%v want %s", failed, want)
 	}
 }

--- a/audit/internal/platform/nvidia_recover.go
+++ b/audit/internal/platform/nvidia_recover.go
@@ -3,6 +3,8 @@ package platform
 import (
 	"fmt"
 	"os/exec"
+	"strconv"
+	"strings"
 	"time"
 )

@@ -28,3 +30,22 @@ func runNvidiaRecover(args ...string) (string, error) {
 	raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
 	return string(raw), err
 }
+
+func resetNvidiaGPU(index int) (string, error) {
+	if index < 0 {
+		return "", fmt.Errorf("gpu index must be >= 0")
+	}
+	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
+	if strings.TrimSpace(out) == "" && err == nil {
+		out = "GPU reset completed.\n"
+	}
+	return out, err
+}
+
+func restartNvidiaDrivers() (string, error) {
+	out, err := runNvidiaRecover("restart-drivers")
+	if strings.TrimSpace(out) == "" && err == nil {
+		out = "NVIDIA drivers restarted.\n"
+	}
+	return out, err
+}
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
 }

 func (s *System) ResetNvidiaGPU(index int) (string, error) {
-	if index < 0 {
-		return "", fmt.Errorf("gpu index must be >= 0")
-	}
-	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
-	if strings.TrimSpace(out) == "" && err == nil {
-		out = "GPU reset completed.\n"
-	}
-	return out, err
+	return resetNvidiaGPU(index)
 }

 // RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -62,7 +62,7 @@ func (s *System) ServiceState(name string) string {

 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
 	if name == "bee-nvidia" && action == ServiceRestart {
-		return runNvidiaRecover("restart-drivers")
+		return restartNvidiaDrivers()
 	}
 	// bee-web runs as the bee user; sudo is required to control system services.
 	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
--- a/iso/overlay/usr/local/bin/bee-nvidia-recover
+++ b/iso/overlay/usr/local/bin/bee-nvidia-recover
@@ -60,35 +60,129 @@ wait_for_process_exit() {
    return 0
 }

-kill_pattern() {
-    pattern="$1"
-    if pgrep -f "$pattern" >/dev/null 2>&1; then
-        pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
+log_pid_details() {
+    pid="$1"
+    line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
+    if [ -n "$line" ]; then
+        log_blocker "$line"
+    else
+        log_blocker "pid $pid"
+    fi
+}
+
+collect_gpu_compute_pids() {
+    index="$1"
+    if ! command -v nvidia-smi >/dev/null 2>&1; then
+        return 0
+    fi
+    nvidia-smi --id="$index" \
+        --query-compute-apps=pid \
+        --format=csv,noheader,nounits 2>/dev/null \
+        | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
+        | grep -E '^[0-9]+$' || true
+}
+
+collect_gpu_device_pids() {
+    index="$1"
+    dev="/dev/nvidia$index"
+    [ -e "$dev" ] || return 0
+    if command -v fuser >/dev/null 2>&1; then
+        fuser "$dev" 2>/dev/null \
+            | tr ' ' '\n' \
+            | sed 's/[^0-9].*$//' \
+            | grep -E '^[0-9]+$' || true
+    fi
+}
+
+collect_gpu_holder_pids() {
+    index="$1"
+    {
+        collect_gpu_compute_pids "$index"
+        collect_gpu_device_pids "$index"
+    } | awk 'NF' | sort -u
+}
+
+kill_pid_list() {
+    pids="$1"
+    [ -n "$pids" ] || return 0
+
+    for pid in $pids; do
+        log_pid_details "$pid"
+    done
+    log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"
+    for pid in $pids; do
+        kill -TERM "$pid" >/dev/null 2>&1 || true
+    done
+    sleep 1
+    for pid in $pids; do
+        if kill -0 "$pid" >/dev/null 2>&1; then
+            log "forcing GPU holder PID $pid to exit"
+            kill -KILL "$pid" >/dev/null 2>&1 || true
+        fi
+    done
+}
+
+gpu_has_display_holders() {
+    index="$1"
+    holders=$(collect_gpu_device_pids "$index")
+    [ -n "$holders" ] || return 1
+    for pid in $holders; do
+        comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
+        case "$comm" in
+            Xorg|Xwayland|X|gnome-shell)
+                return 0
+                ;;
+        esac
+    done
+    return 1
+}
+
+stop_nv_hostengine_if_running() {
+    if pgrep -x nv-hostengine >/dev/null 2>&1; then
+        pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
            [ -n "$line" ] || continue
            log_blocker "$line"
        done
-        log "killing processes matching: $pattern"
-        pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
-        sleep 1
-        pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
+        log "stopping nv-hostengine"
+        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
+        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
+        hostengine_was_active=1
+        return 0
    fi
+    return 1
+}
+
+stop_fabricmanager_if_active() {
+    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
+        log_blocker "service nvidia-fabricmanager.service"
+        fabric_was_active=1
+        return 0
+    fi
+    return 1
+}
+
+stop_display_stack_if_active() {
+    stopped=1
+    for unit in display-manager.service lightdm.service; do
+        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
+            log_blocker "service $unit"
+            display_was_active=1
+            stopped=0
+        fi
+    done
+    return "$stopped"
+}
+
+try_gpu_reset() {
+    index="$1"
+    log "resetting GPU $index"
+    nvidia-smi -r -i "$index"
 }

 drain_gpu_clients() {
    display_was_active=0
    fabric_was_active=0
-
-    for unit in display-manager.service lightdm.service; do
-        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
-            log_blocker "service $unit"
-            display_was_active=1
-        fi
-    done
-
-    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
-        log_blocker "service nvidia-fabricmanager.service"
-        fabric_was_active=1
-    fi
+    hostengine_was_active=0

    if pgrep -x nv-hostengine >/dev/null 2>&1; then
        pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
@@ -98,21 +192,25 @@ drain_gpu_clients() {
        log "stopping nv-hostengine"
        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
+        hostengine_was_active=1
    fi

-    for pattern in \
-        "nvidia-smi" \
-        "dcgmi" \
-        "nvvs" \
-        "dcgmproftester" \
-        "all_reduce_perf" \
-        "nvtop" \
-        "bee-gpu-burn" \
-        "bee-john-gpu-stress" \
-        "bee-nccl-gpu-stress" \
-        "Xorg" \
-        "Xwayland"; do
-        kill_pattern "$pattern"
+    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
+        log_blocker "service nvidia-fabricmanager.service"
+        fabric_was_active=1
+    fi
+
+    for unit in display-manager.service lightdm.service; do
+        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
+            log_blocker "service $unit"
+            display_was_active=1
+        fi
+    done
+
+    for dev in /dev/nvidia[0-9]*; do
+        [ -e "$dev" ] || continue
+        holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
+        kill_pid_list "$holders"
    done
 }

@@ -125,7 +223,7 @@ restore_gpu_clients() {
        fi
    fi

-    if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
+    if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
        log "starting nv-hostengine"
        nv-hostengine
    fi
@@ -153,10 +251,60 @@ restart_drivers() {

 reset_gpu() {
    index="$1"
-    drain_gpu_clients
-    log "resetting GPU $index"
-    nvidia-smi -r -i "$index"
+    display_was_active=0
+    fabric_was_active=0
+    hostengine_was_active=0
+
+    holders=$(collect_gpu_holder_pids "$index")
+    if [ -n "$holders" ]; then
+        kill_pid_list "$holders"
+    fi
+    if try_gpu_reset "$index"; then
+        restore_gpu_clients
+        return 0
+    fi
+
+    stop_nv_hostengine_if_running || true
+    holders=$(collect_gpu_holder_pids "$index")
+    if [ -n "$holders" ]; then
+        kill_pid_list "$holders"
+    fi
+    if try_gpu_reset "$index"; then
+        restore_gpu_clients
+        return 0
+    fi
+
+    stop_fabricmanager_if_active || true
+    holders=$(collect_gpu_holder_pids "$index")
+    if [ -n "$holders" ]; then
+        kill_pid_list "$holders"
+    fi
+    if try_gpu_reset "$index"; then
+        restore_gpu_clients
+        return 0
+    fi
+
+    if gpu_has_display_holders "$index"; then
+        stop_display_stack_if_active || true
+        holders=$(collect_gpu_holder_pids "$index")
+        if [ -n "$holders" ]; then
+            kill_pid_list "$holders"
+        fi
+        if try_gpu_reset "$index"; then
+            restore_gpu_clients
+            return 0
+        fi
+    fi
+
+    holders=$(collect_gpu_holder_pids "$index")
+    if [ -n "$holders" ]; then
+        log "GPU $index still has holders after targeted drain"
+        kill_pid_list "$holders"
+    fi
+    try_gpu_reset "$index"
+    rc=$?
    restore_gpu_clients
+    return "$rc"
 }

 cmd="${1:-}"