Unify NVIDIA GPU recovery paths

This commit is contained in:
2026-04-23 20:31:41 +03:00
parent 6112094d45
commit 749fc8a94d
6 changed files with 278 additions and 82 deletions

View File

@@ -105,6 +105,7 @@ var (
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
benchmarkGeteuid = os.Geteuid
benchmarkResetNvidiaGPU = resetNvidiaGPU
benchmarkSleep = time.Sleep
)
@@ -249,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
return nil
}
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
}
out, err := benchmarkResetNvidiaGPU(gpuIndex)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
)
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
for _, line := range strings.Split(trimmed, "\n") {
line = strings.TrimSpace(line)
if line != "" {
logFunc(line)
}
}
}
rc := 0
if err != nil {
rc = 1
}
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
fmt.Sprintf("rc: %d", rc),
"",
)
return err
}
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
if len(gpuIndices) == 0 {
return nil
@@ -266,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
}
var failed []int
for _, idx := range gpuIndices {
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
failed = append(failed, idx)
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
@@ -4440,8 +4469,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
result.Findings = append(result.Findings,
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
}
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
singlePowerStopCh := make(chan struct{})

View File

@@ -2,7 +2,7 @@ package platform
import (
"context"
"os"
"fmt"
"os/exec"
"path/filepath"
"strings"
@@ -188,18 +188,16 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
}
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
t.Parallel()
oldGeteuid := benchmarkGeteuid
oldExec := satExecCommand
oldReset := benchmarkResetNvidiaGPU
benchmarkGeteuid = func() int { return 1000 }
satExecCommand = func(name string, args ...string) *exec.Cmd {
t.Fatalf("unexpected command: %s %v", name, args)
return nil
benchmarkResetNvidiaGPU = func(int) (string, error) {
t.Fatal("unexpected reset call")
return "", nil
}
t.Cleanup(func() {
benchmarkGeteuid = oldGeteuid
satExecCommand = oldExec
benchmarkResetNvidiaGPU = oldReset
})
var logs []string
@@ -215,44 +213,52 @@ func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
}
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
t.Parallel()
dir := t.TempDir()
script := filepath.Join(dir, "nvidia-smi")
argsLog := filepath.Join(dir, "args.log")
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
t.Fatalf("write script: %v", err)
}
oldGeteuid := benchmarkGeteuid
oldSleep := benchmarkSleep
oldLookPath := satLookPath
oldReset := benchmarkResetNvidiaGPU
benchmarkGeteuid = func() int { return 0 }
benchmarkSleep = func(time.Duration) {}
satLookPath = func(file string) (string, error) {
if file == "nvidia-smi" {
return script, nil
}
return exec.LookPath(file)
var calls []int
benchmarkResetNvidiaGPU = func(index int) (string, error) {
calls = append(calls, index)
return "ok\n", nil
}
t.Cleanup(func() {
benchmarkGeteuid = oldGeteuid
benchmarkSleep = oldSleep
satLookPath = oldLookPath
benchmarkResetNvidiaGPU = oldReset
})
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
if len(failed) != 0 {
t.Fatalf("failed=%v want no failures", failed)
}
raw, err := os.ReadFile(argsLog)
if err != nil {
t.Fatalf("read args log: %v", err)
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
t.Fatalf("calls=%v want %s", calls, want)
}
got := strings.Fields(string(raw))
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
if strings.Join(got, " ") != strings.Join(want, " ") {
t.Fatalf("args=%v want %v", got, want)
}
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
oldGeteuid := benchmarkGeteuid
oldSleep := benchmarkSleep
oldReset := benchmarkResetNvidiaGPU
benchmarkGeteuid = func() int { return 0 }
benchmarkSleep = func(time.Duration) {}
benchmarkResetNvidiaGPU = func(index int) (string, error) {
if index == 5 {
return "busy\n", exec.ErrNotFound
}
return "ok\n", nil
}
t.Cleanup(func() {
benchmarkGeteuid = oldGeteuid
benchmarkSleep = oldSleep
benchmarkResetNvidiaGPU = oldReset
})
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
if got, want := fmt.Sprint(failed), "[5]"; got != want {
t.Fatalf("failed=%v want %s", failed, want)
}
}

View File

@@ -3,6 +3,8 @@ package platform
import (
"fmt"
"os/exec"
"strconv"
"strings"
"time"
)
@@ -28,3 +30,22 @@ func runNvidiaRecover(args ...string) (string, error) {
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
return string(raw), err
}
func resetNvidiaGPU(index int) (string, error) {
if index < 0 {
return "", fmt.Errorf("gpu index must be >= 0")
}
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
if strings.TrimSpace(out) == "" && err == nil {
out = "GPU reset completed.\n"
}
return out, err
}
func restartNvidiaDrivers() (string, error) {
out, err := runNvidiaRecover("restart-drivers")
if strings.TrimSpace(out) == "" && err == nil {
out = "NVIDIA drivers restarted.\n"
}
return out, err
}

View File

@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
}
func (s *System) ResetNvidiaGPU(index int) (string, error) {
if index < 0 {
return "", fmt.Errorf("gpu index must be >= 0")
}
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
if strings.TrimSpace(out) == "" && err == nil {
out = "GPU reset completed.\n"
}
return out, err
return resetNvidiaGPU(index)
}
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.

View File

@@ -62,7 +62,7 @@ func (s *System) ServiceState(name string) string {
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
if name == "bee-nvidia" && action == ServiceRestart {
return runNvidiaRecover("restart-drivers")
return restartNvidiaDrivers()
}
// bee-web runs as the bee user; sudo is required to control system services.
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.

View File

@@ -60,35 +60,129 @@ wait_for_process_exit() {
return 0
}
kill_pattern() {
pattern="$1"
if pgrep -f "$pattern" >/dev/null 2>&1; then
pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
log_pid_details() {
pid="$1"
line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
if [ -n "$line" ]; then
log_blocker "$line"
else
log_blocker "pid $pid"
fi
}
collect_gpu_compute_pids() {
index="$1"
if ! command -v nvidia-smi >/dev/null 2>&1; then
return 0
fi
nvidia-smi --id="$index" \
--query-compute-apps=pid \
--format=csv,noheader,nounits 2>/dev/null \
| sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
| grep -E '^[0-9]+$' || true
}
collect_gpu_device_pids() {
index="$1"
dev="/dev/nvidia$index"
[ -e "$dev" ] || return 0
if command -v fuser >/dev/null 2>&1; then
fuser "$dev" 2>/dev/null \
| tr ' ' '\n' \
| sed 's/[^0-9].*$//' \
| grep -E '^[0-9]+$' || true
fi
}
collect_gpu_holder_pids() {
index="$1"
{
collect_gpu_compute_pids "$index"
collect_gpu_device_pids "$index"
} | awk 'NF' | sort -u
}
kill_pid_list() {
pids="$1"
[ -n "$pids" ] || return 0
for pid in $pids; do
log_pid_details "$pid"
done
log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"
for pid in $pids; do
kill -TERM "$pid" >/dev/null 2>&1 || true
done
sleep 1
for pid in $pids; do
if kill -0 "$pid" >/dev/null 2>&1; then
log "forcing GPU holder PID $pid to exit"
kill -KILL "$pid" >/dev/null 2>&1 || true
fi
done
}
gpu_has_display_holders() {
index="$1"
holders=$(collect_gpu_device_pids "$index")
[ -n "$holders" ] || return 1
for pid in $holders; do
comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
case "$comm" in
Xorg|Xwayland|X|gnome-shell)
return 0
;;
esac
done
return 1
}
stop_nv_hostengine_if_running() {
if pgrep -x nv-hostengine >/dev/null 2>&1; then
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
[ -n "$line" ] || continue
log_blocker "$line"
done
log "killing processes matching: $pattern"
pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
sleep 1
pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
log "stopping nv-hostengine"
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
hostengine_was_active=1
return 0
fi
return 1
}
stop_fabricmanager_if_active() {
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
log_blocker "service nvidia-fabricmanager.service"
fabric_was_active=1
return 0
fi
return 1
}
stop_display_stack_if_active() {
stopped=1
for unit in display-manager.service lightdm.service; do
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
log_blocker "service $unit"
display_was_active=1
stopped=0
fi
done
return "$stopped"
}
try_gpu_reset() {
index="$1"
log "resetting GPU $index"
nvidia-smi -r -i "$index"
}
drain_gpu_clients() {
display_was_active=0
fabric_was_active=0
for unit in display-manager.service lightdm.service; do
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
log_blocker "service $unit"
display_was_active=1
fi
done
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
log_blocker "service nvidia-fabricmanager.service"
fabric_was_active=1
fi
hostengine_was_active=0
if pgrep -x nv-hostengine >/dev/null 2>&1; then
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
@@ -98,21 +192,25 @@ drain_gpu_clients() {
log "stopping nv-hostengine"
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
hostengine_was_active=1
fi
for pattern in \
"nvidia-smi" \
"dcgmi" \
"nvvs" \
"dcgmproftester" \
"all_reduce_perf" \
"nvtop" \
"bee-gpu-burn" \
"bee-john-gpu-stress" \
"bee-nccl-gpu-stress" \
"Xorg" \
"Xwayland"; do
kill_pattern "$pattern"
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
log_blocker "service nvidia-fabricmanager.service"
fabric_was_active=1
fi
for unit in display-manager.service lightdm.service; do
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
log_blocker "service $unit"
display_was_active=1
fi
done
for dev in /dev/nvidia[0-9]*; do
[ -e "$dev" ] || continue
holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
kill_pid_list "$holders"
done
}
@@ -125,7 +223,7 @@ restore_gpu_clients() {
fi
fi
if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
log "starting nv-hostengine"
nv-hostengine
fi
@@ -153,10 +251,60 @@ restart_drivers() {
reset_gpu() {
index="$1"
drain_gpu_clients
log "resetting GPU $index"
nvidia-smi -r -i "$index"
display_was_active=0
fabric_was_active=0
hostengine_was_active=0
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
kill_pid_list "$holders"
fi
if try_gpu_reset "$index"; then
restore_gpu_clients
return 0
fi
stop_nv_hostengine_if_running || true
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
kill_pid_list "$holders"
fi
if try_gpu_reset "$index"; then
restore_gpu_clients
return 0
fi
stop_fabricmanager_if_active || true
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
kill_pid_list "$holders"
fi
if try_gpu_reset "$index"; then
restore_gpu_clients
return 0
fi
if gpu_has_display_holders "$index"; then
stop_display_stack_if_active || true
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
kill_pid_list "$holders"
fi
if try_gpu_reset "$index"; then
restore_gpu_clients
return 0
fi
fi
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
log "GPU $index still has holders after targeted drain"
kill_pid_list "$holders"
fi
try_gpu_reset "$index"
rc=$?
restore_gpu_clients
return "$rc"
}
cmd="${1:-}"