Unify NVIDIA GPU recovery paths
This commit is contained in:
@@ -105,6 +105,7 @@ var (
|
||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||
benchmarkGeteuid = os.Geteuid
|
||||
benchmarkResetNvidiaGPU = resetNvidiaGPU
|
||||
benchmarkSleep = time.Sleep
|
||||
)
|
||||
|
||||
@@ -249,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
||||
return nil
|
||||
}
|
||||
|
||||
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
|
||||
}
|
||||
out, err := benchmarkResetNvidiaGPU(gpuIndex)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
|
||||
)
|
||||
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" {
|
||||
logFunc(line)
|
||||
}
|
||||
}
|
||||
}
|
||||
rc := 0
|
||||
if err != nil {
|
||||
rc = 1
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||
fmt.Sprintf("rc: %d", rc),
|
||||
"",
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||
if len(gpuIndices) == 0 {
|
||||
return nil
|
||||
@@ -266,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
|
||||
}
|
||||
var failed []int
|
||||
for _, idx := range gpuIndices {
|
||||
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
||||
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
||||
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
|
||||
failed = append(failed, idx)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||
@@ -4440,8 +4469,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
_ = os.MkdirAll(singleDir, 0755)
|
||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||
result.Findings = append(result.Findings,
|
||||
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
||||
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
|
||||
}
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||
singlePowerStopCh := make(chan struct{})
|
||||
|
||||
@@ -2,7 +2,7 @@ package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
@@ -188,18 +188,16 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldExec := satExecCommand
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 1000 }
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
t.Fatalf("unexpected command: %s %v", name, args)
|
||||
return nil
|
||||
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||
t.Fatal("unexpected reset call")
|
||||
return "", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
satExecCommand = oldExec
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
var logs []string
|
||||
@@ -215,44 +213,52 @@ func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "nvidia-smi")
|
||||
argsLog := filepath.Join(dir, "args.log")
|
||||
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
||||
t.Fatalf("write script: %v", err)
|
||||
}
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldLookPath := satLookPath
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "nvidia-smi" {
|
||||
return script, nil
|
||||
}
|
||||
return exec.LookPath(file)
|
||||
var calls []int
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
calls = append(calls, index)
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
satLookPath = oldLookPath
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if len(failed) != 0 {
|
||||
t.Fatalf("failed=%v want no failures", failed)
|
||||
}
|
||||
raw, err := os.ReadFile(argsLog)
|
||||
if err != nil {
|
||||
t.Fatalf("read args log: %v", err)
|
||||
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||
t.Fatalf("calls=%v want %s", calls, want)
|
||||
}
|
||||
got := strings.Fields(string(raw))
|
||||
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
||||
if strings.Join(got, " ") != strings.Join(want, " ") {
|
||||
t.Fatalf("args=%v want %v", got, want)
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
if index == 5 {
|
||||
return "busy\n", exec.ErrNotFound
|
||||
}
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||
t.Fatalf("failed=%v want %s", failed, want)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,8 @@ package platform
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -28,3 +30,22 @@ func runNvidiaRecover(args ...string) (string, error) {
|
||||
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
func resetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
|
||||
func restartNvidiaDrivers() (string, error) {
|
||||
out, err := runNvidiaRecover("restart-drivers")
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "NVIDIA drivers restarted.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
|
||||
@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
|
||||
}
|
||||
|
||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return out, err
|
||||
return resetNvidiaGPU(index)
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
|
||||
@@ -62,7 +62,7 @@ func (s *System) ServiceState(name string) string {
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
if name == "bee-nvidia" && action == ServiceRestart {
|
||||
return runNvidiaRecover("restart-drivers")
|
||||
return restartNvidiaDrivers()
|
||||
}
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
|
||||
@@ -60,35 +60,129 @@ wait_for_process_exit() {
|
||||
return 0
|
||||
}
|
||||
|
||||
kill_pattern() {
|
||||
pattern="$1"
|
||||
if pgrep -f "$pattern" >/dev/null 2>&1; then
|
||||
pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
|
||||
log_pid_details() {
|
||||
pid="$1"
|
||||
line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
|
||||
if [ -n "$line" ]; then
|
||||
log_blocker "$line"
|
||||
else
|
||||
log_blocker "pid $pid"
|
||||
fi
|
||||
}
|
||||
|
||||
collect_gpu_compute_pids() {
|
||||
index="$1"
|
||||
if ! command -v nvidia-smi >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
nvidia-smi --id="$index" \
|
||||
--query-compute-apps=pid \
|
||||
--format=csv,noheader,nounits 2>/dev/null \
|
||||
| sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
|
||||
| grep -E '^[0-9]+$' || true
|
||||
}
|
||||
|
||||
collect_gpu_device_pids() {
|
||||
index="$1"
|
||||
dev="/dev/nvidia$index"
|
||||
[ -e "$dev" ] || return 0
|
||||
if command -v fuser >/dev/null 2>&1; then
|
||||
fuser "$dev" 2>/dev/null \
|
||||
| tr ' ' '\n' \
|
||||
| sed 's/[^0-9].*$//' \
|
||||
| grep -E '^[0-9]+$' || true
|
||||
fi
|
||||
}
|
||||
|
||||
collect_gpu_holder_pids() {
|
||||
index="$1"
|
||||
{
|
||||
collect_gpu_compute_pids "$index"
|
||||
collect_gpu_device_pids "$index"
|
||||
} | awk 'NF' | sort -u
|
||||
}
|
||||
|
||||
kill_pid_list() {
|
||||
pids="$1"
|
||||
[ -n "$pids" ] || return 0
|
||||
|
||||
for pid in $pids; do
|
||||
log_pid_details "$pid"
|
||||
done
|
||||
log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"
|
||||
for pid in $pids; do
|
||||
kill -TERM "$pid" >/dev/null 2>&1 || true
|
||||
done
|
||||
sleep 1
|
||||
for pid in $pids; do
|
||||
if kill -0 "$pid" >/dev/null 2>&1; then
|
||||
log "forcing GPU holder PID $pid to exit"
|
||||
kill -KILL "$pid" >/dev/null 2>&1 || true
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
gpu_has_display_holders() {
|
||||
index="$1"
|
||||
holders=$(collect_gpu_device_pids "$index")
|
||||
[ -n "$holders" ] || return 1
|
||||
for pid in $holders; do
|
||||
comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
|
||||
case "$comm" in
|
||||
Xorg|Xwayland|X|gnome-shell)
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
stop_nv_hostengine_if_running() {
|
||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||
[ -n "$line" ] || continue
|
||||
log_blocker "$line"
|
||||
done
|
||||
log "killing processes matching: $pattern"
|
||||
pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
|
||||
sleep 1
|
||||
pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
|
||||
log "stopping nv-hostengine"
|
||||
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||
hostengine_was_active=1
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
stop_fabricmanager_if_active() {
|
||||
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||
log_blocker "service nvidia-fabricmanager.service"
|
||||
fabric_was_active=1
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
stop_display_stack_if_active() {
|
||||
stopped=1
|
||||
for unit in display-manager.service lightdm.service; do
|
||||
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||
log_blocker "service $unit"
|
||||
display_was_active=1
|
||||
stopped=0
|
||||
fi
|
||||
done
|
||||
return "$stopped"
|
||||
}
|
||||
|
||||
try_gpu_reset() {
|
||||
index="$1"
|
||||
log "resetting GPU $index"
|
||||
nvidia-smi -r -i "$index"
|
||||
}
|
||||
|
||||
drain_gpu_clients() {
|
||||
display_was_active=0
|
||||
fabric_was_active=0
|
||||
|
||||
for unit in display-manager.service lightdm.service; do
|
||||
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||
log_blocker "service $unit"
|
||||
display_was_active=1
|
||||
fi
|
||||
done
|
||||
|
||||
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||
log_blocker "service nvidia-fabricmanager.service"
|
||||
fabric_was_active=1
|
||||
fi
|
||||
hostengine_was_active=0
|
||||
|
||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||
@@ -98,21 +192,25 @@ drain_gpu_clients() {
|
||||
log "stopping nv-hostengine"
|
||||
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||
hostengine_was_active=1
|
||||
fi
|
||||
|
||||
for pattern in \
|
||||
"nvidia-smi" \
|
||||
"dcgmi" \
|
||||
"nvvs" \
|
||||
"dcgmproftester" \
|
||||
"all_reduce_perf" \
|
||||
"nvtop" \
|
||||
"bee-gpu-burn" \
|
||||
"bee-john-gpu-stress" \
|
||||
"bee-nccl-gpu-stress" \
|
||||
"Xorg" \
|
||||
"Xwayland"; do
|
||||
kill_pattern "$pattern"
|
||||
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||
log_blocker "service nvidia-fabricmanager.service"
|
||||
fabric_was_active=1
|
||||
fi
|
||||
|
||||
for unit in display-manager.service lightdm.service; do
|
||||
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||
log_blocker "service $unit"
|
||||
display_was_active=1
|
||||
fi
|
||||
done
|
||||
|
||||
for dev in /dev/nvidia[0-9]*; do
|
||||
[ -e "$dev" ] || continue
|
||||
holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
|
||||
kill_pid_list "$holders"
|
||||
done
|
||||
}
|
||||
|
||||
@@ -125,7 +223,7 @@ restore_gpu_clients() {
|
||||
fi
|
||||
fi
|
||||
|
||||
if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
log "starting nv-hostengine"
|
||||
nv-hostengine
|
||||
fi
|
||||
@@ -153,10 +251,60 @@ restart_drivers() {
|
||||
|
||||
reset_gpu() {
|
||||
index="$1"
|
||||
drain_gpu_clients
|
||||
log "resetting GPU $index"
|
||||
nvidia-smi -r -i "$index"
|
||||
display_was_active=0
|
||||
fabric_was_active=0
|
||||
hostengine_was_active=0
|
||||
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
if try_gpu_reset "$index"; then
|
||||
restore_gpu_clients
|
||||
return 0
|
||||
fi
|
||||
|
||||
stop_nv_hostengine_if_running || true
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
if try_gpu_reset "$index"; then
|
||||
restore_gpu_clients
|
||||
return 0
|
||||
fi
|
||||
|
||||
stop_fabricmanager_if_active || true
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
if try_gpu_reset "$index"; then
|
||||
restore_gpu_clients
|
||||
return 0
|
||||
fi
|
||||
|
||||
if gpu_has_display_holders "$index"; then
|
||||
stop_display_stack_if_active || true
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
if try_gpu_reset "$index"; then
|
||||
restore_gpu_clients
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
holders=$(collect_gpu_holder_pids "$index")
|
||||
if [ -n "$holders" ]; then
|
||||
log "GPU $index still has holders after targeted drain"
|
||||
kill_pid_list "$holders"
|
||||
fi
|
||||
try_gpu_reset "$index"
|
||||
rc=$?
|
||||
restore_gpu_clients
|
||||
return "$rc"
|
||||
}
|
||||
|
||||
cmd="${1:-}"
|
||||
|
||||
Reference in New Issue
Block a user