Unify NVIDIA GPU recovery paths
This commit is contained in:
@@ -105,6 +105,7 @@ var (
|
|||||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||||
benchmarkGeteuid = os.Geteuid
|
benchmarkGeteuid = os.Geteuid
|
||||||
|
benchmarkResetNvidiaGPU = resetNvidiaGPU
|
||||||
benchmarkSleep = time.Sleep
|
benchmarkSleep = time.Sleep
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -249,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
|
||||||
|
}
|
||||||
|
out, err := benchmarkResetNvidiaGPU(gpuIndex)
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||||
|
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
|
||||||
|
)
|
||||||
|
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
|
||||||
|
for _, line := range strings.Split(trimmed, "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line != "" {
|
||||||
|
logFunc(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
rc := 0
|
||||||
|
if err != nil {
|
||||||
|
rc = 1
|
||||||
|
}
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||||
|
fmt.Sprintf("rc: %d", rc),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||||
if len(gpuIndices) == 0 {
|
if len(gpuIndices) == 0 {
|
||||||
return nil
|
return nil
|
||||||
@@ -266,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
|
|||||||
}
|
}
|
||||||
var failed []int
|
var failed []int
|
||||||
for _, idx := range gpuIndices {
|
for _, idx := range gpuIndices {
|
||||||
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
|
||||||
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
|
||||||
failed = append(failed, idx)
|
failed = append(failed, idx)
|
||||||
if logFunc != nil {
|
if logFunc != nil {
|
||||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||||
@@ -4440,8 +4469,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||||
result.Findings = append(result.Findings,
|
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
|
||||||
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
|
||||||
}
|
}
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
singlePowerStopCh := make(chan struct{})
|
singlePowerStopCh := make(chan struct{})
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"os"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -188,18 +188,16 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
oldGeteuid := benchmarkGeteuid
|
oldGeteuid := benchmarkGeteuid
|
||||||
oldExec := satExecCommand
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
benchmarkGeteuid = func() int { return 1000 }
|
benchmarkGeteuid = func() int { return 1000 }
|
||||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||||
t.Fatalf("unexpected command: %s %v", name, args)
|
t.Fatal("unexpected reset call")
|
||||||
return nil
|
return "", nil
|
||||||
}
|
}
|
||||||
t.Cleanup(func() {
|
t.Cleanup(func() {
|
||||||
benchmarkGeteuid = oldGeteuid
|
benchmarkGeteuid = oldGeteuid
|
||||||
satExecCommand = oldExec
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
})
|
})
|
||||||
|
|
||||||
var logs []string
|
var logs []string
|
||||||
@@ -215,44 +213,52 @@ func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
dir := t.TempDir()
|
|
||||||
script := filepath.Join(dir, "nvidia-smi")
|
|
||||||
argsLog := filepath.Join(dir, "args.log")
|
|
||||||
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
|
||||||
t.Fatalf("write script: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
oldGeteuid := benchmarkGeteuid
|
oldGeteuid := benchmarkGeteuid
|
||||||
oldSleep := benchmarkSleep
|
oldSleep := benchmarkSleep
|
||||||
oldLookPath := satLookPath
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
benchmarkGeteuid = func() int { return 0 }
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
benchmarkSleep = func(time.Duration) {}
|
benchmarkSleep = func(time.Duration) {}
|
||||||
satLookPath = func(file string) (string, error) {
|
var calls []int
|
||||||
if file == "nvidia-smi" {
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
return script, nil
|
calls = append(calls, index)
|
||||||
}
|
return "ok\n", nil
|
||||||
return exec.LookPath(file)
|
|
||||||
}
|
}
|
||||||
t.Cleanup(func() {
|
t.Cleanup(func() {
|
||||||
benchmarkGeteuid = oldGeteuid
|
benchmarkGeteuid = oldGeteuid
|
||||||
benchmarkSleep = oldSleep
|
benchmarkSleep = oldSleep
|
||||||
satLookPath = oldLookPath
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
})
|
})
|
||||||
|
|
||||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
if len(failed) != 0 {
|
if len(failed) != 0 {
|
||||||
t.Fatalf("failed=%v want no failures", failed)
|
t.Fatalf("failed=%v want no failures", failed)
|
||||||
}
|
}
|
||||||
raw, err := os.ReadFile(argsLog)
|
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||||
if err != nil {
|
t.Fatalf("calls=%v want %s", calls, want)
|
||||||
t.Fatalf("read args log: %v", err)
|
|
||||||
}
|
}
|
||||||
got := strings.Fields(string(raw))
|
}
|
||||||
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
|
||||||
if strings.Join(got, " ") != strings.Join(want, " ") {
|
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||||
t.Fatalf("args=%v want %v", got, want)
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
|
if index == 5 {
|
||||||
|
return "busy\n", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
return "ok\n", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||||
|
t.Fatalf("failed=%v want %s", failed, want)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package platform
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -28,3 +30,22 @@ func runNvidiaRecover(args ...string) (string, error) {
|
|||||||
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resetNvidiaGPU(index int) (string, error) {
|
||||||
|
if index < 0 {
|
||||||
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
|
}
|
||||||
|
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "GPU reset completed.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func restartNvidiaDrivers() (string, error) {
|
||||||
|
out, err := runNvidiaRecover("restart-drivers")
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "NVIDIA drivers restarted.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
|
|||||||
@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||||
if index < 0 {
|
return resetNvidiaGPU(index)
|
||||||
return "", fmt.Errorf("gpu index must be >= 0")
|
|
||||||
}
|
|
||||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
|
||||||
if strings.TrimSpace(out) == "" && err == nil {
|
|
||||||
out = "GPU reset completed.\n"
|
|
||||||
}
|
|
||||||
return out, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ func (s *System) ServiceState(name string) string {
|
|||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
if name == "bee-nvidia" && action == ServiceRestart {
|
if name == "bee-nvidia" && action == ServiceRestart {
|
||||||
return runNvidiaRecover("restart-drivers")
|
return restartNvidiaDrivers()
|
||||||
}
|
}
|
||||||
// bee-web runs as the bee user; sudo is required to control system services.
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
|
|||||||
@@ -60,35 +60,129 @@ wait_for_process_exit() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
kill_pattern() {
|
log_pid_details() {
|
||||||
pattern="$1"
|
pid="$1"
|
||||||
if pgrep -f "$pattern" >/dev/null 2>&1; then
|
line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
|
||||||
pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
|
if [ -n "$line" ]; then
|
||||||
|
log_blocker "$line"
|
||||||
|
else
|
||||||
|
log_blocker "pid $pid"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# collect_gpu_compute_pids INDEX
# Print, one per line, the PIDs that nvidia-smi reports as compute apps on
# GPU INDEX. Prints nothing when nvidia-smi is not installed or when the
# query yields no purely numeric lines. Always returns 0.
collect_gpu_compute_pids() {
    index="$1"
    command -v nvidia-smi >/dev/null 2>&1 || return 0
    # Strip surrounding whitespace, then keep only all-digit lines so that
    # diagnostics or empty lines never end up in the PID list.
    nvidia-smi --id="$index" --query-compute-apps=pid --format=csv,noheader,nounits 2>/dev/null |
        sed 's/^[[:space:]]*//;s/[[:space:]]*$//' |
        grep -E '^[0-9]+$' || true
}
|
||||||
|
|
||||||
|
# collect_gpu_device_pids INDEX
# Print, one per line, the PIDs that fuser reports for /dev/nvidiaINDEX.
# Prints nothing when the device node does not exist, when fuser is not
# installed, or when fuser reports no PIDs. Always returns 0.
collect_gpu_device_pids() {
    index="$1"
    dev="/dev/nvidia$index"
    if [ ! -e "$dev" ]; then
        return 0
    fi
    command -v fuser >/dev/null 2>&1 || return 0
    # Put each field on its own line, cut everything from the first non-digit
    # character onwards, and keep only the lines that are entirely numeric.
    fuser "$dev" 2>/dev/null |
        tr ' ' '\n' |
        sed 's/[^0-9].*$//' |
        grep -E '^[0-9]+$' || true
}
|
||||||
|
|
||||||
|
# collect_gpu_holder_pids INDEX
# Print the sorted, de-duplicated union of the PIDs returned by
# collect_gpu_compute_pids and collect_gpu_device_pids for GPU INDEX,
# with empty lines removed.
collect_gpu_holder_pids() {
    index="$1"
    (
        collect_gpu_compute_pids "$index"
        collect_gpu_device_pids "$index"
    ) | awk 'NF' | sort -u
}
|
||||||
|
|
||||||
|
# kill_pid_list PIDS
# Terminate every PID in the whitespace-separated list PIDS.
# Each PID is first reported through log_pid_details, then all of them are
# sent SIGTERM; after a one-second grace period any PID that still answers
# `kill -0` is sent SIGKILL. Signal delivery failures are ignored.
# Returns 0 immediately when PIDS is empty.
kill_pid_list() {
    pids="$1"
    if [ -z "$pids" ]; then
        return 0
    fi

    for pid in $pids; do
        log_pid_details "$pid"
    done

    # Flatten the list onto one line for the log message.
    log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"

    for pid in $pids; do
        kill -TERM "$pid" >/dev/null 2>&1 || true
    done

    sleep 1

    for pid in $pids; do
        # Skip processes that already exited after SIGTERM.
        kill -0 "$pid" >/dev/null 2>&1 || continue
        log "forcing GPU holder PID $pid to exit"
        kill -KILL "$pid" >/dev/null 2>&1 || true
    done
}
|
||||||
|
|
||||||
|
# gpu_has_display_holders INDEX
# Return 0 when at least one PID reported by collect_gpu_device_pids for GPU
# INDEX has the command name Xorg, Xwayland, X or gnome-shell; return 1
# otherwise, including when the device has no holders at all.
gpu_has_display_holders() {
    index="$1"
    holders=$(collect_gpu_device_pids "$index")
    if [ -z "$holders" ]; then
        return 1
    fi
    for pid in $holders; do
        # Resolve the PID to its bare command name, trimmed of whitespace.
        comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
        if [ "$comm" = "Xorg" ] || [ "$comm" = "Xwayland" ] || [ "$comm" = "X" ] || [ "$comm" = "gnome-shell" ]; then
            return 0
        fi
    done
    return 1
}
|
||||||
|
|
||||||
|
stop_nv_hostengine_if_running() {
|
||||||
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||||
[ -n "$line" ] || continue
|
[ -n "$line" ] || continue
|
||||||
log_blocker "$line"
|
log_blocker "$line"
|
||||||
done
|
done
|
||||||
log "killing processes matching: $pattern"
|
log "stopping nv-hostengine"
|
||||||
pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
|
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
sleep 1
|
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
|
hostengine_was_active=1
|
||||||
|
return 0
|
||||||
fi
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# stop_fabricmanager_if_active
# Stop nvidia-fabricmanager.service when the unit exists and
# stop_unit_if_active succeeds for it. On success the unit is reported via
# log_blocker, fabric_was_active is set to 1 and 0 is returned; in every
# other case 1 is returned and fabric_was_active is left untouched.
stop_fabricmanager_if_active() {
    unit_exists nvidia-fabricmanager.service || return 1
    stop_unit_if_active nvidia-fabricmanager.service || return 1
    log_blocker "service nvidia-fabricmanager.service"
    fabric_was_active=1
    return 0
}
|
||||||
|
|
||||||
|
# stop_display_stack_if_active
# For each of display-manager.service and lightdm.service: when the unit
# exists and stop_unit_if_active succeeds for it, report it via log_blocker
# and set display_was_active to 1.
# Returns 0 when at least one unit was handled that way, 1 otherwise.
stop_display_stack_if_active() {
    # Shell convention: 1 means "nothing was stopped" until proven otherwise.
    stopped=1
    for unit in display-manager.service lightdm.service; do
        unit_exists "$unit" || continue
        stop_unit_if_active "$unit" || continue
        log_blocker "service $unit"
        display_was_active=1
        stopped=0
    done
    return "$stopped"
}
|
||||||
|
|
||||||
|
try_gpu_reset() {
|
||||||
|
index="$1"
|
||||||
|
log "resetting GPU $index"
|
||||||
|
nvidia-smi -r -i "$index"
|
||||||
}
|
}
|
||||||
|
|
||||||
drain_gpu_clients() {
|
drain_gpu_clients() {
|
||||||
display_was_active=0
|
display_was_active=0
|
||||||
fabric_was_active=0
|
fabric_was_active=0
|
||||||
|
hostengine_was_active=0
|
||||||
for unit in display-manager.service lightdm.service; do
|
|
||||||
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
|
||||||
log_blocker "service $unit"
|
|
||||||
display_was_active=1
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
|
||||||
log_blocker "service nvidia-fabricmanager.service"
|
|
||||||
fabric_was_active=1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||||
@@ -98,21 +192,25 @@ drain_gpu_clients() {
|
|||||||
log "stopping nv-hostengine"
|
log "stopping nv-hostengine"
|
||||||
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
hostengine_was_active=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
for pattern in \
|
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||||
"nvidia-smi" \
|
log_blocker "service nvidia-fabricmanager.service"
|
||||||
"dcgmi" \
|
fabric_was_active=1
|
||||||
"nvvs" \
|
fi
|
||||||
"dcgmproftester" \
|
|
||||||
"all_reduce_perf" \
|
for unit in display-manager.service lightdm.service; do
|
||||||
"nvtop" \
|
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||||
"bee-gpu-burn" \
|
log_blocker "service $unit"
|
||||||
"bee-john-gpu-stress" \
|
display_was_active=1
|
||||||
"bee-nccl-gpu-stress" \
|
fi
|
||||||
"Xorg" \
|
done
|
||||||
"Xwayland"; do
|
|
||||||
kill_pattern "$pattern"
|
for dev in /dev/nvidia[0-9]*; do
|
||||||
|
[ -e "$dev" ] || continue
|
||||||
|
holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
|
||||||
|
kill_pid_list "$holders"
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -125,7 +223,7 @@ restore_gpu_clients() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
log "starting nv-hostengine"
|
log "starting nv-hostengine"
|
||||||
nv-hostengine
|
nv-hostengine
|
||||||
fi
|
fi
|
||||||
@@ -153,10 +251,60 @@ restart_drivers() {
|
|||||||
|
|
||||||
reset_gpu() {
|
reset_gpu() {
|
||||||
index="$1"
|
index="$1"
|
||||||
drain_gpu_clients
|
display_was_active=0
|
||||||
log "resetting GPU $index"
|
fabric_was_active=0
|
||||||
nvidia-smi -r -i "$index"
|
hostengine_was_active=0
|
||||||
|
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
stop_nv_hostengine_if_running || true
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
stop_fabricmanager_if_active || true
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if gpu_has_display_holders "$index"; then
|
||||||
|
stop_display_stack_if_active || true
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
if try_gpu_reset "$index"; then
|
||||||
|
restore_gpu_clients
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
holders=$(collect_gpu_holder_pids "$index")
|
||||||
|
if [ -n "$holders" ]; then
|
||||||
|
log "GPU $index still has holders after targeted drain"
|
||||||
|
kill_pid_list "$holders"
|
||||||
|
fi
|
||||||
|
try_gpu_reset "$index"
|
||||||
|
rc=$?
|
||||||
restore_gpu_clients
|
restore_gpu_clients
|
||||||
|
return "$rc"
|
||||||
}
|
}
|
||||||
|
|
||||||
cmd="${1:-}"
|
cmd="${1:-}"
|
||||||
|
|||||||
Reference in New Issue
Block a user