Unify NVIDIA GPU recovery paths
This commit is contained in:
@@ -60,35 +60,129 @@ wait_for_process_exit() {
|
||||
return 0
|
||||
}
|
||||
|
||||
kill_pattern() {
|
||||
pattern="$1"
|
||||
if pgrep -f "$pattern" >/dev/null 2>&1; then
|
||||
pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
|
||||
# Log a one-line "blocker" description for a single PID.
# Uses ps output (pid, comm, args) when available; falls back to the bare
# PID when the process has already exited or ps has no record of it.
log_pid_details() {
  pid="$1"
  line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
  case "$line" in
    "") log_blocker "pid $pid" ;;
    *)  log_blocker "$line" ;;
  esac
}
|
||||
|
||||
# Print the PIDs of compute applications running on GPU <index>, one per
# line. Silently produces nothing when nvidia-smi is not installed or the
# query fails; always returns 0.
collect_gpu_compute_pids() {
  gpu_index="$1"
  command -v nvidia-smi >/dev/null 2>&1 || return 0
  nvidia-smi --id="$gpu_index" \
      --query-compute-apps=pid \
      --format=csv,noheader,nounits 2>/dev/null \
    | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
    | grep -E '^[0-9]+$' \
    || true
}
|
||||
|
||||
# Print the PIDs holding /dev/nvidia<index> open, one per line, as reported
# by fuser. Produces nothing when the device node does not exist or fuser is
# unavailable. The sed strips fuser's access-mode suffix letters (e.g. "123m").
collect_gpu_device_pids() {
  node="/dev/nvidia$1"
  [ -e "$node" ] || return 0
  command -v fuser >/dev/null 2>&1 || return 0
  fuser "$node" 2>/dev/null \
    | tr ' ' '\n' \
    | sed 's/[^0-9].*$//' \
    | grep -E '^[0-9]+$' \
    || true
}
|
||||
|
||||
# Print the deduplicated union of compute-app PIDs and device-node holder
# PIDs for GPU <index>, one PID per line.
collect_gpu_holder_pids() {
  target="$1"
  {
    collect_gpu_compute_pids "$target"
    collect_gpu_device_pids "$target"
  } | awk 'NF' | sort -u
}
|
||||
|
||||
# Log and terminate every PID in the whitespace-separated list $1.
# Sends TERM to all of them first, waits one second, then KILLs any
# survivor. A missing/already-dead PID is never an error. No-op on an
# empty list.
kill_pid_list() {
  pids="$1"
  [ -n "$pids" ] || return 0

  for pid in $pids; do
    log_pid_details "$pid"
  done
  log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"

  for pid in $pids; do
    kill -TERM "$pid" >/dev/null 2>&1 || true
  done
  sleep 1
  for pid in $pids; do
    # Escalate only for processes that survived the TERM grace period.
    kill -0 "$pid" >/dev/null 2>&1 || continue
    log "forcing GPU holder PID $pid to exit"
    kill -KILL "$pid" >/dev/null 2>&1 || true
  done
}
|
||||
|
||||
# Return 0 when at least one process holding /dev/nvidia<index> looks like
# part of the display stack (X server, Xwayland, or gnome-shell), 1 otherwise
# (including when nothing holds the device at all).
gpu_has_display_holders() {
  holders=$(collect_gpu_device_pids "$1")
  [ -n "$holders" ] || return 1
  for pid in $holders; do
    comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
    if [ "$comm" = "Xorg" ] || [ "$comm" = "Xwayland" ] \
        || [ "$comm" = "X" ] || [ "$comm" = "gnome-shell" ]; then
      return 0
    fi
  done
  return 1
}
|
||||
|
||||
# Stop the DCGM host engine if it is running.
# Logs each matching process as a blocker, sends TERM, waits for exit and
# escalates to KILL on timeout. Sets hostengine_was_active=1 so
# restore_gpu_clients can restart it later.
# Returns 0 when the engine was running (and was stopped), 1 otherwise.
#
# FIX: removed stray lines left over from kill_pattern() that referenced an
# unset $pattern — `pkill -TERM -f "$pattern"` with an empty pattern matches
# (and would kill) arbitrary processes.
stop_nv_hostengine_if_running() {
  if pgrep -x nv-hostengine >/dev/null 2>&1; then
    pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
      [ -n "$line" ] || continue
      log_blocker "$line"
    done
    log "stopping nv-hostengine"
    pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
    wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
    hostengine_was_active=1
    return 0
  fi
  return 1
}
|
||||
|
||||
# Stop nvidia-fabricmanager.service when the unit exists and is active.
# Records it as a blocker and sets fabric_was_active=1 so it can be
# restarted during restore. Returns 0 when it was stopped, 1 otherwise.
stop_fabricmanager_if_active() {
  unit_exists nvidia-fabricmanager.service || return 1
  stop_unit_if_active nvidia-fabricmanager.service || return 1
  log_blocker "service nvidia-fabricmanager.service"
  fabric_was_active=1
  return 0
}
|
||||
|
||||
# Stop the display-manager units (display-manager.service, lightdm.service)
# that exist and are active. Records each stopped unit as a blocker and sets
# display_was_active=1. Returns 0 when at least one unit was stopped,
# 1 when none were.
stop_display_stack_if_active() {
  stopped=1
  for unit in display-manager.service lightdm.service; do
    unit_exists "$unit" || continue
    stop_unit_if_active "$unit" || continue
    log_blocker "service $unit"
    display_was_active=1
    stopped=0
  done
  return "$stopped"
}
|
||||
|
||||
# Attempt a driver-level reset of GPU <index>; propagates nvidia-smi's
# exit status so callers can decide whether to escalate.
try_gpu_reset() {
  log "resetting GPU $1"
  nvidia-smi -r -i "$1"
}
|
||||
|
||||
# Drain everything that might hold the NVIDIA GPUs before a driver restart:
# display stack, fabric manager, DCGM host engine, well-known GPU tools, and
# finally any remaining holders of the /dev/nvidia* device nodes.
# Sets display_was_active / fabric_was_active / hostengine_was_active so
# restore_gpu_clients can bring the same services back afterwards.
#
# FIX: this block was corrupted by a diff-render merge — the fabricmanager
# and display-unit stanzas appeared twice (old and new hunk sides) and a raw
# nv-hostengine stanza duplicated stop_nv_hostengine_if_running. Rebuilt to
# delegate to the shared stop_* helpers defined above.
drain_gpu_clients() {
  display_was_active=0
  fabric_was_active=0
  hostengine_was_active=0

  # Helpers return 1 when there was nothing to stop; that is not an error.
  stop_display_stack_if_active || true
  stop_fabricmanager_if_active || true
  stop_nv_hostengine_if_running || true

  # Kill well-known GPU consumers by command-line pattern.
  for pattern in \
    "nvidia-smi" \
    "dcgmi" \
    "nvvs" \
    "dcgmproftester" \
    "all_reduce_perf" \
    "nvtop" \
    "bee-gpu-burn" \
    "bee-john-gpu-stress" \
    "bee-nccl-gpu-stress" \
    "Xorg" \
    "Xwayland"; do
    kill_pattern "$pattern"
  done

  # Sweep anything still holding a device node.
  for dev in /dev/nvidia[0-9]*; do
    [ -e "$dev" ] || continue
    holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
    kill_pid_list "$holders"
  done
}
|
||||
|
||||
@@ -125,7 +223,7 @@ restore_gpu_clients() {
|
||||
fi
|
||||
fi
|
||||
|
||||
if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
log "starting nv-hostengine"
|
||||
nv-hostengine
|
||||
fi
|
||||
@@ -153,10 +251,60 @@ restart_drivers() {
|
||||
|
||||
# Kill any processes holding GPU <index> (compute apps + device-node
# holders), then attempt a reset. No-op when nothing holds the GPU.
_reset_gpu_drain_holders() {
  holders=$(collect_gpu_holder_pids "$1")
  if [ -n "$holders" ]; then
    kill_pid_list "$holders"
  fi
}

# Reset GPU <index> with progressively more aggressive draining:
#   1. kill current holders, try a reset;
#   2. stop nv-hostengine, kill holders, retry;
#   3. stop the fabric manager, kill holders, retry;
#   4. stop the display stack (only when display processes hold the GPU), retry;
#   5. final holder sweep and one last reset attempt.
# Always calls restore_gpu_clients before returning so previously-active
# services come back; returns the status of the last reset attempt.
#
# FIX: removed leftover pre-refactor lines that unconditionally ran
# drain_gpu_clients and an extra blind `nvidia-smi -r` before the staged
# escalation, which reset the GPU twice and drained unrelated services.
reset_gpu() {
  index="$1"
  display_was_active=0
  fabric_was_active=0
  hostengine_was_active=0

  _reset_gpu_drain_holders "$index"
  if try_gpu_reset "$index"; then
    restore_gpu_clients
    return 0
  fi

  stop_nv_hostengine_if_running || true
  _reset_gpu_drain_holders "$index"
  if try_gpu_reset "$index"; then
    restore_gpu_clients
    return 0
  fi

  stop_fabricmanager_if_active || true
  _reset_gpu_drain_holders "$index"
  if try_gpu_reset "$index"; then
    restore_gpu_clients
    return 0
  fi

  if gpu_has_display_holders "$index"; then
    stop_display_stack_if_active || true
    _reset_gpu_drain_holders "$index"
    if try_gpu_reset "$index"; then
      restore_gpu_clients
      return 0
    fi
  fi

  holders=$(collect_gpu_holder_pids "$index")
  if [ -n "$holders" ]; then
    log "GPU $index still has holders after targeted drain"
    kill_pid_list "$holders"
  fi
  try_gpu_reset "$index"
  rc=$?
  restore_gpu_clients
  return "$rc"
}
|
||||
|
||||
# Subcommand selector: first CLI argument, empty string when none supplied.
cmd="${1:-}"
|
||||
|
||||
Reference in New Issue
Block a user