diff --git a/audit/internal/platform/nvidia_recover.go b/audit/internal/platform/nvidia_recover.go
new file mode 100644
index 0000000..50cad93
--- /dev/null
+++ b/audit/internal/platform/nvidia_recover.go
@@ -0,0 +1,30 @@
+package platform
+
+import (
+	"fmt"
+	"os/exec"
+	"time"
+)
+
+const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
+
+func runNvidiaRecover(args ...string) (string, error) {
+	helperArgs := append([]string{nvidiaRecoverHelper}, args...)
+	if _, err := exec.LookPath("systemd-run"); err == nil {
+		unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
+		cmdArgs := []string{
+			"systemd-run",
+			"--quiet",
+			"--pipe",
+			"--wait",
+			"--collect",
+			"--service-type=oneshot",
+			"--unit", unit,
+		}
+		cmdArgs = append(cmdArgs, helperArgs...)
+		raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
+		return string(raw), err
+	}
+	raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
+	return string(raw), err
+}
diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go
index fb15b8c..b880370 100644
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -407,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	if index < 0 {
 		return "", fmt.Errorf("gpu index must be >= 0")
 	}
-	raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
-	if len(raw) == 0 && err == nil {
-		raw = []byte("GPU reset completed.\n")
+	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
+	if strings.TrimSpace(out) == "" && err == nil {
+		out = "GPU reset completed.\n"
 	}
-	return string(raw), err
+	return out, err
 }
 
 // RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
diff --git a/audit/internal/platform/services.go b/audit/internal/platform/services.go
index 7c9d090..a234e17 100644
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
 }
 
 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
+	if name == "bee-nvidia" && action == ServiceRestart {
+		return runNvidiaRecover("restart-drivers")
+	}
 	// bee-web runs as the bee user; sudo is required to control system services.
 	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
 	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
diff --git a/iso/overlay/usr/local/bin/bee-nvidia-recover b/iso/overlay/usr/local/bin/bee-nvidia-recover
new file mode 100755
index 0000000..73cd5db
--- /dev/null
+++ b/iso/overlay/usr/local/bin/bee-nvidia-recover
@@ -0,0 +1,178 @@
+#!/bin/sh
+# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
+
+set -u
+
+log() {
+  echo "[bee-nvidia-recover] $*"
+}
+
+log_blocker() {
+  echo "[bee-nvidia-recover] blocker: $*"
+}
+
+usage() {
+  cat <<'EOF'
+usage:
+  bee-nvidia-recover restart-drivers
+  bee-nvidia-recover reset-gpu <index>
+EOF
+}
+
+unit_exists() {
+  systemctl cat "$1" >/dev/null 2>&1
+}
+
+unit_is_active() {
+  systemctl is-active --quiet "$1" 2>/dev/null
+}
+
+stop_unit_if_active() {
+  unit="$1"
+  if unit_is_active "$unit"; then
+    log "stopping $unit"
+    systemctl stop "$unit"
+    return 0
+  fi
+  return 1
+}
+
+start_unit_if_marked() {
+  unit="$1"
+  marker="$2"
+  if [ "$marker" = "1" ] && unit_exists "$unit"; then
+    log "starting $unit"
+    systemctl start "$unit"
+  fi
+}
+
+wait_for_process_exit() {
+  name="$1"
+  tries=0
+  while pgrep -x "$name" >/dev/null 2>&1; do
+    tries=$((tries + 1))
+    if [ "$tries" -ge 15 ]; then
+      log "WARN: $name is still running after stop request"
+      return 1
+    fi
+    sleep 1
+  done
+  return 0
+}
+
+kill_pattern() {
+  pattern="$1"
+  if pgrep -f "$pattern" >/dev/null 2>&1; then
+    pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
+      [ -n "$line" ] || continue
+      log_blocker "$line"
+    done
+    log "killing processes matching: $pattern"
+    pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
+    sleep 1
+    pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
+  fi
+}
+
+drain_gpu_clients() {
+  display_was_active=0
+  fabric_was_active=0
+
+  for unit in display-manager.service lightdm.service; do
+    if unit_exists "$unit" && stop_unit_if_active "$unit"; then
+      log_blocker "service $unit"
+      display_was_active=1
+    fi
+  done
+
+  if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
+    log_blocker "service nvidia-fabricmanager.service"
+    fabric_was_active=1
+  fi
+
+  if pgrep -x nv-hostengine >/dev/null 2>&1; then
+    pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
+      [ -n "$line" ] || continue
+      log_blocker "$line"
+    done
+    log "stopping nv-hostengine"
+    pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
+    wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
+  fi
+
+  for pattern in \
+    "nvidia-smi" \
+    "dcgmi" \
+    "nvvs" \
+    "dcgmproftester" \
+    "all_reduce_perf" \
+    "nvtop" \
+    "bee-gpu-burn" \
+    "bee-john-gpu-stress" \
+    "bee-nccl-gpu-stress" \
+    "Xorg" \
+    "Xwayland"; do
+    kill_pattern "$pattern"
+  done
+}
+
+restore_gpu_clients() {
+  if command -v nvidia-smi >/dev/null 2>&1; then
+    if nvidia-smi -pm 1 >/dev/null 2>&1; then
+      log "enabled NVIDIA persistence mode"
+    else
+      log "WARN: failed to enable NVIDIA persistence mode"
+    fi
+  fi
+
+  if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
+    log "starting nv-hostengine"
+    nv-hostengine
+  fi
+
+  start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
+  start_unit_if_marked display-manager.service "${display_was_active:-0}"
+  if [ "${display_was_active:-0}" = "1" ] && unit_exists lightdm.service && ! unit_is_active lightdm.service; then
+    start_unit_if_marked lightdm.service "1"
+  fi
+}
+
+restart_drivers() {
+  drain_gpu_clients
+  for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
+    if lsmod | awk '{print $1}' | grep -qx "$mod"; then
+      log "unloading module $mod"
+      rmmod "$mod"
+    fi
+  done
+  rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true
+  log "reloading NVIDIA driver stack"
+  /usr/local/bin/bee-nvidia-load; rc=$?
+  restore_gpu_clients; return "$rc"
+}
+
+reset_gpu() {
+  index="$1"
+  drain_gpu_clients
+  log "resetting GPU $index"
+  nvidia-smi -r -i "$index"; rc=$?
+  restore_gpu_clients; return "$rc"
+}
+
+cmd="${1:-}"
+case "$cmd" in
+  restart-drivers)
+    restart_drivers
+    ;;
+  reset-gpu)
+    if [ "$#" -ne 2 ]; then
+      usage >&2
+      exit 2
+    fi
+    reset_gpu "$2"
+    ;;
+  *)
+    usage >&2
+    exit 2
+    ;;
esac