Fix NVIDIA self-heal recovery flow
This commit is contained in:
30
audit/internal/platform/nvidia_recover.go
Normal file
30
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"time"
|
||||
)
|
||||
|
||||
// nvidiaRecoverHelper is the absolute path of the helper executable that
// runNvidiaRecover launches through sudo.
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
|
||||
|
||||
func runNvidiaRecover(args ...string) (string, error) {
|
||||
helperArgs := append([]string{nvidiaRecoverHelper}, args...)
|
||||
if _, err := exec.LookPath("systemd-run"); err == nil {
|
||||
unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
|
||||
cmdArgs := []string{
|
||||
"systemd-run",
|
||||
"--quiet",
|
||||
"--pipe",
|
||||
"--wait",
|
||||
"--collect",
|
||||
"--service-type=oneshot",
|
||||
"--unit", unit,
|
||||
}
|
||||
cmdArgs = append(cmdArgs, helperArgs...)
|
||||
raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
@@ -407,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
||||
if len(raw) == 0 && err == nil {
|
||||
raw = []byte("GPU reset completed.\n")
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return string(raw), err
|
||||
return out, err
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
|
||||
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
|
||||
}
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
if name == "bee-nvidia" && action == ServiceRestart {
|
||||
return runNvidiaRecover("restart-drivers")
|
||||
}
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||
|
||||
Reference in New Issue
Block a user