Fix NVIDIA self-heal recovery flow

This commit is contained in:
Mikhail Chusavitin
2026-04-20 09:43:22 +03:00
parent 1cfabc9230
commit 84a2551dc0
4 changed files with 215 additions and 4 deletions

View File

@@ -407,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
if index < 0 {
return "", fmt.Errorf("gpu index must be >= 0")
}
-raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
-if len(raw) == 0 && err == nil {
-raw = []byte("GPU reset completed.\n")
+out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
+if strings.TrimSpace(out) == "" && err == nil {
+out = "GPU reset completed.\n"
}
-return string(raw), err
+return out, err
}
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.