Unify NVIDIA GPU recovery paths

This commit is contained in:
2026-04-23 20:31:41 +03:00
parent 6112094d45
commit 749fc8a94d
6 changed files with 278 additions and 82 deletions

View File

@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
}
// ResetNvidiaGPU requests a reset of the NVIDIA GPU at the given index.
//
// A negative index is rejected with an error before anything is run.
// Otherwise the index is passed, as a decimal string, to runNvidiaRecover
// together with the "reset-gpu" action. When that call reports no error and
// its output is empty or whitespace-only, a default completion message is
// returned in place of the empty output; in every other case the output and
// error from runNvidiaRecover are returned unchanged.
func (s *System) ResetNvidiaGPU(index int) (string, error) {
if index < 0 {
return "", fmt.Errorf("gpu index must be >= 0")
}
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
// Substitute a default message only when the call succeeded without output.
if strings.TrimSpace(out) == "" && err == nil {
out = "GPU reset completed.\n"
}
return out, err
// NOTE(review): the statement below is unreachable because the return above
// always exits first (go vet reports this as unreachable code). It looks like
// the old and new bodies of a diff hunk were flattened together; presumably
// the commit intends resetNvidiaGPU(index) to be the whole body. Confirm that
// resetNvidiaGPU keeps the index validation and the default message before
// deleting the lines above.
return resetNvidiaGPU(index)
}
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.