Add NVIDIA self-heal tools and per-GPU SAT status
This commit is contained in:
@@ -216,6 +216,74 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len=%d want 2", len(got))
|
||||
}
|
||||
if got[0].NeedsReset {
|
||||
t.Fatalf("gpu0 unexpectedly marked reset-required")
|
||||
}
|
||||
if !got[1].NeedsReset {
|
||||
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
msg, err := checkNvidiaJobHealth([]int{1})
|
||||
if err == nil {
|
||||
t.Fatal("expected health check error")
|
||||
}
|
||||
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
|
||||
t.Fatalf("unexpected message: %q", msg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
perGPU := map[int]*nvidiaGPUStatusFile{
|
||||
0: {Index: 0, RunStatus: "OK"},
|
||||
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
|
||||
}
|
||||
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
|
||||
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
if !strings.Contains(text, "run_status=FAILED") {
|
||||
t.Fatalf("missing run status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
|
||||
t.Fatalf("missing health status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
|
||||
t.Fatalf("missing failing job:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
@@ -341,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user