Add NVIDIA self-heal tools and per-GPU SAT status

This commit is contained in:
Mikhail Chusavitin
2026-04-07 20:20:05 +03:00
parent 93cfa78e8c
commit 531d1ca366
13 changed files with 863 additions and 14 deletions

View File

@@ -88,6 +88,37 @@ type NvidiaGPU struct {
MemoryMB int `json:"memory_mb"`
}
type NvidiaGPUStatus struct {
Index int `json:"index"`
Name string `json:"name"`
BDF string `json:"bdf,omitempty"`
Serial string `json:"serial,omitempty"`
Status string `json:"status"`
RawLine string `json:"raw_line,omitempty"`
NeedsReset bool `json:"needs_reset"`
ParseFailure bool `json:"parse_failure,omitempty"`
}
type nvidiaGPUHealth struct {
Index int
Name string
NeedsReset bool
RawLine string
ParseFailure bool
}
type nvidiaGPUStatusFile struct {
Index int
Name string
RunStatus string
Reason string
Health string
HealthRaw string
Observed bool
Selected bool
FailingJob string
}
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
type AMDGPUInfo struct {
Index int `json:"index"`
@@ -269,6 +300,72 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
return gpus, nil
}
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
out, err := satExecCommand(
"nvidia-smi",
"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
"--format=csv,noheader,nounits",
).Output()
if err != nil {
return nil, fmt.Errorf("nvidia-smi: %w", err)
}
var gpus []NvidiaGPUStatus
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
line = strings.TrimSpace(line)
if line == "" {
continue
}
parts := strings.Split(line, ",")
if len(parts) < 4 {
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
continue
}
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
if err != nil {
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
continue
}
upper := strings.ToUpper(line)
needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
status := "OK"
if needsReset {
status = "RESET_REQUIRED"
}
gpus = append(gpus, NvidiaGPUStatus{
Index: idx,
Name: strings.TrimSpace(parts[1]),
BDF: normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
Serial: strings.TrimSpace(parts[3]),
Status: status,
RawLine: line,
NeedsReset: needsReset,
})
}
sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
return gpus, nil
}
func normalizeNvidiaBusID(v string) string {
v = strings.TrimSpace(strings.ToLower(v))
parts := strings.Split(v, ":")
if len(parts) == 3 && len(parts[0]) > 4 {
parts[0] = parts[0][len(parts[0])-4:]
return strings.Join(parts, ":")
}
return v
}
func (s *System) ResetNvidiaGPU(index int) (string, error) {
if index < 0 {
return "", fmt.Errorf("gpu index must be >= 0")
}
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
if len(raw) == 0 && err == nil {
raw = []byte("GPU reset completed.\n")
}
return string(raw), err
}
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
@@ -604,7 +701,7 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
)
}
@@ -652,11 +749,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
var summary strings.Builder
stats := satStats{}
nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
perGPU := map[int]*nvidiaGPUStatusFile{}
selectedGPUIndices := map[int]struct{}{}
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
for _, job := range jobs {
if ctx.Err() != nil {
break
}
for _, idx := range job.gpuIndices {
selectedGPUIndices[idx] = struct{}{}
status := perGPU[idx]
if status == nil {
status = &nvidiaGPUStatusFile{Index: idx}
perGPU[idx] = status
}
status.Selected = true
}
cmd := make([]string, 0, len(job.cmd))
for _, arg := range job.cmd {
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
@@ -665,10 +774,37 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
var out []byte
var err error
if job.collectGPU {
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
} else {
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
if logFunc != nil {
logFunc(msg)
}
out = []byte(msg + "\n")
err = healthErr
}
}
if err == nil {
if job.collectGPU {
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
} else {
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
}
}
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
if logFunc != nil {
logFunc(msg)
}
if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
out = append(out, '\n')
}
out = append(out, []byte(msg+"\n")...)
if err == nil {
err = healthErr
}
}
}
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
@@ -679,6 +815,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
}
status, rc := classifySATResult(job.name, out, err)
stats.Add(status)
if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
for _, idx := range job.gpuIndices {
updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
}
}
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
@@ -687,6 +828,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
return "", err
}
if nvidiaPack {
if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
return "", err
}
}
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil {
@@ -695,6 +841,197 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
return archive, nil
}
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
entry := perGPU[idx]
if entry == nil {
entry = &nvidiaGPUStatusFile{Index: idx}
perGPU[idx] = entry
}
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
entry.RunStatus = status
entry.FailingJob = jobName
entry.Reason = firstLine(detail)
}
}
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
health, err := readNvidiaGPUHealth()
if err == nil {
for _, gpu := range health {
entry := perGPU[gpu.Index]
if entry == nil {
entry = &nvidiaGPUStatusFile{Index: gpu.Index}
perGPU[gpu.Index] = entry
}
entry.Name = gpu.Name
entry.Observed = true
entry.HealthRaw = gpu.RawLine
if gpu.NeedsReset {
entry.Health = "RESET_REQUIRED"
if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
entry.RunStatus = "FAILED"
if strings.TrimSpace(entry.Reason) == "" {
entry.Reason = "GPU requires reset"
}
}
} else {
entry.Health = "OK"
}
}
}
for idx := range selected {
entry := perGPU[idx]
if entry == nil {
entry = &nvidiaGPUStatusFile{Index: idx}
perGPU[idx] = entry
}
entry.Selected = true
}
var indices []int
for idx := range perGPU {
indices = append(indices, idx)
}
sort.Ints(indices)
for _, idx := range indices {
entry := perGPU[idx]
if entry.RunStatus == "" {
entry.RunStatus = overall
}
if entry.Health == "" {
entry.Health = "UNKNOWN"
}
if entry.Name == "" {
entry.Name = "unknown"
}
var body strings.Builder
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
if strings.TrimSpace(entry.FailingJob) != "" {
fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
}
if strings.TrimSpace(entry.Reason) != "" {
fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
}
if strings.TrimSpace(entry.HealthRaw) != "" {
fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
}
if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
return err
}
}
return nil
}
func nvidiaSATStatusSeverity(status string) int {
switch strings.ToUpper(strings.TrimSpace(status)) {
case "FAILED":
return 3
case "PARTIAL", "UNSUPPORTED":
return 2
case "OK":
return 1
default:
return 0
}
}
func firstLine(s string) string {
s = strings.TrimSpace(s)
if s == "" {
return ""
}
if idx := strings.IndexByte(s, '\n'); idx >= 0 {
return strings.TrimSpace(s[:idx])
}
return s
}
func nvidiaJobNeedsHealthCheck(job satJob) bool {
if job.collectGPU {
return true
}
name := strings.ToLower(strings.TrimSpace(job.name))
return strings.Contains(name, "dcgmi") ||
strings.Contains(name, "gpu-burn") ||
strings.Contains(name, "gpu-stress") ||
strings.Contains(name, "dcgmproftester")
}
func checkNvidiaJobHealth(selected []int) (string, error) {
health, err := readNvidiaGPUHealth()
if err != nil {
return "", nil
}
var bad []nvidiaGPUHealth
selectedSet := make(map[int]struct{}, len(selected))
for _, idx := range selected {
selectedSet[idx] = struct{}{}
}
for _, gpu := range health {
if len(selectedSet) > 0 {
if _, ok := selectedSet[gpu.Index]; !ok {
continue
}
}
if gpu.NeedsReset {
bad = append(bad, gpu)
}
}
if len(bad) == 0 {
return "", nil
}
lines := make([]string, 0, len(bad)+1)
lines = append(lines, "NVIDIA GPU health check failed:")
for _, gpu := range bad {
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
}
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
}
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
out, err := satExecCommand(
"nvidia-smi",
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
"--format=csv,noheader,nounits",
).Output()
if err != nil {
return nil, fmt.Errorf("nvidia-smi: %w", err)
}
return parseNvidiaGPUHealth(string(out)), nil
}
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
var gpus []nvidiaGPUHealth
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
line = strings.TrimSpace(line)
if line == "" {
continue
}
parts := strings.Split(line, ",")
if len(parts) < 2 {
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
continue
}
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
if err != nil {
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
continue
}
upper := strings.ToUpper(line)
gpus = append(gpus, nvidiaGPUHealth{
Index: idx,
Name: strings.TrimSpace(parts[1]),
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
RawLine: line,
})
}
return gpus
}
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
@@ -818,6 +1155,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
// nvidia-smi on a machine with no NVIDIA GPU
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
strings.Contains(text, "no nvidia gpu") ||
// Some NVMe firmwares start self-test but never expose progress to nvme-cli
// while waiting, so the CLI stops polling without proving device failure.
(strings.Contains(name, "self-test") &&
strings.Contains(text, "no progress for") &&
strings.Contains(text, "stop waiting")) ||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
return "UNSUPPORTED", rc
}

View File

@@ -216,6 +216,74 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
}
}
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
t.Parallel()
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
if len(got) != 2 {
t.Fatalf("len=%d want 2", len(got))
}
if got[0].NeedsReset {
t.Fatalf("gpu0 unexpectedly marked reset-required")
}
if !got[1].NeedsReset {
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
}
}
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
oldExecCommand := satExecCommand
satExecCommand = func(name string, args ...string) *exec.Cmd {
if name == "nvidia-smi" {
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
}
return exec.Command(name, args...)
}
t.Cleanup(func() { satExecCommand = oldExecCommand })
msg, err := checkNvidiaJobHealth([]int{1})
if err == nil {
t.Fatal("expected health check error")
}
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
t.Fatalf("unexpected message: %q", msg)
}
}
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
dir := t.TempDir()
oldExecCommand := satExecCommand
satExecCommand = func(name string, args ...string) *exec.Cmd {
if name == "nvidia-smi" {
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
}
return exec.Command(name, args...)
}
t.Cleanup(func() { satExecCommand = oldExecCommand })
perGPU := map[int]*nvidiaGPUStatusFile{
0: {Index: 0, RunStatus: "OK"},
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
}
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
}
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
if err != nil {
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
}
text := string(raw)
if !strings.Contains(text, "run_status=FAILED") {
t.Fatalf("missing run status:\n%s", text)
}
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
t.Fatalf("missing health status:\n%s", text)
}
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
t.Fatalf("missing failing job:\n%s", text)
}
}
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
oldLookPath := satLookPath
satLookPath = func(file string) (string, error) {
@@ -341,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
}{
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
}