Tighten support bundles and fix AMD runtime checks

This commit is contained in:
Mikhail Chusavitin
2026-03-25 19:35:25 +03:00
parent 30cf014d58
commit 9a1df9b1ba
12 changed files with 663 additions and 79 deletions

View File

@@ -16,9 +16,6 @@ var runtimeRequiredTools = []string{
"smartctl",
"nvme",
"ipmitool",
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
"dhclient",
"mount",
}
@@ -93,7 +90,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
}
}
for _, tool := range s.CheckTools(runtimeRequiredTools) {
vendor := s.DetectGPUVendor()
for _, tool := range s.runtimeToolStatuses(vendor) {
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
Name: tool.Name,
Path: tool.Path,
@@ -115,39 +113,7 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
})
}
lsmodText := commandText("lsmod")
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
health.CUDAReady = false
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
s.collectGPURuntimeHealth(vendor, &health)
if health.Status != "FAILED" && len(health.Issues) > 0 {
health.Status = "PARTIAL"
@@ -162,3 +128,87 @@ func commandText(name string, args ...string) string {
}
return string(raw)
}
// runtimeToolStatuses returns the status of every tool listed in
// runtimeRequiredTools followed by the GPU tooling that matches vendor.
//
// For "nvidia" the NVIDIA utilities are looked up through CheckTools. For
// "amd" a single "rocm-smi" entry is appended; its Path and OK fields are
// populated only when resolveROCmSMICommand succeeds with a non-empty command.
// Any other vendor value yields just the common tool list.
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
	statuses := s.CheckTools(runtimeRequiredTools)

	if vendor == "nvidia" {
		nvidiaTools := []string{
			"nvidia-smi",
			"nvidia-bug-report.sh",
			"bee-gpu-stress",
		}
		return append(statuses, s.CheckTools(nvidiaTools)...)
	}
	if vendor != "amd" {
		return statuses
	}

	rocm := ToolStatus{Name: "rocm-smi"}
	argv, err := resolveROCmSMICommand()
	if err == nil && len(argv) > 0 {
		rocm.OK = true
		rocm.Path = argv[0]
		// When the command's second element is the rocm_smi.py script, report
		// that script path instead of the first element.
		if len(argv) > 1 && strings.HasSuffix(argv[1], "rocm_smi.py") {
			rocm.Path = argv[1]
		}
	}
	return append(statuses, rocm)
}
// collectGPURuntimeHealth runs the vendor-specific GPU runtime checks and
// records the outcome in health.
//
// vendor selects the checks: "nvidia" inspects the NVIDIA kernel modules,
// nvidia-smi and bee-gpu-stress; "amd" inspects the amdgpu/amdkfd modules and
// ROCm SMI. Any other value leaves health untouched and runs no commands.
//
// health.DriverReady is assigned from the checks, health.CUDAReady is only
// ever set to true here, and a "warning" issue is appended to health.Issues
// for every problem that is detected.
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
	if vendor != "nvidia" && vendor != "amd" {
		// No checks exist for this vendor, so do not spawn lsmod for nothing.
		return
	}

	lsmodText := commandText("lsmod")
	switch vendor {
	case "nvidia":
		// The trailing space keeps module names that merely start with
		// "nvidia" (such as nvidia_modeset) from counting as the core module.
		health.DriverReady = strings.Contains(lsmodText, "nvidia ")
		if !health.DriverReady {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "nvidia_kernel_module_missing",
				Severity:    "warning",
				Description: "NVIDIA kernel module is not loaded.",
			})
		}
		if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "nvidia_modeset_failed",
				Severity:    "warning",
				Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
			})
		}
		// A successful, non-empty `nvidia-smi -L` marks the driver as ready
		// even when the lsmod check above did not find the module.
		// NOTE(review): the module-missing issue appended above stays in
		// health.Issues in that case — confirm this is intended.
		if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
			health.DriverReady = true
		}
		// exec.LookPath performs the same PATH resolution exec.Command uses,
		// so no shell has to be spawned just to probe for the binary.
		if _, lookErr := exec.LookPath("bee-gpu-stress"); lookErr == nil {
			out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
			if err == nil {
				health.CUDAReady = true
			} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
				// Other bee-gpu-stress failures are deliberately not reported.
				health.Issues = append(health.Issues, schema.RuntimeIssue{
					Code:        "cuda_runtime_not_ready",
					Severity:    "warning",
					Description: "CUDA runtime is not ready for GPU SAT.",
				})
			}
		}
	case "amd":
		health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
		if !health.DriverReady {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "amdgpu_kernel_module_missing",
				Severity:    "warning",
				Description: "AMD GPU driver is not loaded.",
			})
		}
		// A non-empty ROCm SMI answer marks both the driver and the compute
		// runtime as ready; for AMD the CUDAReady field carries that flag.
		out, err := runROCmSMI("--showproductname", "--csv")
		if err == nil && strings.TrimSpace(string(out)) != "" {
			health.CUDAReady = true
			health.DriverReady = true
			return
		}
		health.Issues = append(health.Issues, schema.RuntimeIssue{
			Code:        "rocm_smi_unavailable",
			Severity:    "warning",
			Description: "ROCm SMI is not available for AMD GPU SAT.",
		})
	}
}