Add NVIDIA stress loader selection and DCGM 4 support
This commit is contained in:
@@ -136,7 +136,8 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-stress",
|
||||
"bee-gpu-burn",
|
||||
"bee-john-gpu-stress",
|
||||
})...)
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
@@ -176,8 +177,8 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
|
||||
Reference in New Issue
Block a user