Refine burn UI and NVIDIA stress flows

This commit is contained in:
2026-04-05 13:43:43 +03:00
parent 25af2df23a
commit 38e79143eb
18 changed files with 825 additions and 229 deletions

View File

@@ -135,12 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
case "nvidia":
tools = append(tools, s.CheckTools([]string{
"nvidia-smi",
"dcgmi",
"nv-hostengine",
"nvidia-bug-report.sh",
"bee-gpu-burn",
"bee-john-gpu-stress",
"bee-nccl-gpu-stress",
"all_reduce_perf",
})...)
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
case "amd":
tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
@@ -155,6 +158,16 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
return tools
}
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
for _, candidate := range candidates {
path, err := exec.LookPath(candidate)
if err == nil {
return ToolStatus{Name: display, Path: path, OK: true}
}
}
return ToolStatus{Name: display}
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod")