Tighten support bundles and fix AMD runtime checks

This commit is contained in:
Mikhail Chusavitin
2026-03-25 19:35:25 +03:00
parent 30cf014d58
commit 9a1df9b1ba
12 changed files with 663 additions and 79 deletions

View File

@@ -173,11 +173,20 @@ func (a *App) RuntimeHealthResult() ActionResult {
if err != nil { if err != nil {
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."} return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
} }
driverLabel := "Driver ready"
accelLabel := "CUDA ready"
switch a.sat.DetectGPUVendor() {
case "amd":
driverLabel = "AMDGPU ready"
accelLabel = "ROCm SMI ready"
case "nvidia":
driverLabel = "NVIDIA ready"
}
var body strings.Builder var body strings.Builder
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN")) fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir)) fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
fmt.Fprintf(&body, "Driver ready: %t\n", health.DriverReady) fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
fmt.Fprintf(&body, "CUDA ready: %t\n", health.CUDAReady) fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN")) fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
if len(health.Issues) > 0 { if len(health.Issues) > 0 {
body.WriteString("\n\nIssues:\n") body.WriteString("\n\nIssues:\n")
@@ -238,9 +247,9 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) { func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportSupportBundle(target) path, err := a.ExportSupportBundle(target)
body := "Support bundle exported." body := "Support bundle exported. USB target unmounted and safe to remove."
if path != "" { if path != "" {
body = "Support bundle exported to " + path body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
} }
return ActionResult{Title: "Export support bundle", Body: body}, err return ActionResult{Title: "Export support bundle", Body: body}, err
} }

View File

@@ -1,9 +1,12 @@
package app package app
import ( import (
"archive/tar"
"compress/gzip"
"context" "context"
"encoding/json" "encoding/json"
"errors" "errors"
"io"
"os" "os"
"path/filepath" "path/filepath"
"testing" "testing"
@@ -57,13 +60,22 @@ func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (str
return f.serviceDoFn(name, action) return f.serviceDoFn(name, action)
} }
type fakeExports struct{} type fakeExports struct {
listTargetsFn func() ([]platform.RemovableTarget, error)
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
}
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) { func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
if f.listTargetsFn != nil {
return f.listTargetsFn()
}
return nil, nil return nil, nil
} }
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) { func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
if f.exportToTargetFn != nil {
return f.exportToTargetFn(src, target)
}
return "", nil return "", nil
} }
@@ -97,10 +109,14 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
} }
type fakeSAT struct { type fakeSAT struct {
runNvidiaFn func(string) (string, error) runNvidiaFn func(string) (string, error)
runMemoryFn func(string) (string, error) runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error) runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error) runCPUFn func(string, int) (string, error)
detectVendorFn func() string
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
runAMDPackFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
} }
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) { func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
@@ -112,6 +128,9 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
} }
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) { func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
if f.listNvidiaGPUsFn != nil {
return f.listNvidiaGPUsFn()
}
return nil, nil return nil, nil
} }
@@ -130,11 +149,26 @@ func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string,
return "", nil return "", nil
} }
func (f fakeSAT) DetectGPUVendor() string { return "" } func (f fakeSAT) DetectGPUVendor() string {
if f.detectVendorFn != nil {
return f.detectVendorFn()
}
return ""
}
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return nil, nil } func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
if f.listAMDGPUsFn != nil {
return f.listAMDGPUsFn()
}
return nil, nil
}
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) { return "", nil } func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
if f.runAMDPackFn != nil {
return f.runAMDPackFn(baseDir)
}
return "", nil
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) { func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel() t.Parallel()
@@ -394,6 +428,44 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
} }
} }
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
oldExportDir := DefaultExportDir
DefaultExportDir = tmp
t.Cleanup(func() { DefaultExportDir = oldExportDir })
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
t.Fatalf("write bee-audit.json: %v", err)
}
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
t.Fatalf("write bee-audit.log: %v", err)
}
a := &App{
exports: fakeExports{
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
if filepath.Base(src) == "" {
t.Fatalf("expected non-empty source path")
}
return "/media/bee/" + filepath.Base(src), nil
},
},
}
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
if err != nil {
t.Fatalf("ExportSupportBundleResult error: %v", err)
}
if result.Title != "Export support bundle" {
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
}
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
}
}
func TestRunNvidiaAcceptancePackResult(t *testing.T) { func TestRunNvidiaAcceptancePackResult(t *testing.T) {
t.Parallel() t.Parallel()
@@ -516,6 +588,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil { if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
t.Fatal(err) t.Fatal(err)
} }
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
t.Fatal(err)
}
archive, err := BuildSupportBundle(exportDir) archive, err := BuildSupportBundle(exportDir)
if err != nil { if err != nil {
@@ -524,6 +599,44 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if _, err := os.Stat(archive); err != nil { if _, err := os.Stat(archive); err != nil {
t.Fatalf("archive stat: %v", err) t.Fatalf("archive stat: %v", err)
} }
file, err := os.Open(archive)
if err != nil {
t.Fatalf("open archive: %v", err)
}
defer file.Close()
gzr, err := gzip.NewReader(file)
if err != nil {
t.Fatalf("gzip reader: %v", err)
}
defer gzr.Close()
tr := tar.NewReader(gzr)
var names []string
for {
hdr, err := tr.Next()
if errors.Is(err, io.EOF) {
break
}
if err != nil {
t.Fatalf("read tar entry: %v", err)
}
names = append(names, hdr.Name)
}
var foundRaw bool
for _, name := range names {
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
foundRaw = true
}
if contains(name, "/export/bee-sat/memory-run.tar.gz") {
t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
}
}
if !foundRaw {
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
}
} }
func TestMainBanner(t *testing.T) { func TestMainBanner(t *testing.T) {
@@ -600,6 +713,44 @@ func TestMainBanner(t *testing.T) {
} }
} }
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
tmp := t.TempDir()
oldRuntimePath := DefaultRuntimeJSONPath
DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })
raw, err := json.Marshal(schema.RuntimeHealth{
Status: "OK",
ExportDir: "/appdata/bee/export",
DriverReady: true,
CUDAReady: true,
NetworkStatus: "OK",
})
if err != nil {
t.Fatalf("marshal runtime health: %v", err)
}
if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
t.Fatalf("write runtime health: %v", err)
}
a := &App{
sat: fakeSAT{
detectVendorFn: func() string { return "amd" },
},
}
result := a.RuntimeHealthResult()
if !contains(result.Body, "AMDGPU ready: true") {
t.Fatalf("body missing AMD driver label:\n%s", result.Body)
}
if !contains(result.Body, "ROCm SMI ready: true") {
t.Fatalf("body missing ROCm label:\n%s", result.Body)
}
if contains(result.Body, "CUDA ready") {
t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
}
}
func intPtr(v int) *int { return &v } func intPtr(v int) *int { return &v }
func contains(haystack, needle string) bool { func contains(haystack, needle string) bool {

View File

@@ -56,7 +56,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
} }
defer os.RemoveAll(stageRoot) defer os.RemoveAll(stageRoot)
if err := copyDirContents(exportDir, filepath.Join(stageRoot, "export")); err != nil { if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
return "", err return "", err
} }
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil { if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
@@ -214,6 +214,40 @@ func copyDirContents(srcDir, dstDir string) error {
return nil return nil
} }
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
if cleanRel == "" {
return true
}
if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
return false
}
if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
return false
}
return true
})
}
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
entries, err := os.ReadDir(srcDir)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
for _, entry := range entries {
src := filepath.Join(srcDir, entry.Name())
dst := filepath.Join(dstDir, entry.Name())
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
return err
}
}
return nil
}
func copyPath(src, dst string) error { func copyPath(src, dst string) error {
info, err := os.Stat(src) info, err := os.Stat(src)
if err != nil { if err != nil {
@@ -254,6 +288,36 @@ func copyPath(src, dst string) error {
return err return err
} }
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
info, err := os.Stat(src)
if err != nil {
return err
}
rel, err := filepath.Rel(rootSrc, src)
if err != nil {
return err
}
if keep != nil && !keep(rel, info) {
return nil
}
if info.IsDir() {
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
return err
}
entries, err := os.ReadDir(src)
if err != nil {
return err
}
for _, entry := range entries {
if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
return err
}
}
return nil
}
return copyPath(src, dst)
}
func createSupportTarGz(dst, srcDir string) error { func createSupportTarGz(dst, srcDir string) error {
file, err := os.Create(dst) file, err := os.Create(dst)
if err != nil { if err != nil {

View File

@@ -9,8 +9,10 @@ import (
"strings" "strings"
) )
var exportExecCommand = exec.Command
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) { func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output() raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -52,7 +54,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
return out, nil return out, nil
} }
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) { func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
if src == "" || target.Device == "" { if src == "" || target.Device == "" {
return "", fmt.Errorf("source and target are required") return "", fmt.Errorf("source and target are required")
} }
@@ -62,20 +64,39 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
mountpoint := strings.TrimSpace(target.Mountpoint) mountpoint := strings.TrimSpace(target.Mountpoint)
mountedHere := false mountedHere := false
mounted := mountpoint != ""
if mountpoint == "" { if mountpoint == "" {
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device)) mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
if err := os.MkdirAll(mountpoint, 0755); err != nil { if err := os.MkdirAll(mountpoint, 0755); err != nil {
return "", err return "", err
} }
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil { if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
_ = os.Remove(mountpoint) _ = os.Remove(mountpoint)
return string(raw), err return string(raw), err
} }
mountedHere = true mountedHere = true
mounted = true
} }
defer func() {
if !mounted {
return
}
_ = exportExecCommand("sync").Run()
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
msg := strings.TrimSpace(string(raw))
if msg == "" {
retErr = err
} else {
retErr = fmt.Errorf("%s: %w", msg, err)
}
}
if mountedHere {
_ = os.Remove(mountpoint)
}
}()
filename := filepath.Base(src) filename := filepath.Base(src)
dst := filepath.Join(mountpoint, filename) dst = filepath.Join(mountpoint, filename)
data, err := os.ReadFile(src) data, err := os.ReadFile(src)
if err != nil { if err != nil {
return "", err return "", err
@@ -83,12 +104,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
if err := os.WriteFile(dst, data, 0644); err != nil { if err := os.WriteFile(dst, data, 0644); err != nil {
return "", err return "", err
} }
_ = exec.Command("sync").Run()
if mountedHere {
_ = exec.Command("umount", mountpoint).Run()
_ = os.Remove(mountpoint)
}
return dst, nil return dst, nil
} }

View File

@@ -0,0 +1,56 @@
package platform
import (
"os"
"os/exec"
"path/filepath"
"testing"
)
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt")
if err := os.MkdirAll(mountpoint, 0755); err != nil {
t.Fatalf("mkdir mountpoint: %v", err)
}
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
t.Fatalf("write src: %v", err)
}
var calls [][]string
oldExec := exportExecCommand
exportExecCommand = func(name string, args ...string) *exec.Cmd {
calls = append(calls, append([]string{name}, args...))
return exec.Command("sh", "-c", "exit 0")
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
dst, err := s.ExportFileToTarget(src, RemovableTarget{
Device: "/dev/sdb1",
Mountpoint: mountpoint,
})
if err != nil {
t.Fatalf("ExportFileToTarget error: %v", err)
}
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
t.Fatalf("dst=%q want %q", got, want)
}
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
t.Fatalf("exported file missing: %v", err)
}
foundUmount := false
for _, call := range calls {
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
foundUmount = true
break
}
}
if !foundUmount {
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
}
}

View File

@@ -16,9 +16,6 @@ var runtimeRequiredTools = []string{
"smartctl", "smartctl",
"nvme", "nvme",
"ipmitool", "ipmitool",
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
"dhclient", "dhclient",
"mount", "mount",
} }
@@ -93,7 +90,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
} }
} }
for _, tool := range s.CheckTools(runtimeRequiredTools) { vendor := s.DetectGPUVendor()
for _, tool := range s.runtimeToolStatuses(vendor) {
health.Tools = append(health.Tools, schema.RuntimeToolStatus{ health.Tools = append(health.Tools, schema.RuntimeToolStatus{
Name: tool.Name, Name: tool.Name,
Path: tool.Path, Path: tool.Path,
@@ -115,39 +113,7 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
}) })
} }
lsmodText := commandText("lsmod") s.collectGPURuntimeHealth(vendor, &health)
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
health.CUDAReady = false
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
if health.Status != "FAILED" && len(health.Issues) > 0 { if health.Status != "FAILED" && len(health.Issues) > 0 {
health.Status = "PARTIAL" health.Status = "PARTIAL"
@@ -162,3 +128,87 @@ func commandText(name string, args ...string) string {
} }
return string(raw) return string(raw)
} }
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
tools := s.CheckTools(runtimeRequiredTools)
switch vendor {
case "nvidia":
tools = append(tools, s.CheckTools([]string{
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
})...)
case "amd":
tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
tool.Path = cmd[0]
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
tool.Path = cmd[1]
}
tool.OK = true
}
tools = append(tools, tool)
}
return tools
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod")
switch vendor {
case "nvidia":
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
case "amd":
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "amdgpu_kernel_module_missing",
Severity: "warning",
Description: "AMD GPU driver is not loaded.",
})
}
out, err := runROCmSMI("--showproductname", "--csv")
if err == nil && strings.TrimSpace(string(out)) != "" {
health.CUDAReady = true
health.DriverReady = true
return
}
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "rocm_smi_unavailable",
Severity: "warning",
Description: "ROCm SMI is not available for AMD GPU SAT.",
})
}
}

View File

@@ -4,6 +4,7 @@ import (
"archive/tar" "archive/tar"
"compress/gzip" "compress/gzip"
"context" "context"
"errors"
"fmt" "fmt"
"io" "io"
"os" "os"
@@ -15,6 +16,22 @@ import (
"time" "time"
) )
var (
satExecCommand = exec.Command
satLookPath = exec.LookPath
satGlob = filepath.Glob
satStat = os.Stat
rocmSMIExecutableGlobs = []string{
"/opt/rocm/bin/rocm-smi",
"/opt/rocm-*/bin/rocm-smi",
}
rocmSMIScriptGlobs = []string{
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
}
)
// NvidiaGPU holds basic GPU info from nvidia-smi. // NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct { type NvidiaGPU struct {
Index int Index int
@@ -41,7 +58,7 @@ func (s *System) DetectGPUVendor() string {
// ListAMDGPUs returns AMD GPUs visible to rocm-smi. // ListAMDGPUs returns AMD GPUs visible to rocm-smi.
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) { func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output() out, err := runROCmSMI("--showproductname", "--csv")
if err != nil { if err != nil {
return nil, fmt.Errorf("rocm-smi: %w", err) return nil, fmt.Errorf("rocm-smi: %w", err)
} }
@@ -337,12 +354,22 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) { func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
start := time.Now().UTC() start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog, appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name), fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(cmd, " "), "cmd: "+strings.Join(resolvedCmd, " "),
) )
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
"rc: 1",
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
"",
)
return []byte(err.Error() + "\n"), err
}
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...) c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
if len(env) > 0 { if len(env) > 0 {
c.Env = append(os.Environ(), env...) c.Env = append(os.Environ(), env...)
} }
@@ -362,19 +389,11 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
} }
func listStorageDevices() ([]string, error) { func listStorageDevices() ([]string, error) {
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output() out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
if err != nil { if err != nil {
return nil, err return nil, err
} }
var devices []string return parseStorageDevices(string(out)), nil
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
fields := strings.Fields(strings.TrimSpace(line))
if len(fields) != 2 || fields[1] != "disk" {
continue
}
devices = append(devices, "/dev/"+fields[0])
}
return devices, nil
} }
func storageSATCommands(devPath string) []satJob { func storageSATCommands(devPath string) []satJob {
@@ -445,12 +464,22 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) { func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
start := time.Now().UTC() start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog, appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name), fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(cmd, " "), "cmd: "+strings.Join(resolvedCmd, " "),
) )
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
"rc: 1",
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
"",
)
return []byte(err.Error() + "\n"), err
}
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput() out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
rc := 0 rc := 0
if err != nil { if err != nil {
@@ -465,6 +494,91 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
return out, err return out, err
} }
func runROCmSMI(args ...string) ([]byte, error) {
cmd, err := resolveROCmSMICommand(args...)
if err != nil {
return nil, err
}
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
}
func resolveSATCommand(cmd []string) ([]string, error) {
if len(cmd) == 0 {
return nil, errors.New("empty SAT command")
}
if cmd[0] != "rocm-smi" {
return cmd, nil
}
return resolveROCmSMICommand(cmd[1:]...)
}
func resolveROCmSMICommand(args ...string) ([]string, error) {
if path, err := satLookPath("rocm-smi"); err == nil {
return append([]string{path}, args...), nil
}
for _, path := range rocmSMIExecutableCandidates() {
return append([]string{path}, args...), nil
}
pythonPath, pyErr := satLookPath("python3")
if pyErr == nil {
for _, script := range rocmSMIScriptCandidates() {
cmd := []string{pythonPath, script}
cmd = append(cmd, args...)
return cmd, nil
}
}
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
}
func rocmSMIExecutableCandidates() []string {
return expandExistingPaths(rocmSMIExecutableGlobs)
}
func rocmSMIScriptCandidates() []string {
return expandExistingPaths(rocmSMIScriptGlobs)
}
func expandExistingPaths(patterns []string) []string {
seen := make(map[string]struct{})
var paths []string
for _, pattern := range patterns {
matches, err := satGlob(pattern)
if err != nil {
continue
}
sort.Strings(matches)
for _, match := range matches {
if _, err := satStat(match); err != nil {
continue
}
if _, ok := seen[match]; ok {
continue
}
seen[match] = struct{}{}
paths = append(paths, match)
}
}
return paths
}
func parseStorageDevices(raw string) []string {
var devices []string
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
fields := strings.Fields(strings.TrimSpace(line))
if len(fields) < 2 || fields[1] != "disk" {
continue
}
if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
continue
}
devices = append(devices, "/dev/"+fields[0])
}
return devices
}
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background. // runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir. // On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) { func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {

View File

@@ -3,6 +3,8 @@ package platform
import ( import (
"errors" "errors"
"os" "os"
"os/exec"
"path/filepath"
"testing" "testing"
) )
@@ -91,3 +93,90 @@ func TestClassifySATResult(t *testing.T) {
}) })
} }
} }
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
t.Parallel()
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
got := parseStorageDevices(raw)
want := []string{"/dev/nvme0n1", "/dev/sdb"}
if len(got) != len(want) {
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
}
}
}
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
t.Setenv("PATH", t.TempDir())
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
t.Fatalf("write rocm-smi: %v", err)
}
cmd, err := resolveROCmSMICommand("--showproductname")
if err != nil {
t.Fatalf("resolveROCmSMICommand error: %v", err)
}
if len(cmd) != 2 {
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
}
if cmd[0] != toolPath {
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
}
}
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
tmp := t.TempDir()
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
t.Fatalf("mkdir: %v", err)
}
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
t.Fatalf("write rocm-smi: %v", err)
}
oldGlob := rocmSMIExecutableGlobs
oldScriptGlobs := rocmSMIScriptGlobs
rocmSMIExecutableGlobs = []string{execPath}
rocmSMIScriptGlobs = nil
t.Cleanup(func() {
rocmSMIExecutableGlobs = oldGlob
rocmSMIScriptGlobs = oldScriptGlobs
})
t.Setenv("PATH", "")
cmd, err := resolveROCmSMICommand("--showallinfo")
if err != nil {
t.Fatalf("resolveROCmSMICommand error: %v", err)
}
if len(cmd) != 2 {
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
}
if cmd[0] != execPath {
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
}
}
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
oldLookPath := satLookPath
oldExecGlobs := rocmSMIExecutableGlobs
oldScriptGlobs := rocmSMIScriptGlobs
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
rocmSMIExecutableGlobs = nil
rocmSMIScriptGlobs = nil
t.Cleanup(func() {
satLookPath = oldLookPath
rocmSMIExecutableGlobs = oldExecGlobs
rocmSMIScriptGlobs = oldScriptGlobs
})
if _, err := runROCmSMI("--showproductname"); err == nil {
t.Fatal("expected missing rocm-smi error")
}
}

View File

@@ -24,15 +24,23 @@ var techDumpFixedCommands = []struct {
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"}, {Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"}, {Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"}, {Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
}
var techDumpNvidiaCommands = []struct {
Name string
Args []string
File string
}{
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"}, {Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"}, {Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
} }
type lsblkDumpRoot struct { type lsblkDumpRoot struct {
Blockdevices []struct { Blockdevices []struct {
Name string `json:"name"` Name string `json:"name"`
Type string `json:"type"` Type string `json:"type"`
Tran string `json:"tran"`
} `json:"blockdevices"` } `json:"blockdevices"`
} }
@@ -50,6 +58,15 @@ func (s *System) CaptureTechnicalDump(baseDir string) error {
for _, cmd := range techDumpFixedCommands { for _, cmd := range techDumpFixedCommands {
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...) writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
} }
switch s.DetectGPUVendor() {
case "nvidia":
for _, cmd := range techDumpNvidiaCommands {
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
}
case "amd":
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
}
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) { for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev) writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
@@ -69,6 +86,14 @@ func writeCommandDump(path, name string, args ...string) {
_ = os.WriteFile(path, out, 0644) _ = os.WriteFile(path, out, 0644)
} }
func writeROCmSMIDump(path string, args ...string) {
out, err := runROCmSMI(args...)
if err != nil && len(out) == 0 {
return
}
_ = os.WriteFile(path, out, 0644)
}
func lsblkDumpDevices(path string) []string { func lsblkDumpDevices(path string) []string {
raw, err := os.ReadFile(path) raw, err := os.ReadFile(path)
if err != nil { if err != nil {
@@ -80,6 +105,9 @@ func lsblkDumpDevices(path string) []string {
} }
var devices []string var devices []string
for _, dev := range root.Blockdevices { for _, dev := range root.Blockdevices {
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
continue
}
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" { if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
devices = append(devices, strings.TrimSpace(dev.Name)) devices = append(devices, strings.TrimSpace(dev.Name))
} }

View File

@@ -12,12 +12,12 @@ func TestLSBLKDumpDevices(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
path := filepath.Join(dir, "lsblk.json") path := filepath.Join(dir, "lsblk.json")
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk"}]}`), 0644); err != nil { if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
t.Fatalf("write lsblk fixture: %v", err) t.Fatalf("write lsblk fixture: %v", err)
} }
got := lsblkDumpDevices(path) got := lsblkDumpDevices(path)
want := []string{"nvme0n1", "sda"} want := []string{"nvme0n1", "sdb"}
if !reflect.DeepEqual(got, want) { if !reflect.DeepEqual(got, want) {
t.Fatalf("lsblkDumpDevices=%v want %v", got, want) t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
} }

View File

@@ -60,6 +60,14 @@ apt-get update -qq
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring # rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
echo "=== AMD ROCm: rocm-smi installed ===" echo "=== AMD ROCm: rocm-smi installed ==="
if [ -x /opt/rocm/bin/rocm-smi ]; then
ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi
else
candidate="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
if [ -n "${candidate}" ]; then
ln -sf "${candidate}" /usr/local/bin/rocm-smi
fi
fi
rocm-smi --version 2>/dev/null || true rocm-smi --version 2>/dev/null || true
else else
echo "WARN: rocm-smi-lib install failed — GPU monitoring unavailable" echo "WARN: rocm-smi-lib install failed — GPU monitoring unavailable"

View File

@@ -1,4 +1,4 @@
export PATH="$PATH:/usr/local/bin" export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
menu() { menu() {
if [ -x /usr/local/bin/bee-tui ]; then if [ -x /usr/local/bin/bee-tui ]; then