Tighten support bundles and fix AMD runtime checks

This commit is contained in:
Mikhail Chusavitin
2026-03-25 19:35:25 +03:00
parent 30cf014d58
commit 9a1df9b1ba
12 changed files with 663 additions and 79 deletions

View File

@@ -173,11 +173,20 @@ func (a *App) RuntimeHealthResult() ActionResult {
if err != nil {
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
}
driverLabel := "Driver ready"
accelLabel := "CUDA ready"
switch a.sat.DetectGPUVendor() {
case "amd":
driverLabel = "AMDGPU ready"
accelLabel = "ROCm SMI ready"
case "nvidia":
driverLabel = "NVIDIA ready"
}
var body strings.Builder
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
fmt.Fprintf(&body, "Driver ready: %t\n", health.DriverReady)
fmt.Fprintf(&body, "CUDA ready: %t\n", health.CUDAReady)
fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
if len(health.Issues) > 0 {
body.WriteString("\n\nIssues:\n")
@@ -238,9 +247,9 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportSupportBundle(target)
body := "Support bundle exported."
body := "Support bundle exported. USB target unmounted and safe to remove."
if path != "" {
body = "Support bundle exported to " + path
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
}
return ActionResult{Title: "Export support bundle", Body: body}, err
}

View File

@@ -1,9 +1,12 @@
package app
import (
"archive/tar"
"compress/gzip"
"context"
"encoding/json"
"errors"
"io"
"os"
"path/filepath"
"testing"
@@ -57,13 +60,22 @@ func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (str
return f.serviceDoFn(name, action)
}
type fakeExports struct{}
type fakeExports struct {
listTargetsFn func() ([]platform.RemovableTarget, error)
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
}
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
if f.listTargetsFn != nil {
return f.listTargetsFn()
}
return nil, nil
}
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
if f.exportToTargetFn != nil {
return f.exportToTargetFn(src, target)
}
return "", nil
}
@@ -97,10 +109,14 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
}
type fakeSAT struct {
runNvidiaFn func(string) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
runNvidiaFn func(string) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
detectVendorFn func() string
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
runAMDPackFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
}
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
@@ -112,6 +128,9 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
}
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
if f.listNvidiaGPUsFn != nil {
return f.listNvidiaGPUsFn()
}
return nil, nil
}
@@ -130,11 +149,26 @@ func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string,
return "", nil
}
func (f fakeSAT) DetectGPUVendor() string { return "" }
func (f fakeSAT) DetectGPUVendor() string {
if f.detectVendorFn != nil {
return f.detectVendorFn()
}
return ""
}
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return nil, nil }
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
if f.listAMDGPUsFn != nil {
return f.listAMDGPUsFn()
}
return nil, nil
}
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) { return "", nil }
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
if f.runAMDPackFn != nil {
return f.runAMDPackFn(baseDir)
}
return "", nil
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()
@@ -394,6 +428,44 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
}
}
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
oldExportDir := DefaultExportDir
DefaultExportDir = tmp
t.Cleanup(func() { DefaultExportDir = oldExportDir })
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
t.Fatalf("write bee-audit.json: %v", err)
}
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
t.Fatalf("write bee-audit.log: %v", err)
}
a := &App{
exports: fakeExports{
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
if filepath.Base(src) == "" {
t.Fatalf("expected non-empty source path")
}
return "/media/bee/" + filepath.Base(src), nil
},
},
}
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
if err != nil {
t.Fatalf("ExportSupportBundleResult error: %v", err)
}
if result.Title != "Export support bundle" {
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
}
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
}
}
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
t.Parallel()
@@ -516,6 +588,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
t.Fatal(err)
}
archive, err := BuildSupportBundle(exportDir)
if err != nil {
@@ -524,6 +599,44 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if _, err := os.Stat(archive); err != nil {
t.Fatalf("archive stat: %v", err)
}
file, err := os.Open(archive)
if err != nil {
t.Fatalf("open archive: %v", err)
}
defer file.Close()
gzr, err := gzip.NewReader(file)
if err != nil {
t.Fatalf("gzip reader: %v", err)
}
defer gzr.Close()
tr := tar.NewReader(gzr)
var names []string
for {
hdr, err := tr.Next()
if errors.Is(err, io.EOF) {
break
}
if err != nil {
t.Fatalf("read tar entry: %v", err)
}
names = append(names, hdr.Name)
}
var foundRaw bool
for _, name := range names {
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
foundRaw = true
}
if contains(name, "/export/bee-sat/memory-run.tar.gz") {
t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
}
}
if !foundRaw {
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
}
}
func TestMainBanner(t *testing.T) {
@@ -600,6 +713,44 @@ func TestMainBanner(t *testing.T) {
}
}
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
tmp := t.TempDir()
oldRuntimePath := DefaultRuntimeJSONPath
DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })
raw, err := json.Marshal(schema.RuntimeHealth{
Status: "OK",
ExportDir: "/appdata/bee/export",
DriverReady: true,
CUDAReady: true,
NetworkStatus: "OK",
})
if err != nil {
t.Fatalf("marshal runtime health: %v", err)
}
if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
t.Fatalf("write runtime health: %v", err)
}
a := &App{
sat: fakeSAT{
detectVendorFn: func() string { return "amd" },
},
}
result := a.RuntimeHealthResult()
if !contains(result.Body, "AMDGPU ready: true") {
t.Fatalf("body missing AMD driver label:\n%s", result.Body)
}
if !contains(result.Body, "ROCm SMI ready: true") {
t.Fatalf("body missing ROCm label:\n%s", result.Body)
}
if contains(result.Body, "CUDA ready") {
t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
}
}
func intPtr(v int) *int { return &v }
func contains(haystack, needle string) bool {

View File

@@ -56,7 +56,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
}
defer os.RemoveAll(stageRoot)
if err := copyDirContents(exportDir, filepath.Join(stageRoot, "export")); err != nil {
if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
return "", err
}
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
@@ -214,6 +214,40 @@ func copyDirContents(srcDir, dstDir string) error {
return nil
}
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
if cleanRel == "" {
return true
}
if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
return false
}
if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
return false
}
return true
})
}
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
entries, err := os.ReadDir(srcDir)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
for _, entry := range entries {
src := filepath.Join(srcDir, entry.Name())
dst := filepath.Join(dstDir, entry.Name())
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
return err
}
}
return nil
}
func copyPath(src, dst string) error {
info, err := os.Stat(src)
if err != nil {
@@ -254,6 +288,36 @@ func copyPath(src, dst string) error {
return err
}
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
info, err := os.Stat(src)
if err != nil {
return err
}
rel, err := filepath.Rel(rootSrc, src)
if err != nil {
return err
}
if keep != nil && !keep(rel, info) {
return nil
}
if info.IsDir() {
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
return err
}
entries, err := os.ReadDir(src)
if err != nil {
return err
}
for _, entry := range entries {
if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
return err
}
}
return nil
}
return copyPath(src, dst)
}
func createSupportTarGz(dst, srcDir string) error {
file, err := os.Create(dst)
if err != nil {

View File

@@ -9,8 +9,10 @@ import (
"strings"
)
var exportExecCommand = exec.Command
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
if err != nil {
return nil, err
}
@@ -52,7 +54,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
return out, nil
}
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) {
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
if src == "" || target.Device == "" {
return "", fmt.Errorf("source and target are required")
}
@@ -62,20 +64,39 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
mountpoint := strings.TrimSpace(target.Mountpoint)
mountedHere := false
mounted := mountpoint != ""
if mountpoint == "" {
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
if err := os.MkdirAll(mountpoint, 0755); err != nil {
return "", err
}
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
_ = os.Remove(mountpoint)
return string(raw), err
}
mountedHere = true
mounted = true
}
defer func() {
if !mounted {
return
}
_ = exportExecCommand("sync").Run()
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
msg := strings.TrimSpace(string(raw))
if msg == "" {
retErr = err
} else {
retErr = fmt.Errorf("%s: %w", msg, err)
}
}
if mountedHere {
_ = os.Remove(mountpoint)
}
}()
filename := filepath.Base(src)
dst := filepath.Join(mountpoint, filename)
dst = filepath.Join(mountpoint, filename)
data, err := os.ReadFile(src)
if err != nil {
return "", err
@@ -83,12 +104,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
if err := os.WriteFile(dst, data, 0644); err != nil {
return "", err
}
_ = exec.Command("sync").Run()
if mountedHere {
_ = exec.Command("umount", mountpoint).Run()
_ = os.Remove(mountpoint)
}
return dst, nil
}

View File

@@ -0,0 +1,56 @@
package platform
import (
"os"
"os/exec"
"path/filepath"
"testing"
)
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt")
if err := os.MkdirAll(mountpoint, 0755); err != nil {
t.Fatalf("mkdir mountpoint: %v", err)
}
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
t.Fatalf("write src: %v", err)
}
var calls [][]string
oldExec := exportExecCommand
exportExecCommand = func(name string, args ...string) *exec.Cmd {
calls = append(calls, append([]string{name}, args...))
return exec.Command("sh", "-c", "exit 0")
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
dst, err := s.ExportFileToTarget(src, RemovableTarget{
Device: "/dev/sdb1",
Mountpoint: mountpoint,
})
if err != nil {
t.Fatalf("ExportFileToTarget error: %v", err)
}
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
t.Fatalf("dst=%q want %q", got, want)
}
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
t.Fatalf("exported file missing: %v", err)
}
foundUmount := false
for _, call := range calls {
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
foundUmount = true
break
}
}
if !foundUmount {
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
}
}

View File

@@ -16,9 +16,6 @@ var runtimeRequiredTools = []string{
"smartctl",
"nvme",
"ipmitool",
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
"dhclient",
"mount",
}
@@ -93,7 +90,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
}
}
for _, tool := range s.CheckTools(runtimeRequiredTools) {
vendor := s.DetectGPUVendor()
for _, tool := range s.runtimeToolStatuses(vendor) {
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
Name: tool.Name,
Path: tool.Path,
@@ -115,39 +113,7 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
})
}
lsmodText := commandText("lsmod")
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
health.CUDAReady = false
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
s.collectGPURuntimeHealth(vendor, &health)
if health.Status != "FAILED" && len(health.Issues) > 0 {
health.Status = "PARTIAL"
@@ -162,3 +128,87 @@ func commandText(name string, args ...string) string {
}
return string(raw)
}
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
tools := s.CheckTools(runtimeRequiredTools)
switch vendor {
case "nvidia":
tools = append(tools, s.CheckTools([]string{
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
})...)
case "amd":
tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
tool.Path = cmd[0]
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
tool.Path = cmd[1]
}
tool.OK = true
}
tools = append(tools, tool)
}
return tools
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod")
switch vendor {
case "nvidia":
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
case "amd":
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "amdgpu_kernel_module_missing",
Severity: "warning",
Description: "AMD GPU driver is not loaded.",
})
}
out, err := runROCmSMI("--showproductname", "--csv")
if err == nil && strings.TrimSpace(string(out)) != "" {
health.CUDAReady = true
health.DriverReady = true
return
}
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "rocm_smi_unavailable",
Severity: "warning",
Description: "ROCm SMI is not available for AMD GPU SAT.",
})
}
}

View File

@@ -4,6 +4,7 @@ import (
"archive/tar"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
"os"
@@ -15,6 +16,22 @@ import (
"time"
)
var (
satExecCommand = exec.Command
satLookPath = exec.LookPath
satGlob = filepath.Glob
satStat = os.Stat
rocmSMIExecutableGlobs = []string{
"/opt/rocm/bin/rocm-smi",
"/opt/rocm-*/bin/rocm-smi",
}
rocmSMIScriptGlobs = []string{
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
}
)
// NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct {
Index int
@@ -41,7 +58,7 @@ func (s *System) DetectGPUVendor() string {
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output()
out, err := runROCmSMI("--showproductname", "--csv")
if err != nil {
return nil, fmt.Errorf("rocm-smi: %w", err)
}
@@ -337,12 +354,22 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(cmd, " "),
"cmd: "+strings.Join(resolvedCmd, " "),
)
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
"rc: 1",
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
"",
)
return []byte(err.Error() + "\n"), err
}
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
if len(env) > 0 {
c.Env = append(os.Environ(), env...)
}
@@ -362,19 +389,11 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
}
func listStorageDevices() ([]string, error) {
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
if err != nil {
return nil, err
}
var devices []string
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
fields := strings.Fields(strings.TrimSpace(line))
if len(fields) != 2 || fields[1] != "disk" {
continue
}
devices = append(devices, "/dev/"+fields[0])
}
return devices, nil
return parseStorageDevices(string(out)), nil
}
func storageSATCommands(devPath string) []satJob {
@@ -445,12 +464,22 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(cmd, " "),
"cmd: "+strings.Join(resolvedCmd, " "),
)
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
"rc: 1",
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
"",
)
return []byte(err.Error() + "\n"), err
}
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
rc := 0
if err != nil {
@@ -465,6 +494,91 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
return out, err
}
func runROCmSMI(args ...string) ([]byte, error) {
cmd, err := resolveROCmSMICommand(args...)
if err != nil {
return nil, err
}
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
}
func resolveSATCommand(cmd []string) ([]string, error) {
if len(cmd) == 0 {
return nil, errors.New("empty SAT command")
}
if cmd[0] != "rocm-smi" {
return cmd, nil
}
return resolveROCmSMICommand(cmd[1:]...)
}
func resolveROCmSMICommand(args ...string) ([]string, error) {
if path, err := satLookPath("rocm-smi"); err == nil {
return append([]string{path}, args...), nil
}
for _, path := range rocmSMIExecutableCandidates() {
return append([]string{path}, args...), nil
}
pythonPath, pyErr := satLookPath("python3")
if pyErr == nil {
for _, script := range rocmSMIScriptCandidates() {
cmd := []string{pythonPath, script}
cmd = append(cmd, args...)
return cmd, nil
}
}
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
}
func rocmSMIExecutableCandidates() []string {
return expandExistingPaths(rocmSMIExecutableGlobs)
}
func rocmSMIScriptCandidates() []string {
return expandExistingPaths(rocmSMIScriptGlobs)
}
func expandExistingPaths(patterns []string) []string {
seen := make(map[string]struct{})
var paths []string
for _, pattern := range patterns {
matches, err := satGlob(pattern)
if err != nil {
continue
}
sort.Strings(matches)
for _, match := range matches {
if _, err := satStat(match); err != nil {
continue
}
if _, ok := seen[match]; ok {
continue
}
seen[match] = struct{}{}
paths = append(paths, match)
}
}
return paths
}
func parseStorageDevices(raw string) []string {
var devices []string
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
fields := strings.Fields(strings.TrimSpace(line))
if len(fields) < 2 || fields[1] != "disk" {
continue
}
if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
continue
}
devices = append(devices, "/dev/"+fields[0])
}
return devices
}
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {

View File

@@ -3,6 +3,8 @@ package platform
import (
"errors"
"os"
"os/exec"
"path/filepath"
"testing"
)
@@ -91,3 +93,90 @@ func TestClassifySATResult(t *testing.T) {
})
}
}
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
t.Parallel()
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
got := parseStorageDevices(raw)
want := []string{"/dev/nvme0n1", "/dev/sdb"}
if len(got) != len(want) {
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
}
}
}
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
t.Setenv("PATH", t.TempDir())
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
t.Fatalf("write rocm-smi: %v", err)
}
cmd, err := resolveROCmSMICommand("--showproductname")
if err != nil {
t.Fatalf("resolveROCmSMICommand error: %v", err)
}
if len(cmd) != 2 {
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
}
if cmd[0] != toolPath {
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
}
}
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
tmp := t.TempDir()
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
t.Fatalf("mkdir: %v", err)
}
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
t.Fatalf("write rocm-smi: %v", err)
}
oldGlob := rocmSMIExecutableGlobs
oldScriptGlobs := rocmSMIScriptGlobs
rocmSMIExecutableGlobs = []string{execPath}
rocmSMIScriptGlobs = nil
t.Cleanup(func() {
rocmSMIExecutableGlobs = oldGlob
rocmSMIScriptGlobs = oldScriptGlobs
})
t.Setenv("PATH", "")
cmd, err := resolveROCmSMICommand("--showallinfo")
if err != nil {
t.Fatalf("resolveROCmSMICommand error: %v", err)
}
if len(cmd) != 2 {
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
}
if cmd[0] != execPath {
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
}
}
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
oldLookPath := satLookPath
oldExecGlobs := rocmSMIExecutableGlobs
oldScriptGlobs := rocmSMIScriptGlobs
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
rocmSMIExecutableGlobs = nil
rocmSMIScriptGlobs = nil
t.Cleanup(func() {
satLookPath = oldLookPath
rocmSMIExecutableGlobs = oldExecGlobs
rocmSMIScriptGlobs = oldScriptGlobs
})
if _, err := runROCmSMI("--showproductname"); err == nil {
t.Fatal("expected missing rocm-smi error")
}
}

View File

@@ -24,15 +24,23 @@ var techDumpFixedCommands = []struct {
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
}
var techDumpNvidiaCommands = []struct {
Name string
Args []string
File string
}{
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
}
type lsblkDumpRoot struct {
Blockdevices []struct {
Name string `json:"name"`
Type string `json:"type"`
Tran string `json:"tran"`
} `json:"blockdevices"`
}
@@ -50,6 +58,15 @@ func (s *System) CaptureTechnicalDump(baseDir string) error {
for _, cmd := range techDumpFixedCommands {
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
}
switch s.DetectGPUVendor() {
case "nvidia":
for _, cmd := range techDumpNvidiaCommands {
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
}
case "amd":
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
}
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
@@ -69,6 +86,14 @@ func writeCommandDump(path, name string, args ...string) {
_ = os.WriteFile(path, out, 0644)
}
func writeROCmSMIDump(path string, args ...string) {
out, err := runROCmSMI(args...)
if err != nil && len(out) == 0 {
return
}
_ = os.WriteFile(path, out, 0644)
}
func lsblkDumpDevices(path string) []string {
raw, err := os.ReadFile(path)
if err != nil {
@@ -80,6 +105,9 @@ func lsblkDumpDevices(path string) []string {
}
var devices []string
for _, dev := range root.Blockdevices {
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
continue
}
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
devices = append(devices, strings.TrimSpace(dev.Name))
}

View File

@@ -12,12 +12,12 @@ func TestLSBLKDumpDevices(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "lsblk.json")
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk"}]}`), 0644); err != nil {
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
t.Fatalf("write lsblk fixture: %v", err)
}
got := lsblkDumpDevices(path)
want := []string{"nvme0n1", "sda"}
want := []string{"nvme0n1", "sdb"}
if !reflect.DeepEqual(got, want) {
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
}

View File

@@ -60,6 +60,14 @@ apt-get update -qq
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
echo "=== AMD ROCm: rocm-smi installed ==="
if [ -x /opt/rocm/bin/rocm-smi ]; then
ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi
else
candidate="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
if [ -n "${candidate}" ]; then
ln -sf "${candidate}" /usr/local/bin/rocm-smi
fi
fi
rocm-smi --version 2>/dev/null || true
else
echo "WARN: rocm-smi-lib install failed — GPU monitoring unavailable"

View File

@@ -1,4 +1,4 @@
export PATH="$PATH:/usr/local/bin"
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
menu() {
if [ -x /usr/local/bin/bee-tui ]; then