Tighten support bundles and fix AMD runtime checks

This commit is contained in:
Mikhail Chusavitin
2026-03-25 19:35:25 +03:00
parent 30cf014d58
commit 9a1df9b1ba
12 changed files with 663 additions and 79 deletions

View File

@@ -9,8 +9,10 @@ import (
"strings"
)
var exportExecCommand = exec.Command
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
if err != nil {
return nil, err
}
@@ -52,7 +54,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
return out, nil
}
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) {
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
if src == "" || target.Device == "" {
return "", fmt.Errorf("source and target are required")
}
@@ -62,20 +64,39 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
mountpoint := strings.TrimSpace(target.Mountpoint)
mountedHere := false
mounted := mountpoint != ""
if mountpoint == "" {
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
if err := os.MkdirAll(mountpoint, 0755); err != nil {
return "", err
}
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
_ = os.Remove(mountpoint)
return string(raw), err
}
mountedHere = true
mounted = true
}
defer func() {
if !mounted {
return
}
_ = exportExecCommand("sync").Run()
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
msg := strings.TrimSpace(string(raw))
if msg == "" {
retErr = err
} else {
retErr = fmt.Errorf("%s: %w", msg, err)
}
}
if mountedHere {
_ = os.Remove(mountpoint)
}
}()
filename := filepath.Base(src)
dst := filepath.Join(mountpoint, filename)
dst = filepath.Join(mountpoint, filename)
data, err := os.ReadFile(src)
if err != nil {
return "", err
@@ -83,12 +104,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
if err := os.WriteFile(dst, data, 0644); err != nil {
return "", err
}
_ = exec.Command("sync").Run()
if mountedHere {
_ = exec.Command("umount", mountpoint).Run()
_ = os.Remove(mountpoint)
}
return dst, nil
}

View File

@@ -0,0 +1,56 @@
package platform
import (
"os"
"os/exec"
"path/filepath"
"testing"
)
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt")
if err := os.MkdirAll(mountpoint, 0755); err != nil {
t.Fatalf("mkdir mountpoint: %v", err)
}
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
t.Fatalf("write src: %v", err)
}
var calls [][]string
oldExec := exportExecCommand
exportExecCommand = func(name string, args ...string) *exec.Cmd {
calls = append(calls, append([]string{name}, args...))
return exec.Command("sh", "-c", "exit 0")
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
dst, err := s.ExportFileToTarget(src, RemovableTarget{
Device: "/dev/sdb1",
Mountpoint: mountpoint,
})
if err != nil {
t.Fatalf("ExportFileToTarget error: %v", err)
}
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
t.Fatalf("dst=%q want %q", got, want)
}
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
t.Fatalf("exported file missing: %v", err)
}
foundUmount := false
for _, call := range calls {
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
foundUmount = true
break
}
}
if !foundUmount {
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
}
}

View File

@@ -16,9 +16,6 @@ var runtimeRequiredTools = []string{
"smartctl",
"nvme",
"ipmitool",
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
"dhclient",
"mount",
}
@@ -93,7 +90,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
}
}
for _, tool := range s.CheckTools(runtimeRequiredTools) {
vendor := s.DetectGPUVendor()
for _, tool := range s.runtimeToolStatuses(vendor) {
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
Name: tool.Name,
Path: tool.Path,
@@ -115,39 +113,7 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
})
}
lsmodText := commandText("lsmod")
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
health.CUDAReady = false
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
s.collectGPURuntimeHealth(vendor, &health)
if health.Status != "FAILED" && len(health.Issues) > 0 {
health.Status = "PARTIAL"
@@ -162,3 +128,87 @@ func commandText(name string, args ...string) string {
}
return string(raw)
}
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
tools := s.CheckTools(runtimeRequiredTools)
switch vendor {
case "nvidia":
tools = append(tools, s.CheckTools([]string{
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
})...)
case "amd":
tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
tool.Path = cmd[0]
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
tool.Path = cmd[1]
}
tool.OK = true
}
tools = append(tools, tool)
}
return tools
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod")
switch vendor {
case "nvidia":
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
case "amd":
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "amdgpu_kernel_module_missing",
Severity: "warning",
Description: "AMD GPU driver is not loaded.",
})
}
out, err := runROCmSMI("--showproductname", "--csv")
if err == nil && strings.TrimSpace(string(out)) != "" {
health.CUDAReady = true
health.DriverReady = true
return
}
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "rocm_smi_unavailable",
Severity: "warning",
Description: "ROCm SMI is not available for AMD GPU SAT.",
})
}
}

View File

@@ -4,6 +4,7 @@ import (
"archive/tar"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
"os"
@@ -15,6 +16,22 @@ import (
"time"
)
var (
satExecCommand = exec.Command
satLookPath = exec.LookPath
satGlob = filepath.Glob
satStat = os.Stat
rocmSMIExecutableGlobs = []string{
"/opt/rocm/bin/rocm-smi",
"/opt/rocm-*/bin/rocm-smi",
}
rocmSMIScriptGlobs = []string{
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
}
)
// NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct {
Index int
@@ -41,7 +58,7 @@ func (s *System) DetectGPUVendor() string {
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output()
out, err := runROCmSMI("--showproductname", "--csv")
if err != nil {
return nil, fmt.Errorf("rocm-smi: %w", err)
}
@@ -337,12 +354,22 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(cmd, " "),
"cmd: "+strings.Join(resolvedCmd, " "),
)
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
"rc: 1",
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
"",
)
return []byte(err.Error() + "\n"), err
}
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
if len(env) > 0 {
c.Env = append(os.Environ(), env...)
}
@@ -362,19 +389,11 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
}
func listStorageDevices() ([]string, error) {
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
if err != nil {
return nil, err
}
var devices []string
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
fields := strings.Fields(strings.TrimSpace(line))
if len(fields) != 2 || fields[1] != "disk" {
continue
}
devices = append(devices, "/dev/"+fields[0])
}
return devices, nil
return parseStorageDevices(string(out)), nil
}
func storageSATCommands(devPath string) []satJob {
@@ -445,12 +464,22 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(cmd, " "),
"cmd: "+strings.Join(resolvedCmd, " "),
)
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
"rc: 1",
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
"",
)
return []byte(err.Error() + "\n"), err
}
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
rc := 0
if err != nil {
@@ -465,6 +494,91 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
return out, err
}
func runROCmSMI(args ...string) ([]byte, error) {
cmd, err := resolveROCmSMICommand(args...)
if err != nil {
return nil, err
}
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
}
func resolveSATCommand(cmd []string) ([]string, error) {
if len(cmd) == 0 {
return nil, errors.New("empty SAT command")
}
if cmd[0] != "rocm-smi" {
return cmd, nil
}
return resolveROCmSMICommand(cmd[1:]...)
}
func resolveROCmSMICommand(args ...string) ([]string, error) {
if path, err := satLookPath("rocm-smi"); err == nil {
return append([]string{path}, args...), nil
}
for _, path := range rocmSMIExecutableCandidates() {
return append([]string{path}, args...), nil
}
pythonPath, pyErr := satLookPath("python3")
if pyErr == nil {
for _, script := range rocmSMIScriptCandidates() {
cmd := []string{pythonPath, script}
cmd = append(cmd, args...)
return cmd, nil
}
}
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
}
func rocmSMIExecutableCandidates() []string {
return expandExistingPaths(rocmSMIExecutableGlobs)
}
func rocmSMIScriptCandidates() []string {
return expandExistingPaths(rocmSMIScriptGlobs)
}
func expandExistingPaths(patterns []string) []string {
seen := make(map[string]struct{})
var paths []string
for _, pattern := range patterns {
matches, err := satGlob(pattern)
if err != nil {
continue
}
sort.Strings(matches)
for _, match := range matches {
if _, err := satStat(match); err != nil {
continue
}
if _, ok := seen[match]; ok {
continue
}
seen[match] = struct{}{}
paths = append(paths, match)
}
}
return paths
}
func parseStorageDevices(raw string) []string {
var devices []string
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
fields := strings.Fields(strings.TrimSpace(line))
if len(fields) < 2 || fields[1] != "disk" {
continue
}
if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
continue
}
devices = append(devices, "/dev/"+fields[0])
}
return devices
}
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {

View File

@@ -3,6 +3,8 @@ package platform
import (
"errors"
"os"
"os/exec"
"path/filepath"
"testing"
)
@@ -91,3 +93,90 @@ func TestClassifySATResult(t *testing.T) {
})
}
}
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
t.Parallel()
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
got := parseStorageDevices(raw)
want := []string{"/dev/nvme0n1", "/dev/sdb"}
if len(got) != len(want) {
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
}
}
}
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
t.Setenv("PATH", t.TempDir())
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
t.Fatalf("write rocm-smi: %v", err)
}
cmd, err := resolveROCmSMICommand("--showproductname")
if err != nil {
t.Fatalf("resolveROCmSMICommand error: %v", err)
}
if len(cmd) != 2 {
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
}
if cmd[0] != toolPath {
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
}
}
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
tmp := t.TempDir()
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
t.Fatalf("mkdir: %v", err)
}
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
t.Fatalf("write rocm-smi: %v", err)
}
oldGlob := rocmSMIExecutableGlobs
oldScriptGlobs := rocmSMIScriptGlobs
rocmSMIExecutableGlobs = []string{execPath}
rocmSMIScriptGlobs = nil
t.Cleanup(func() {
rocmSMIExecutableGlobs = oldGlob
rocmSMIScriptGlobs = oldScriptGlobs
})
t.Setenv("PATH", "")
cmd, err := resolveROCmSMICommand("--showallinfo")
if err != nil {
t.Fatalf("resolveROCmSMICommand error: %v", err)
}
if len(cmd) != 2 {
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
}
if cmd[0] != execPath {
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
}
}
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
oldLookPath := satLookPath
oldExecGlobs := rocmSMIExecutableGlobs
oldScriptGlobs := rocmSMIScriptGlobs
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
rocmSMIExecutableGlobs = nil
rocmSMIScriptGlobs = nil
t.Cleanup(func() {
satLookPath = oldLookPath
rocmSMIExecutableGlobs = oldExecGlobs
rocmSMIScriptGlobs = oldScriptGlobs
})
if _, err := runROCmSMI("--showproductname"); err == nil {
t.Fatal("expected missing rocm-smi error")
}
}

View File

@@ -24,15 +24,23 @@ var techDumpFixedCommands = []struct {
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
}
var techDumpNvidiaCommands = []struct {
Name string
Args []string
File string
}{
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
}
type lsblkDumpRoot struct {
Blockdevices []struct {
Name string `json:"name"`
Type string `json:"type"`
Tran string `json:"tran"`
} `json:"blockdevices"`
}
@@ -50,6 +58,15 @@ func (s *System) CaptureTechnicalDump(baseDir string) error {
for _, cmd := range techDumpFixedCommands {
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
}
switch s.DetectGPUVendor() {
case "nvidia":
for _, cmd := range techDumpNvidiaCommands {
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
}
case "amd":
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
}
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
@@ -69,6 +86,14 @@ func writeCommandDump(path, name string, args ...string) {
_ = os.WriteFile(path, out, 0644)
}
func writeROCmSMIDump(path string, args ...string) {
out, err := runROCmSMI(args...)
if err != nil && len(out) == 0 {
return
}
_ = os.WriteFile(path, out, 0644)
}
func lsblkDumpDevices(path string) []string {
raw, err := os.ReadFile(path)
if err != nil {
@@ -80,6 +105,9 @@ func lsblkDumpDevices(path string) []string {
}
var devices []string
for _, dev := range root.Blockdevices {
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
continue
}
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
devices = append(devices, strings.TrimSpace(dev.Name))
}

View File

@@ -12,12 +12,12 @@ func TestLSBLKDumpDevices(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "lsblk.json")
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk"}]}`), 0644); err != nil {
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
t.Fatalf("write lsblk fixture: %v", err)
}
got := lsblkDumpDevices(path)
want := []string{"nvme0n1", "sda"}
want := []string{"nvme0n1", "sdb"}
if !reflect.DeepEqual(got, want) {
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
}