Tighten support bundles and fix AMD runtime checks
This commit is contained in:
@@ -173,11 +173,20 @@ func (a *App) RuntimeHealthResult() ActionResult {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
|
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
|
||||||
}
|
}
|
||||||
|
driverLabel := "Driver ready"
|
||||||
|
accelLabel := "CUDA ready"
|
||||||
|
switch a.sat.DetectGPUVendor() {
|
||||||
|
case "amd":
|
||||||
|
driverLabel = "AMDGPU ready"
|
||||||
|
accelLabel = "ROCm SMI ready"
|
||||||
|
case "nvidia":
|
||||||
|
driverLabel = "NVIDIA ready"
|
||||||
|
}
|
||||||
var body strings.Builder
|
var body strings.Builder
|
||||||
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
|
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
|
||||||
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
|
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
|
||||||
fmt.Fprintf(&body, "Driver ready: %t\n", health.DriverReady)
|
fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
|
||||||
fmt.Fprintf(&body, "CUDA ready: %t\n", health.CUDAReady)
|
fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
|
||||||
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
|
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
|
||||||
if len(health.Issues) > 0 {
|
if len(health.Issues) > 0 {
|
||||||
body.WriteString("\n\nIssues:\n")
|
body.WriteString("\n\nIssues:\n")
|
||||||
@@ -238,9 +247,9 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
|
|||||||
|
|
||||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||||
path, err := a.ExportSupportBundle(target)
|
path, err := a.ExportSupportBundle(target)
|
||||||
body := "Support bundle exported."
|
body := "Support bundle exported. USB target unmounted and safe to remove."
|
||||||
if path != "" {
|
if path != "" {
|
||||||
body = "Support bundle exported to " + path
|
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||||
}
|
}
|
||||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
package app
|
package app
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"compress/gzip"
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -57,13 +60,22 @@ func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (str
|
|||||||
return f.serviceDoFn(name, action)
|
return f.serviceDoFn(name, action)
|
||||||
}
|
}
|
||||||
|
|
||||||
type fakeExports struct{}
|
type fakeExports struct {
|
||||||
|
listTargetsFn func() ([]platform.RemovableTarget, error)
|
||||||
|
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||||
|
if f.listTargetsFn != nil {
|
||||||
|
return f.listTargetsFn()
|
||||||
|
}
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
|
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
|
||||||
|
if f.exportToTargetFn != nil {
|
||||||
|
return f.exportToTargetFn(src, target)
|
||||||
|
}
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -97,10 +109,14 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type fakeSAT struct {
|
type fakeSAT struct {
|
||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runMemoryFn func(string) (string, error)
|
runMemoryFn func(string) (string, error)
|
||||||
runStorageFn func(string) (string, error)
|
runStorageFn func(string) (string, error)
|
||||||
runCPUFn func(string, int) (string, error)
|
runCPUFn func(string, int) (string, error)
|
||||||
|
detectVendorFn func() string
|
||||||
|
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||||
|
runAMDPackFn func(string) (string, error)
|
||||||
|
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||||
@@ -112,6 +128,9 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||||
|
if f.listNvidiaGPUsFn != nil {
|
||||||
|
return f.listNvidiaGPUsFn()
|
||||||
|
}
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -130,11 +149,26 @@ func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string,
|
|||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) DetectGPUVendor() string { return "" }
|
func (f fakeSAT) DetectGPUVendor() string {
|
||||||
|
if f.detectVendorFn != nil {
|
||||||
|
return f.detectVendorFn()
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return nil, nil }
|
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||||
|
if f.listAMDGPUsFn != nil {
|
||||||
|
return f.listAMDGPUsFn()
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) { return "", nil }
|
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
|
||||||
|
if f.runAMDPackFn != nil {
|
||||||
|
return f.runAMDPackFn(baseDir)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
@@ -394,6 +428,44 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tmp := t.TempDir()
|
||||||
|
oldExportDir := DefaultExportDir
|
||||||
|
DefaultExportDir = tmp
|
||||||
|
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||||
|
|
||||||
|
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("write bee-audit.json: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("write bee-audit.log: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
a := &App{
|
||||||
|
exports: fakeExports{
|
||||||
|
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
|
||||||
|
if filepath.Base(src) == "" {
|
||||||
|
t.Fatalf("expected non-empty source path")
|
||||||
|
}
|
||||||
|
return "/media/bee/" + filepath.Base(src), nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExportSupportBundleResult error: %v", err)
|
||||||
|
}
|
||||||
|
if result.Title != "Export support bundle" {
|
||||||
|
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
|
||||||
|
}
|
||||||
|
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
|
||||||
|
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -516,6 +588,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
archive, err := BuildSupportBundle(exportDir)
|
archive, err := BuildSupportBundle(exportDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -524,6 +599,44 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if _, err := os.Stat(archive); err != nil {
|
if _, err := os.Stat(archive); err != nil {
|
||||||
t.Fatalf("archive stat: %v", err)
|
t.Fatalf("archive stat: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
file, err := os.Open(archive)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("open archive: %v", err)
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
gzr, err := gzip.NewReader(file)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("gzip reader: %v", err)
|
||||||
|
}
|
||||||
|
defer gzr.Close()
|
||||||
|
|
||||||
|
tr := tar.NewReader(gzr)
|
||||||
|
var names []string
|
||||||
|
for {
|
||||||
|
hdr, err := tr.Next()
|
||||||
|
if errors.Is(err, io.EOF) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read tar entry: %v", err)
|
||||||
|
}
|
||||||
|
names = append(names, hdr.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
var foundRaw bool
|
||||||
|
for _, name := range names {
|
||||||
|
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
||||||
|
foundRaw = true
|
||||||
|
}
|
||||||
|
if contains(name, "/export/bee-sat/memory-run.tar.gz") {
|
||||||
|
t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !foundRaw {
|
||||||
|
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMainBanner(t *testing.T) {
|
func TestMainBanner(t *testing.T) {
|
||||||
@@ -600,6 +713,44 @@ func TestMainBanner(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
oldRuntimePath := DefaultRuntimeJSONPath
|
||||||
|
DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
|
||||||
|
t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })
|
||||||
|
|
||||||
|
raw, err := json.Marshal(schema.RuntimeHealth{
|
||||||
|
Status: "OK",
|
||||||
|
ExportDir: "/appdata/bee/export",
|
||||||
|
DriverReady: true,
|
||||||
|
CUDAReady: true,
|
||||||
|
NetworkStatus: "OK",
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("marshal runtime health: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
|
||||||
|
t.Fatalf("write runtime health: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
a := &App{
|
||||||
|
sat: fakeSAT{
|
||||||
|
detectVendorFn: func() string { return "amd" },
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
result := a.RuntimeHealthResult()
|
||||||
|
if !contains(result.Body, "AMDGPU ready: true") {
|
||||||
|
t.Fatalf("body missing AMD driver label:\n%s", result.Body)
|
||||||
|
}
|
||||||
|
if !contains(result.Body, "ROCm SMI ready: true") {
|
||||||
|
t.Fatalf("body missing ROCm label:\n%s", result.Body)
|
||||||
|
}
|
||||||
|
if contains(result.Body, "CUDA ready") {
|
||||||
|
t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func intPtr(v int) *int { return &v }
|
func intPtr(v int) *int { return &v }
|
||||||
|
|
||||||
func contains(haystack, needle string) bool {
|
func contains(haystack, needle string) bool {
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
}
|
}
|
||||||
defer os.RemoveAll(stageRoot)
|
defer os.RemoveAll(stageRoot)
|
||||||
|
|
||||||
if err := copyDirContents(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
|
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
|
||||||
@@ -214,6 +214,40 @@ func copyDirContents(srcDir, dstDir string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||||
|
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||||
|
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||||
|
if cleanRel == "" {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||||
|
entries, err := os.ReadDir(srcDir)
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, entry := range entries {
|
||||||
|
src := filepath.Join(srcDir, entry.Name())
|
||||||
|
dst := filepath.Join(dstDir, entry.Name())
|
||||||
|
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func copyPath(src, dst string) error {
|
func copyPath(src, dst string) error {
|
||||||
info, err := os.Stat(src)
|
info, err := os.Stat(src)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -254,6 +288,36 @@ func copyPath(src, dst string) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
|
||||||
|
info, err := os.Stat(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(rootSrc, src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if keep != nil && !keep(rel, info) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
entries, err := os.ReadDir(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, entry := range entries {
|
||||||
|
if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return copyPath(src, dst)
|
||||||
|
}
|
||||||
|
|
||||||
func createSupportTarGz(dst, srcDir string) error {
|
func createSupportTarGz(dst, srcDir string) error {
|
||||||
file, err := os.Create(dst)
|
file, err := os.Create(dst)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -9,8 +9,10 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var exportExecCommand = exec.Command
|
||||||
|
|
||||||
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||||
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -52,7 +54,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
|||||||
return out, nil
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) {
|
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
|
||||||
if src == "" || target.Device == "" {
|
if src == "" || target.Device == "" {
|
||||||
return "", fmt.Errorf("source and target are required")
|
return "", fmt.Errorf("source and target are required")
|
||||||
}
|
}
|
||||||
@@ -62,20 +64,39 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
|||||||
|
|
||||||
mountpoint := strings.TrimSpace(target.Mountpoint)
|
mountpoint := strings.TrimSpace(target.Mountpoint)
|
||||||
mountedHere := false
|
mountedHere := false
|
||||||
|
mounted := mountpoint != ""
|
||||||
if mountpoint == "" {
|
if mountpoint == "" {
|
||||||
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
|
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
|
||||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||||
_ = os.Remove(mountpoint)
|
_ = os.Remove(mountpoint)
|
||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
mountedHere = true
|
mountedHere = true
|
||||||
|
mounted = true
|
||||||
}
|
}
|
||||||
|
defer func() {
|
||||||
|
if !mounted {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = exportExecCommand("sync").Run()
|
||||||
|
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
|
||||||
|
msg := strings.TrimSpace(string(raw))
|
||||||
|
if msg == "" {
|
||||||
|
retErr = err
|
||||||
|
} else {
|
||||||
|
retErr = fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if mountedHere {
|
||||||
|
_ = os.Remove(mountpoint)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
filename := filepath.Base(src)
|
filename := filepath.Base(src)
|
||||||
dst := filepath.Join(mountpoint, filename)
|
dst = filepath.Join(mountpoint, filename)
|
||||||
data, err := os.ReadFile(src)
|
data, err := os.ReadFile(src)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
@@ -83,12 +104,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
|||||||
if err := os.WriteFile(dst, data, 0644); err != nil {
|
if err := os.WriteFile(dst, data, 0644); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
_ = exec.Command("sync").Run()
|
|
||||||
|
|
||||||
if mountedHere {
|
|
||||||
_ = exec.Command("umount", mountpoint).Run()
|
|
||||||
_ = os.Remove(mountpoint)
|
|
||||||
}
|
|
||||||
|
|
||||||
return dst, nil
|
return dst, nil
|
||||||
}
|
}
|
||||||
|
|||||||
56
audit/internal/platform/export_test.go
Normal file
56
audit/internal/platform/export_test.go
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tmp := t.TempDir()
|
||||||
|
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||||
|
mountpoint := filepath.Join(tmp, "mnt")
|
||||||
|
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||||
|
t.Fatalf("mkdir mountpoint: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||||
|
t.Fatalf("write src: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var calls [][]string
|
||||||
|
oldExec := exportExecCommand
|
||||||
|
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
calls = append(calls, append([]string{name}, args...))
|
||||||
|
return exec.Command("sh", "-c", "exit 0")
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||||
|
|
||||||
|
s := &System{}
|
||||||
|
dst, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
Mountpoint: mountpoint,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ExportFileToTarget error: %v", err)
|
||||||
|
}
|
||||||
|
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
|
||||||
|
t.Fatalf("dst=%q want %q", got, want)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
|
||||||
|
t.Fatalf("exported file missing: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
foundUmount := false
|
||||||
|
for _, call := range calls {
|
||||||
|
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
|
||||||
|
foundUmount = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !foundUmount {
|
||||||
|
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -16,9 +16,6 @@ var runtimeRequiredTools = []string{
|
|||||||
"smartctl",
|
"smartctl",
|
||||||
"nvme",
|
"nvme",
|
||||||
"ipmitool",
|
"ipmitool",
|
||||||
"nvidia-smi",
|
|
||||||
"nvidia-bug-report.sh",
|
|
||||||
"bee-gpu-stress",
|
|
||||||
"dhclient",
|
"dhclient",
|
||||||
"mount",
|
"mount",
|
||||||
}
|
}
|
||||||
@@ -93,7 +90,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tool := range s.CheckTools(runtimeRequiredTools) {
|
vendor := s.DetectGPUVendor()
|
||||||
|
for _, tool := range s.runtimeToolStatuses(vendor) {
|
||||||
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
|
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
|
||||||
Name: tool.Name,
|
Name: tool.Name,
|
||||||
Path: tool.Path,
|
Path: tool.Path,
|
||||||
@@ -115,39 +113,7 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
lsmodText := commandText("lsmod")
|
s.collectGPURuntimeHealth(vendor, &health)
|
||||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
|
||||||
if !health.DriverReady {
|
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
||||||
Code: "nvidia_kernel_module_missing",
|
|
||||||
Severity: "warning",
|
|
||||||
Description: "NVIDIA kernel module is not loaded.",
|
|
||||||
})
|
|
||||||
}
|
|
||||||
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
||||||
Code: "nvidia_modeset_failed",
|
|
||||||
Severity: "warning",
|
|
||||||
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
|
||||||
})
|
|
||||||
}
|
|
||||||
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
|
||||||
health.DriverReady = true
|
|
||||||
}
|
|
||||||
|
|
||||||
health.CUDAReady = false
|
|
||||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
|
||||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
|
||||||
if err == nil {
|
|
||||||
health.CUDAReady = true
|
|
||||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
||||||
Code: "cuda_runtime_not_ready",
|
|
||||||
Severity: "warning",
|
|
||||||
Description: "CUDA runtime is not ready for GPU SAT.",
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||||
health.Status = "PARTIAL"
|
health.Status = "PARTIAL"
|
||||||
@@ -162,3 +128,87 @@ func commandText(name string, args ...string) string {
|
|||||||
}
|
}
|
||||||
return string(raw)
|
return string(raw)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||||
|
tools := s.CheckTools(runtimeRequiredTools)
|
||||||
|
switch vendor {
|
||||||
|
case "nvidia":
|
||||||
|
tools = append(tools, s.CheckTools([]string{
|
||||||
|
"nvidia-smi",
|
||||||
|
"nvidia-bug-report.sh",
|
||||||
|
"bee-gpu-stress",
|
||||||
|
})...)
|
||||||
|
case "amd":
|
||||||
|
tool := ToolStatus{Name: "rocm-smi"}
|
||||||
|
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||||
|
tool.Path = cmd[0]
|
||||||
|
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
|
||||||
|
tool.Path = cmd[1]
|
||||||
|
}
|
||||||
|
tool.OK = true
|
||||||
|
}
|
||||||
|
tools = append(tools, tool)
|
||||||
|
}
|
||||||
|
return tools
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||||
|
lsmodText := commandText("lsmod")
|
||||||
|
|
||||||
|
switch vendor {
|
||||||
|
case "nvidia":
|
||||||
|
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||||
|
if !health.DriverReady {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_kernel_module_missing",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "NVIDIA kernel module is not loaded.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_modeset_failed",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
||||||
|
health.DriverReady = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
||||||
|
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||||
|
if err == nil {
|
||||||
|
health.CUDAReady = true
|
||||||
|
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "cuda_runtime_not_ready",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "CUDA runtime is not ready for GPU SAT.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case "amd":
|
||||||
|
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
|
||||||
|
if !health.DriverReady {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "amdgpu_kernel_module_missing",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "AMD GPU driver is not loaded.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
out, err := runROCmSMI("--showproductname", "--csv")
|
||||||
|
if err == nil && strings.TrimSpace(string(out)) != "" {
|
||||||
|
health.CUDAReady = true
|
||||||
|
health.DriverReady = true
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "rocm_smi_unavailable",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "ROCm SMI is not available for AMD GPU SAT.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"archive/tar"
|
"archive/tar"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
@@ -15,6 +16,22 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var (
	// Indirections over os/exec, path/filepath and os so that command
	// execution and filesystem lookups can be substituted (presumably by
	// tests — confirm against the package's test files).
	satExecCommand = exec.Command
	satLookPath    = exec.LookPath
	satGlob        = filepath.Glob
	satStat        = os.Stat

	// Glob patterns for the rocm-smi executable under /opt, covering both
	// the unversioned and the versioned ROCm install directories.
	rocmSMIExecutableGlobs = []string{
		"/opt/rocm/bin/rocm-smi",
		"/opt/rocm-*/bin/rocm-smi",
	}
	// Glob patterns for the rocm_smi.py script under the same directories.
	rocmSMIScriptGlobs = []string{
		"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
		"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
	}
)
|
||||||
|
|
||||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||||
type NvidiaGPU struct {
|
type NvidiaGPU struct {
|
||||||
Index int
|
Index int
|
||||||
@@ -41,7 +58,7 @@ func (s *System) DetectGPUVendor() string {
|
|||||||
|
|
||||||
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
|
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
|
||||||
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
||||||
out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output()
|
out, err := runROCmSMI("--showproductname", "--csv")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("rocm-smi: %w", err)
|
return nil, fmt.Errorf("rocm-smi: %w", err)
|
||||||
}
|
}
|
||||||
@@ -337,12 +354,22 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
|
|
||||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
||||||
start := time.Now().UTC()
|
start := time.Now().UTC()
|
||||||
|
resolvedCmd, err := resolveSATCommand(cmd)
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||||
"cmd: "+strings.Join(cmd, " "),
|
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||||
)
|
)
|
||||||
|
if err != nil {
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||||
|
"rc: 1",
|
||||||
|
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
return []byte(err.Error() + "\n"), err
|
||||||
|
}
|
||||||
|
|
||||||
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
|
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||||
if len(env) > 0 {
|
if len(env) > 0 {
|
||||||
c.Env = append(os.Environ(), env...)
|
c.Env = append(os.Environ(), env...)
|
||||||
}
|
}
|
||||||
@@ -362,19 +389,11 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
|||||||
}
|
}
|
||||||
|
|
||||||
func listStorageDevices() ([]string, error) {
|
func listStorageDevices() ([]string, error) {
|
||||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
|
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
var devices []string
|
return parseStorageDevices(string(out)), nil
|
||||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
||||||
fields := strings.Fields(strings.TrimSpace(line))
|
|
||||||
if len(fields) != 2 || fields[1] != "disk" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
devices = append(devices, "/dev/"+fields[0])
|
|
||||||
}
|
|
||||||
return devices, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func storageSATCommands(devPath string) []satJob {
|
func storageSATCommands(devPath string) []satJob {
|
||||||
@@ -445,12 +464,22 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
|
|
||||||
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||||
start := time.Now().UTC()
|
start := time.Now().UTC()
|
||||||
|
resolvedCmd, err := resolveSATCommand(cmd)
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||||
"cmd: "+strings.Join(cmd, " "),
|
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||||
)
|
)
|
||||||
|
if err != nil {
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||||
|
"rc: 1",
|
||||||
|
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
return []byte(err.Error() + "\n"), err
|
||||||
|
}
|
||||||
|
|
||||||
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
|
||||||
|
|
||||||
rc := 0
|
rc := 0
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -465,6 +494,91 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
|||||||
return out, err
|
return out, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runROCmSMI(args ...string) ([]byte, error) {
|
||||||
|
cmd, err := resolveROCmSMICommand(args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveSATCommand(cmd []string) ([]string, error) {
|
||||||
|
if len(cmd) == 0 {
|
||||||
|
return nil, errors.New("empty SAT command")
|
||||||
|
}
|
||||||
|
if cmd[0] != "rocm-smi" {
|
||||||
|
return cmd, nil
|
||||||
|
}
|
||||||
|
return resolveROCmSMICommand(cmd[1:]...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||||
|
if path, err := satLookPath("rocm-smi"); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, path := range rocmSMIExecutableCandidates() {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
pythonPath, pyErr := satLookPath("python3")
|
||||||
|
if pyErr == nil {
|
||||||
|
for _, script := range rocmSMIScriptCandidates() {
|
||||||
|
cmd := []string{pythonPath, script}
|
||||||
|
cmd = append(cmd, args...)
|
||||||
|
return cmd, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||||
|
}
|
||||||
|
|
||||||
|
func rocmSMIExecutableCandidates() []string {
|
||||||
|
return expandExistingPaths(rocmSMIExecutableGlobs)
|
||||||
|
}
|
||||||
|
|
||||||
|
func rocmSMIScriptCandidates() []string {
|
||||||
|
return expandExistingPaths(rocmSMIScriptGlobs)
|
||||||
|
}
|
||||||
|
|
||||||
|
func expandExistingPaths(patterns []string) []string {
|
||||||
|
seen := make(map[string]struct{})
|
||||||
|
var paths []string
|
||||||
|
for _, pattern := range patterns {
|
||||||
|
matches, err := satGlob(pattern)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sort.Strings(matches)
|
||||||
|
for _, match := range matches {
|
||||||
|
if _, err := satStat(match); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := seen[match]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[match] = struct{}{}
|
||||||
|
paths = append(paths, match)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return paths
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseStorageDevices extracts device paths from whitespace-separated
// "NAME TYPE [TRAN]" lines. Only lines whose second column is exactly "disk"
// are kept, and disks whose third column is "usb" (in any letter case) are
// dropped. Each kept name is returned with a "/dev/" prefix, in input order.
// The result is nil when no line qualifies.
func parseStorageDevices(raw string) []string {
	var devices []string
	for _, row := range strings.Split(strings.TrimSpace(raw), "\n") {
		cols := strings.Fields(row)
		switch {
		case len(cols) < 2 || cols[1] != "disk":
			// Too few columns, or not a whole disk.
		case len(cols) >= 3 && strings.EqualFold(cols[2], "usb"):
			// USB-attached disk: excluded.
		default:
			devices = append(devices, "/dev/"+cols[0])
		}
	}
	return devices
}
|
||||||
|
|
||||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package platform
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -91,3 +93,90 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
|
||||||
|
got := parseStorageDevices(raw)
|
||||||
|
want := []string{"/dev/nvme0n1", "/dev/sdb"}
|
||||||
|
if len(got) != len(want) {
|
||||||
|
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
||||||
|
t.Setenv("PATH", t.TempDir())
|
||||||
|
|
||||||
|
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
|
||||||
|
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||||
|
t.Fatalf("write rocm-smi: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd, err := resolveROCmSMICommand("--showproductname")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cmd) != 2 {
|
||||||
|
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||||
|
}
|
||||||
|
if cmd[0] != toolPath {
|
||||||
|
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||||
|
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
|
||||||
|
t.Fatalf("mkdir: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||||
|
t.Fatalf("write rocm-smi: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
oldGlob := rocmSMIExecutableGlobs
|
||||||
|
oldScriptGlobs := rocmSMIScriptGlobs
|
||||||
|
rocmSMIExecutableGlobs = []string{execPath}
|
||||||
|
rocmSMIScriptGlobs = nil
|
||||||
|
t.Cleanup(func() {
|
||||||
|
rocmSMIExecutableGlobs = oldGlob
|
||||||
|
rocmSMIScriptGlobs = oldScriptGlobs
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Setenv("PATH", "")
|
||||||
|
|
||||||
|
cmd, err := resolveROCmSMICommand("--showallinfo")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cmd) != 2 {
|
||||||
|
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||||
|
}
|
||||||
|
if cmd[0] != execPath {
|
||||||
|
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
oldExecGlobs := rocmSMIExecutableGlobs
|
||||||
|
oldScriptGlobs := rocmSMIScriptGlobs
|
||||||
|
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
|
||||||
|
rocmSMIExecutableGlobs = nil
|
||||||
|
rocmSMIScriptGlobs = nil
|
||||||
|
t.Cleanup(func() {
|
||||||
|
satLookPath = oldLookPath
|
||||||
|
rocmSMIExecutableGlobs = oldExecGlobs
|
||||||
|
rocmSMIScriptGlobs = oldScriptGlobs
|
||||||
|
})
|
||||||
|
|
||||||
|
if _, err := runROCmSMI("--showproductname"); err == nil {
|
||||||
|
t.Fatal("expected missing rocm-smi error")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -24,15 +24,23 @@ var techDumpFixedCommands = []struct {
|
|||||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||||
|
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||||
|
}
|
||||||
|
|
||||||
|
var techDumpNvidiaCommands = []struct {
|
||||||
|
Name string
|
||||||
|
Args []string
|
||||||
|
File string
|
||||||
|
}{
|
||||||
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
||||||
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
||||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type lsblkDumpRoot struct {
|
type lsblkDumpRoot struct {
|
||||||
Blockdevices []struct {
|
Blockdevices []struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
|
Tran string `json:"tran"`
|
||||||
} `json:"blockdevices"`
|
} `json:"blockdevices"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -50,6 +58,15 @@ func (s *System) CaptureTechnicalDump(baseDir string) error {
|
|||||||
for _, cmd := range techDumpFixedCommands {
|
for _, cmd := range techDumpFixedCommands {
|
||||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||||
}
|
}
|
||||||
|
switch s.DetectGPUVendor() {
|
||||||
|
case "nvidia":
|
||||||
|
for _, cmd := range techDumpNvidiaCommands {
|
||||||
|
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||||
|
}
|
||||||
|
case "amd":
|
||||||
|
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
|
||||||
|
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
|
||||||
|
}
|
||||||
|
|
||||||
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
|
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
|
||||||
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
|
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
|
||||||
@@ -69,6 +86,14 @@ func writeCommandDump(path, name string, args ...string) {
|
|||||||
_ = os.WriteFile(path, out, 0644)
|
_ = os.WriteFile(path, out, 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func writeROCmSMIDump(path string, args ...string) {
|
||||||
|
out, err := runROCmSMI(args...)
|
||||||
|
if err != nil && len(out) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(path, out, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
func lsblkDumpDevices(path string) []string {
|
func lsblkDumpDevices(path string) []string {
|
||||||
raw, err := os.ReadFile(path)
|
raw, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -80,6 +105,9 @@ func lsblkDumpDevices(path string) []string {
|
|||||||
}
|
}
|
||||||
var devices []string
|
var devices []string
|
||||||
for _, dev := range root.Blockdevices {
|
for _, dev := range root.Blockdevices {
|
||||||
|
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
|
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
|
||||||
devices = append(devices, strings.TrimSpace(dev.Name))
|
devices = append(devices, strings.TrimSpace(dev.Name))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,12 +12,12 @@ func TestLSBLKDumpDevices(t *testing.T) {
|
|||||||
|
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "lsblk.json")
|
path := filepath.Join(dir, "lsblk.json")
|
||||||
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk"}]}`), 0644); err != nil {
|
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
|
||||||
t.Fatalf("write lsblk fixture: %v", err)
|
t.Fatalf("write lsblk fixture: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
got := lsblkDumpDevices(path)
|
got := lsblkDumpDevices(path)
|
||||||
want := []string{"nvme0n1", "sda"}
|
want := []string{"nvme0n1", "sdb"}
|
||||||
if !reflect.DeepEqual(got, want) {
|
if !reflect.DeepEqual(got, want) {
|
||||||
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
|
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -60,6 +60,14 @@ apt-get update -qq
|
|||||||
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
|
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
|
||||||
if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
|
if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
|
||||||
echo "=== AMD ROCm: rocm-smi installed ==="
|
echo "=== AMD ROCm: rocm-smi installed ==="
|
||||||
|
if [ -x /opt/rocm/bin/rocm-smi ]; then
|
||||||
|
ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi
|
||||||
|
else
|
||||||
|
candidate="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
|
||||||
|
if [ -n "${candidate}" ]; then
|
||||||
|
ln -sf "${candidate}" /usr/local/bin/rocm-smi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
rocm-smi --version 2>/dev/null || true
|
rocm-smi --version 2>/dev/null || true
|
||||||
else
|
else
|
||||||
echo "WARN: rocm-smi-lib install failed — GPU monitoring unavailable"
|
echo "WARN: rocm-smi-lib install failed — GPU monitoring unavailable"
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
export PATH="$PATH:/usr/local/bin"
|
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||||
|
|
||||||
menu() {
|
menu() {
|
||||||
if [ -x /usr/local/bin/bee-tui ]; then
|
if [ -x /usr/local/bin/bee-tui ]; then
|
||||||
|
|||||||
Reference in New Issue
Block a user