Compare commits

...

6 Commits

Author SHA1 Message Date
Mikhail Chusavitin
9a1df9b1ba Tighten support bundles and fix AMD runtime checks 2026-03-25 19:35:25 +03:00
Mikhail Chusavitin
30cf014d58 Rename NVIDIA bootloader modes 2026-03-25 19:12:26 +03:00
Mikhail Chusavitin
27d478aed6 Add bootloader choice for safe vs full NVIDIA boot 2026-03-25 19:11:15 +03:00
Mikhail Chusavitin
d36e8442a9 Stabilize live ISO consoles and NVIDIA boot path 2026-03-25 19:05:18 +03:00
Mikhail Chusavitin
b345b0d14d Derive ISO version from git tags 2026-03-25 18:40:48 +03:00
Mikhail Chusavitin
0a1ac2ab9f Bootstrap ROCm hook prerequisites in ISO build 2026-03-25 18:38:19 +03:00
28 changed files with 926 additions and 123 deletions

View File

@@ -173,11 +173,20 @@ func (a *App) RuntimeHealthResult() ActionResult {
if err != nil {
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
}
driverLabel := "Driver ready"
accelLabel := "CUDA ready"
switch a.sat.DetectGPUVendor() {
case "amd":
driverLabel = "AMDGPU ready"
accelLabel = "ROCm SMI ready"
case "nvidia":
driverLabel = "NVIDIA ready"
}
var body strings.Builder
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
fmt.Fprintf(&body, "Driver ready: %t\n", health.DriverReady)
fmt.Fprintf(&body, "CUDA ready: %t\n", health.CUDAReady)
fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
if len(health.Issues) > 0 {
body.WriteString("\n\nIssues:\n")
@@ -238,9 +247,9 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportSupportBundle(target)
body := "Support bundle exported."
body := "Support bundle exported. USB target unmounted and safe to remove."
if path != "" {
body = "Support bundle exported to " + path
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
}
return ActionResult{Title: "Export support bundle", Body: body}, err
}

View File

@@ -1,9 +1,12 @@
package app
import (
"archive/tar"
"compress/gzip"
"context"
"encoding/json"
"errors"
"io"
"os"
"path/filepath"
"testing"
@@ -57,13 +60,22 @@ func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (str
return f.serviceDoFn(name, action)
}
type fakeExports struct{}
type fakeExports struct {
listTargetsFn func() ([]platform.RemovableTarget, error)
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
}
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
if f.listTargetsFn != nil {
return f.listTargetsFn()
}
return nil, nil
}
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
if f.exportToTargetFn != nil {
return f.exportToTargetFn(src, target)
}
return "", nil
}
@@ -97,10 +109,14 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
}
type fakeSAT struct {
runNvidiaFn func(string) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
runNvidiaFn func(string) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
detectVendorFn func() string
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
runAMDPackFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
}
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
@@ -112,6 +128,9 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
}
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
if f.listNvidiaGPUsFn != nil {
return f.listNvidiaGPUsFn()
}
return nil, nil
}
@@ -130,11 +149,26 @@ func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string,
return "", nil
}
func (f fakeSAT) DetectGPUVendor() string { return "" }
func (f fakeSAT) DetectGPUVendor() string {
if f.detectVendorFn != nil {
return f.detectVendorFn()
}
return ""
}
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return nil, nil }
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
if f.listAMDGPUsFn != nil {
return f.listAMDGPUsFn()
}
return nil, nil
}
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) { return "", nil }
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
if f.runAMDPackFn != nil {
return f.runAMDPackFn(baseDir)
}
return "", nil
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()
@@ -394,6 +428,44 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
}
}
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
oldExportDir := DefaultExportDir
DefaultExportDir = tmp
t.Cleanup(func() { DefaultExportDir = oldExportDir })
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
t.Fatalf("write bee-audit.json: %v", err)
}
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
t.Fatalf("write bee-audit.log: %v", err)
}
a := &App{
exports: fakeExports{
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
if filepath.Base(src) == "" {
t.Fatalf("expected non-empty source path")
}
return "/media/bee/" + filepath.Base(src), nil
},
},
}
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
if err != nil {
t.Fatalf("ExportSupportBundleResult error: %v", err)
}
if result.Title != "Export support bundle" {
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
}
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
}
}
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
t.Parallel()
@@ -516,6 +588,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
t.Fatal(err)
}
archive, err := BuildSupportBundle(exportDir)
if err != nil {
@@ -524,6 +599,44 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if _, err := os.Stat(archive); err != nil {
t.Fatalf("archive stat: %v", err)
}
file, err := os.Open(archive)
if err != nil {
t.Fatalf("open archive: %v", err)
}
defer file.Close()
gzr, err := gzip.NewReader(file)
if err != nil {
t.Fatalf("gzip reader: %v", err)
}
defer gzr.Close()
tr := tar.NewReader(gzr)
var names []string
for {
hdr, err := tr.Next()
if errors.Is(err, io.EOF) {
break
}
if err != nil {
t.Fatalf("read tar entry: %v", err)
}
names = append(names, hdr.Name)
}
var foundRaw bool
for _, name := range names {
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
foundRaw = true
}
if contains(name, "/export/bee-sat/memory-run.tar.gz") {
t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
}
}
if !foundRaw {
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
}
}
func TestMainBanner(t *testing.T) {
@@ -600,6 +713,44 @@ func TestMainBanner(t *testing.T) {
}
}
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
tmp := t.TempDir()
oldRuntimePath := DefaultRuntimeJSONPath
DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })
raw, err := json.Marshal(schema.RuntimeHealth{
Status: "OK",
ExportDir: "/appdata/bee/export",
DriverReady: true,
CUDAReady: true,
NetworkStatus: "OK",
})
if err != nil {
t.Fatalf("marshal runtime health: %v", err)
}
if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
t.Fatalf("write runtime health: %v", err)
}
a := &App{
sat: fakeSAT{
detectVendorFn: func() string { return "amd" },
},
}
result := a.RuntimeHealthResult()
if !contains(result.Body, "AMDGPU ready: true") {
t.Fatalf("body missing AMD driver label:\n%s", result.Body)
}
if !contains(result.Body, "ROCm SMI ready: true") {
t.Fatalf("body missing ROCm label:\n%s", result.Body)
}
if contains(result.Body, "CUDA ready") {
t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
}
}
func intPtr(v int) *int { return &v }
func contains(haystack, needle string) bool {

View File

@@ -56,7 +56,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
}
defer os.RemoveAll(stageRoot)
if err := copyDirContents(exportDir, filepath.Join(stageRoot, "export")); err != nil {
if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
return "", err
}
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
@@ -214,6 +214,40 @@ func copyDirContents(srcDir, dstDir string) error {
return nil
}
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
if cleanRel == "" {
return true
}
if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
return false
}
if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
return false
}
return true
})
}
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
entries, err := os.ReadDir(srcDir)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
for _, entry := range entries {
src := filepath.Join(srcDir, entry.Name())
dst := filepath.Join(dstDir, entry.Name())
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
return err
}
}
return nil
}
func copyPath(src, dst string) error {
info, err := os.Stat(src)
if err != nil {
@@ -254,6 +288,36 @@ func copyPath(src, dst string) error {
return err
}
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
info, err := os.Stat(src)
if err != nil {
return err
}
rel, err := filepath.Rel(rootSrc, src)
if err != nil {
return err
}
if keep != nil && !keep(rel, info) {
return nil
}
if info.IsDir() {
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
return err
}
entries, err := os.ReadDir(src)
if err != nil {
return err
}
for _, entry := range entries {
if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
return err
}
}
return nil
}
return copyPath(src, dst)
}
func createSupportTarGz(dst, srcDir string) error {
file, err := os.Create(dst)
if err != nil {

View File

@@ -9,8 +9,10 @@ import (
"strings"
)
var exportExecCommand = exec.Command
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
if err != nil {
return nil, err
}
@@ -52,7 +54,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
return out, nil
}
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) {
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
if src == "" || target.Device == "" {
return "", fmt.Errorf("source and target are required")
}
@@ -62,20 +64,39 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
mountpoint := strings.TrimSpace(target.Mountpoint)
mountedHere := false
mounted := mountpoint != ""
if mountpoint == "" {
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
if err := os.MkdirAll(mountpoint, 0755); err != nil {
return "", err
}
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
_ = os.Remove(mountpoint)
return string(raw), err
}
mountedHere = true
mounted = true
}
defer func() {
if !mounted {
return
}
_ = exportExecCommand("sync").Run()
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
msg := strings.TrimSpace(string(raw))
if msg == "" {
retErr = err
} else {
retErr = fmt.Errorf("%s: %w", msg, err)
}
}
if mountedHere {
_ = os.Remove(mountpoint)
}
}()
filename := filepath.Base(src)
dst := filepath.Join(mountpoint, filename)
dst = filepath.Join(mountpoint, filename)
data, err := os.ReadFile(src)
if err != nil {
return "", err
@@ -83,12 +104,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
if err := os.WriteFile(dst, data, 0644); err != nil {
return "", err
}
_ = exec.Command("sync").Run()
if mountedHere {
_ = exec.Command("umount", mountpoint).Run()
_ = os.Remove(mountpoint)
}
return dst, nil
}

View File

@@ -0,0 +1,56 @@
package platform
import (
"os"
"os/exec"
"path/filepath"
"testing"
)
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt")
if err := os.MkdirAll(mountpoint, 0755); err != nil {
t.Fatalf("mkdir mountpoint: %v", err)
}
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
t.Fatalf("write src: %v", err)
}
var calls [][]string
oldExec := exportExecCommand
exportExecCommand = func(name string, args ...string) *exec.Cmd {
calls = append(calls, append([]string{name}, args...))
return exec.Command("sh", "-c", "exit 0")
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
dst, err := s.ExportFileToTarget(src, RemovableTarget{
Device: "/dev/sdb1",
Mountpoint: mountpoint,
})
if err != nil {
t.Fatalf("ExportFileToTarget error: %v", err)
}
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
t.Fatalf("dst=%q want %q", got, want)
}
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
t.Fatalf("exported file missing: %v", err)
}
foundUmount := false
for _, call := range calls {
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
foundUmount = true
break
}
}
if !foundUmount {
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
}
}

View File

@@ -16,9 +16,6 @@ var runtimeRequiredTools = []string{
"smartctl",
"nvme",
"ipmitool",
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
"dhclient",
"mount",
}
@@ -93,7 +90,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
}
}
for _, tool := range s.CheckTools(runtimeRequiredTools) {
vendor := s.DetectGPUVendor()
for _, tool := range s.runtimeToolStatuses(vendor) {
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
Name: tool.Name,
Path: tool.Path,
@@ -115,39 +113,7 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
})
}
lsmodText := commandText("lsmod")
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
health.CUDAReady = false
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
s.collectGPURuntimeHealth(vendor, &health)
if health.Status != "FAILED" && len(health.Issues) > 0 {
health.Status = "PARTIAL"
@@ -162,3 +128,87 @@ func commandText(name string, args ...string) string {
}
return string(raw)
}
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
tools := s.CheckTools(runtimeRequiredTools)
switch vendor {
case "nvidia":
tools = append(tools, s.CheckTools([]string{
"nvidia-smi",
"nvidia-bug-report.sh",
"bee-gpu-stress",
})...)
case "amd":
tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
tool.Path = cmd[0]
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
tool.Path = cmd[1]
}
tool.OK = true
}
tools = append(tools, tool)
}
return tools
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod")
switch vendor {
case "nvidia":
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_kernel_module_missing",
Severity: "warning",
Description: "NVIDIA kernel module is not loaded.",
})
}
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_modeset_failed",
Severity: "warning",
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
})
}
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
health.DriverReady = true
}
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
if err == nil {
health.CUDAReady = true
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "cuda_runtime_not_ready",
Severity: "warning",
Description: "CUDA runtime is not ready for GPU SAT.",
})
}
}
case "amd":
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "amdgpu_kernel_module_missing",
Severity: "warning",
Description: "AMD GPU driver is not loaded.",
})
}
out, err := runROCmSMI("--showproductname", "--csv")
if err == nil && strings.TrimSpace(string(out)) != "" {
health.CUDAReady = true
health.DriverReady = true
return
}
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "rocm_smi_unavailable",
Severity: "warning",
Description: "ROCm SMI is not available for AMD GPU SAT.",
})
}
}

View File

@@ -4,6 +4,7 @@ import (
"archive/tar"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
"os"
@@ -15,6 +16,22 @@ import (
"time"
)
var (
satExecCommand = exec.Command
satLookPath = exec.LookPath
satGlob = filepath.Glob
satStat = os.Stat
rocmSMIExecutableGlobs = []string{
"/opt/rocm/bin/rocm-smi",
"/opt/rocm-*/bin/rocm-smi",
}
rocmSMIScriptGlobs = []string{
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
}
)
// NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct {
Index int
@@ -41,7 +58,7 @@ func (s *System) DetectGPUVendor() string {
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output()
out, err := runROCmSMI("--showproductname", "--csv")
if err != nil {
return nil, fmt.Errorf("rocm-smi: %w", err)
}
@@ -337,12 +354,22 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(cmd, " "),
"cmd: "+strings.Join(resolvedCmd, " "),
)
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
"rc: 1",
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
"",
)
return []byte(err.Error() + "\n"), err
}
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
if len(env) > 0 {
c.Env = append(os.Environ(), env...)
}
@@ -362,19 +389,11 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
}
func listStorageDevices() ([]string, error) {
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
if err != nil {
return nil, err
}
var devices []string
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
fields := strings.Fields(strings.TrimSpace(line))
if len(fields) != 2 || fields[1] != "disk" {
continue
}
devices = append(devices, "/dev/"+fields[0])
}
return devices, nil
return parseStorageDevices(string(out)), nil
}
func storageSATCommands(devPath string) []satJob {
@@ -445,12 +464,22 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(cmd, " "),
"cmd: "+strings.Join(resolvedCmd, " "),
)
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
"rc: 1",
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
"",
)
return []byte(err.Error() + "\n"), err
}
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
rc := 0
if err != nil {
@@ -465,6 +494,91 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
return out, err
}
func runROCmSMI(args ...string) ([]byte, error) {
cmd, err := resolveROCmSMICommand(args...)
if err != nil {
return nil, err
}
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
}
func resolveSATCommand(cmd []string) ([]string, error) {
if len(cmd) == 0 {
return nil, errors.New("empty SAT command")
}
if cmd[0] != "rocm-smi" {
return cmd, nil
}
return resolveROCmSMICommand(cmd[1:]...)
}
func resolveROCmSMICommand(args ...string) ([]string, error) {
if path, err := satLookPath("rocm-smi"); err == nil {
return append([]string{path}, args...), nil
}
for _, path := range rocmSMIExecutableCandidates() {
return append([]string{path}, args...), nil
}
pythonPath, pyErr := satLookPath("python3")
if pyErr == nil {
for _, script := range rocmSMIScriptCandidates() {
cmd := []string{pythonPath, script}
cmd = append(cmd, args...)
return cmd, nil
}
}
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
}
func rocmSMIExecutableCandidates() []string {
return expandExistingPaths(rocmSMIExecutableGlobs)
}
func rocmSMIScriptCandidates() []string {
return expandExistingPaths(rocmSMIScriptGlobs)
}
func expandExistingPaths(patterns []string) []string {
seen := make(map[string]struct{})
var paths []string
for _, pattern := range patterns {
matches, err := satGlob(pattern)
if err != nil {
continue
}
sort.Strings(matches)
for _, match := range matches {
if _, err := satStat(match); err != nil {
continue
}
if _, ok := seen[match]; ok {
continue
}
seen[match] = struct{}{}
paths = append(paths, match)
}
}
return paths
}
func parseStorageDevices(raw string) []string {
var devices []string
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
fields := strings.Fields(strings.TrimSpace(line))
if len(fields) < 2 || fields[1] != "disk" {
continue
}
if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
continue
}
devices = append(devices, "/dev/"+fields[0])
}
return devices
}
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {

View File

@@ -3,6 +3,8 @@ package platform
import (
"errors"
"os"
"os/exec"
"path/filepath"
"testing"
)
@@ -91,3 +93,90 @@ func TestClassifySATResult(t *testing.T) {
})
}
}
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
t.Parallel()
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
got := parseStorageDevices(raw)
want := []string{"/dev/nvme0n1", "/dev/sdb"}
if len(got) != len(want) {
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
}
}
}
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
t.Setenv("PATH", t.TempDir())
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
t.Fatalf("write rocm-smi: %v", err)
}
cmd, err := resolveROCmSMICommand("--showproductname")
if err != nil {
t.Fatalf("resolveROCmSMICommand error: %v", err)
}
if len(cmd) != 2 {
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
}
if cmd[0] != toolPath {
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
}
}
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
tmp := t.TempDir()
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
t.Fatalf("mkdir: %v", err)
}
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
t.Fatalf("write rocm-smi: %v", err)
}
oldGlob := rocmSMIExecutableGlobs
oldScriptGlobs := rocmSMIScriptGlobs
rocmSMIExecutableGlobs = []string{execPath}
rocmSMIScriptGlobs = nil
t.Cleanup(func() {
rocmSMIExecutableGlobs = oldGlob
rocmSMIScriptGlobs = oldScriptGlobs
})
t.Setenv("PATH", "")
cmd, err := resolveROCmSMICommand("--showallinfo")
if err != nil {
t.Fatalf("resolveROCmSMICommand error: %v", err)
}
if len(cmd) != 2 {
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
}
if cmd[0] != execPath {
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
}
}
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
oldLookPath := satLookPath
oldExecGlobs := rocmSMIExecutableGlobs
oldScriptGlobs := rocmSMIScriptGlobs
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
rocmSMIExecutableGlobs = nil
rocmSMIScriptGlobs = nil
t.Cleanup(func() {
satLookPath = oldLookPath
rocmSMIExecutableGlobs = oldExecGlobs
rocmSMIScriptGlobs = oldScriptGlobs
})
if _, err := runROCmSMI("--showproductname"); err == nil {
t.Fatal("expected missing rocm-smi error")
}
}

View File

@@ -24,15 +24,23 @@ var techDumpFixedCommands = []struct {
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
}
var techDumpNvidiaCommands = []struct {
Name string
Args []string
File string
}{
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
}
type lsblkDumpRoot struct {
Blockdevices []struct {
Name string `json:"name"`
Type string `json:"type"`
Tran string `json:"tran"`
} `json:"blockdevices"`
}
@@ -50,6 +58,15 @@ func (s *System) CaptureTechnicalDump(baseDir string) error {
for _, cmd := range techDumpFixedCommands {
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
}
switch s.DetectGPUVendor() {
case "nvidia":
for _, cmd := range techDumpNvidiaCommands {
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
}
case "amd":
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
}
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
@@ -69,6 +86,14 @@ func writeCommandDump(path, name string, args ...string) {
_ = os.WriteFile(path, out, 0644)
}
func writeROCmSMIDump(path string, args ...string) {
out, err := runROCmSMI(args...)
if err != nil && len(out) == 0 {
return
}
_ = os.WriteFile(path, out, 0644)
}
func lsblkDumpDevices(path string) []string {
raw, err := os.ReadFile(path)
if err != nil {
@@ -80,6 +105,9 @@ func lsblkDumpDevices(path string) []string {
}
var devices []string
for _, dev := range root.Blockdevices {
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
continue
}
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
devices = append(devices, strings.TrimSpace(dev.Name))
}

View File

@@ -12,12 +12,12 @@ func TestLSBLKDumpDevices(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "lsblk.json")
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk"}]}`), 0644); err != nil {
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
t.Fatalf("write lsblk fixture: %v", err)
}
got := lsblkDumpDevices(path)
want := []string{"nvme0n1", "sda"}
want := []string{"nvme0n1", "sdb"}
if !reflect.DeepEqual(got, want) {
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
}

View File

@@ -32,6 +32,6 @@ lb config noauto \
--memtest none \
--iso-volume "EASY-BEE" \
--iso-application "EASY-BEE" \
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
--bootappend-live "boot=live components console=ttyS0,115200n8 console=ttyS1,115200n8 loglevel=7 systemd.log_target=console systemd.journald.forward_to_console=1 systemd.journald.max_level_console=debug username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
--apt-recommends false \
"${@}"

View File

@@ -34,6 +34,43 @@ mkdir -p "${CACHE_ROOT}"
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
export GOCACHE GOMODCACHE
resolve_audit_version() {
if [ -n "${BEE_AUDIT_VERSION:-}" ]; then
echo "${BEE_AUDIT_VERSION}"
return 0
fi
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
if [ -z "${tag}" ]; then
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v*' --abbrev=7 --dirty 2>/dev/null || true)"
fi
case "${tag}" in
audit/v*)
echo "${tag#audit/v}"
return 0
;;
v*)
echo "${tag#v}"
return 0
;;
"")
;;
*)
echo "${tag}"
return 0
;;
esac
if [ -n "${AUDIT_VERSION:-}" ]; then
echo "${AUDIT_VERSION}"
return 0
fi
date +%Y%m%d
}
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
# If headers for the detected ABI are not yet installed (kernel updated since image build),
# install them on the fly so NVIDIA modules and ISO kernel always match.
@@ -64,6 +101,7 @@ fi
echo "=== bee ISO build ==="
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}"
echo ""
echo "=== syncing git submodules ==="
@@ -83,7 +121,7 @@ if [ "$NEED_BUILD" = "1" ]; then
cd "${REPO_ROOT}/audit"
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
go build \
-ldflags "-s -w -X main.Version=${AUDIT_VERSION:-$(date +%Y%m%d)}" \
-ldflags "-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}" \
-o "$BEE_BIN" \
./cmd/bee
echo "binary: $BEE_BIN"
@@ -230,8 +268,8 @@ mkdir -p "${OVERLAY_STAGE_DIR}/etc"
BUILD_DATE="$(date +%Y-%m-%d)"
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
BEE_ISO_VERSION=${AUDIT_VERSION}
BEE_AUDIT_VERSION=${AUDIT_VERSION}
BEE_ISO_VERSION=${AUDIT_VERSION_EFFECTIVE}
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
BUILD_DATE=${BUILD_DATE}
GIT_COMMIT=${GIT_COMMIT}
DEBIAN_VERSION=${DEBIAN_VERSION}
@@ -272,7 +310,7 @@ lb build 2>&1
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${AUDIT_VERSION}-amd64.iso"
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${AUDIT_VERSION_EFFECTIVE}-amd64.iso"
if [ -f "$ISO_RAW" ]; then
cp "$ISO_RAW" "$ISO_OUT"
echo ""

View File

@@ -10,12 +10,17 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝
echo ""
menuentry "EASY-BEE" {
linux @KERNEL_LIVE@ @APPEND_LIVE@
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (NVIDIA GSP=off)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (fail-safe)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
initrd @INITRD_LIVE@
}

View File

@@ -0,0 +1,18 @@
label live-@FLAVOUR@-normal
menu label ^EASY-BEE
menu default
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=normal
label live-@FLAVOUR@-gsp-off
menu label EASY-BEE (^NVIDIA GSP=off)
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off
label live-@FLAVOUR@-failsafe
menu label EASY-BEE (^fail-safe)
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal

View File

@@ -5,6 +5,21 @@ set -e
echo "=== bee chroot setup ==="
ensure_bee_console_user() {
if id bee >/dev/null 2>&1; then
usermod -d /home/bee -s /bin/sh bee 2>/dev/null || true
else
useradd -d /home/bee -m -s /bin/sh -U bee
fi
mkdir -p /home/bee
chown -R bee:bee /home/bee
echo "bee:eeb" | chpasswd
usermod -aG sudo bee 2>/dev/null || true
}
ensure_bee_console_user
# Enable bee services
systemctl enable bee-network.service
systemctl enable bee-nvidia.service
@@ -15,6 +30,8 @@ systemctl enable bee-sshsetup.service
systemctl enable ssh.service
systemctl enable qemu-guest-agent.service 2>/dev/null || true
systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
# Ensure scripts are executable
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
@@ -23,6 +40,7 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
chmod +x /usr/local/bin/bee-tui 2>/dev/null || true
chmod +x /usr/local/bin/bee 2>/dev/null || true
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
# Reload udev rules
udevadm control --reload-rules 2>/dev/null || true

View File

@@ -8,14 +8,45 @@ set -e
ROCM_VERSION="6.4"
ROCM_KEYRING="/etc/apt/keyrings/rocm.gpg"
ROCM_LIST="/etc/apt/sources.list.d/rocm.list"
APT_UPDATED=0
echo "=== AMD ROCm ${ROCM_VERSION}: adding repository ==="
mkdir -p /etc/apt/keyrings
ensure_tool() {
tool="$1"
pkg="$2"
if command -v "${tool}" >/dev/null 2>&1; then
return 0
fi
if [ "${APT_UPDATED}" -eq 0 ]; then
apt-get update -qq
APT_UPDATED=1
fi
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}"
}
ensure_cert_bundle() {
if [ -s /etc/ssl/certs/ca-certificates.crt ]; then
return 0
fi
if [ "${APT_UPDATED}" -eq 0 ]; then
apt-get update -qq
APT_UPDATED=1
fi
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates
}
# live-build chroot may not include fetch/signing tools yet
if ! ensure_cert_bundle || ! ensure_tool wget wget || ! ensure_tool gpg gpg; then
echo "WARN: failed to install wget/gpg/ca-certificates prerequisites — skipping ROCm install"
exit 0
fi
# Download and import AMD GPG key
if ! wget -qO- "https://repo.radeon.com/rocm/rocm.gpg.key" \
| gpg --dearmor > "${ROCM_KEYRING}"; then
| gpg --dearmor --yes --output "${ROCM_KEYRING}"; then
echo "WARN: failed to fetch AMD ROCm GPG key — skipping ROCm install"
exit 0
fi
@@ -29,6 +60,14 @@ apt-get update -qq
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
echo "=== AMD ROCm: rocm-smi installed ==="
if [ -x /opt/rocm/bin/rocm-smi ]; then
ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi
else
candidate="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
if [ -n "${candidate}" ]; then
ln -sf "${candidate}" /usr/local/bin/rocm-smi
fi
fi
rocm-smi --version 2>/dev/null || true
else
echo "WARN: rocm-smi-lib install failed — GPU monitoring unavailable"

View File

@@ -26,6 +26,15 @@ echo ""
KVER=$(uname -r)
info "kernel: $KVER"
NVIDIA_BOOT_MODE="normal"
for arg in $(cat /proc/cmdline 2>/dev/null); do
case "$arg" in
bee.nvidia.mode=*)
NVIDIA_BOOT_MODE="${arg#*=}"
;;
esac
done
info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
# --- PATH & binaries ---
echo "-- PATH & binaries --"
@@ -53,17 +62,25 @@ else
fail "NVIDIA ko dir missing: $KO_DIR"
fi
for mod in nvidia nvidia_modeset nvidia_uvm; do
if /sbin/lsmod 2>/dev/null | grep -q "^nvidia "; then
ok "module loaded: nvidia"
else
fail "module NOT loaded: nvidia"
fi
for mod in nvidia_modeset nvidia_uvm; do
if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
ok "module loaded: $mod"
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
fail "module NOT loaded in normal mode: $mod"
else
fail "module NOT loaded: $mod"
warn "module not loaded in GSP-off mode: $mod"
fi
done
echo ""
echo "-- NVIDIA device nodes --"
for dev in nvidiactl nvidia0 nvidia-uvm; do
for dev in nvidiactl nvidia0; do
if [ -e "/dev/$dev" ]; then
ok "/dev/$dev exists"
else
@@ -71,6 +88,14 @@ for dev in nvidiactl nvidia0 nvidia-uvm; do
fi
done
if [ -e /dev/nvidia-uvm ]; then
ok "/dev/nvidia-uvm exists"
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
fail "/dev/nvidia-uvm missing in normal mode"
else
warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
fi
echo ""
echo "-- nvidia-smi --"
if PATH="/usr/local/bin:$PATH" command -v nvidia-smi >/dev/null 2>&1; then

View File

@@ -1,4 +1,4 @@
export PATH="$PATH:/usr/local/bin"
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
menu() {
if [ -x /usr/local/bin/bee-tui ]; then

View File

@@ -0,0 +1,4 @@
[Journal]
ForwardToConsole=yes
TTYPath=/dev/ttyS0
MaxLevelConsole=debug

View File

@@ -5,9 +5,9 @@ Before=bee-web.service
[Service]
Type=oneshot
ExecStart=/bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
StandardOutput=append:/appdata/bee/export/bee-audit.log
StandardError=append:/appdata/bee/export/bee-audit.log
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
StandardOutput=journal
StandardError=journal
RemainAfterExit=yes
[Install]

View File

@@ -0,0 +1,16 @@
[Unit]
Description=Bee: mirror system journal to %I
After=systemd-journald.service
Requires=systemd-journald.service
ConditionPathExists=/dev/%I
[Service]
Type=simple
ExecStart=/bin/sh -c 'exec journalctl -f -n 200 -o short-monotonic > /dev/%I'
Restart=always
RestartSec=1
StandardOutput=null
StandardError=journal
[Install]
WantedBy=multi-user.target

View File

@@ -5,9 +5,9 @@ Before=network-online.target bee-audit.service
[Service]
Type=oneshot
ExecStart=/usr/local/bin/bee-network.sh
StandardOutput=append:/appdata/bee/export/bee-network.log
StandardError=append:/appdata/bee/export/bee-network.log
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-network.log /usr/local/bin/bee-network.sh
StandardOutput=journal
StandardError=journal
RemainAfterExit=yes
[Install]

View File

@@ -5,9 +5,9 @@ Before=bee-audit.service
[Service]
Type=oneshot
ExecStart=/usr/local/bin/bee-nvidia-load
StandardOutput=append:/appdata/bee/export/bee-nvidia.log
StandardError=append:/appdata/bee/export/bee-nvidia.log
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-nvidia.log /usr/local/bin/bee-nvidia-load
StandardOutput=journal
StandardError=journal
RemainAfterExit=yes
[Install]

View File

@@ -5,9 +5,9 @@ Before=bee-audit.service
[Service]
Type=oneshot
ExecStart=/bin/sh -c '/usr/local/bin/bee preflight --output file:/appdata/bee/export/runtime-health.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-preflight] WARN: preflight exited with rc=$rc"; fi; exit 0'
StandardOutput=append:/appdata/bee/export/runtime-health.log
StandardError=append:/appdata/bee/export/runtime-health.log
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/runtime-health.log /bin/sh -c '/usr/local/bin/bee preflight --output file:/appdata/bee/export/runtime-health.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-preflight] WARN: preflight exited with rc=$rc"; fi; exit 0'
StandardOutput=journal
StandardError=journal
RemainAfterExit=yes
[Install]

View File

@@ -5,9 +5,9 @@ Before=ssh.service
[Service]
Type=oneshot
ExecStart=/usr/local/bin/bee-sshsetup
StandardOutput=append:/appdata/bee/export/bee-sshsetup.log
StandardError=append:/appdata/bee/export/bee-sshsetup.log
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-sshsetup.log /usr/local/bin/bee-sshsetup
StandardOutput=journal
StandardError=journal
RemainAfterExit=yes
[Install]

View File

@@ -5,11 +5,11 @@ Wants=bee-audit.service
[Service]
Type=simple
ExecStart=/usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
Restart=always
RestartSec=2
StandardOutput=append:/appdata/bee/export/bee-web.log
StandardError=append:/appdata/bee/export/bee-web.log
StandardOutput=journal
StandardError=journal
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,29 @@
#!/bin/bash
# bee-log-run — run a command, append its output to a file, and keep stdout/stderr
# connected to systemd so journald and the serial console also receive the logs.
set -o pipefail
log_file="$1"
shift
if [ -z "$log_file" ] || [ "$#" -eq 0 ]; then
echo "usage: $0 <log-file> <command> [args...]" >&2
exit 2
fi
mkdir -p "$(dirname "$log_file")"
serial_sink() {
local tty="$1"
if [ -w "$tty" ]; then
cat > "$tty"
else
cat > /dev/null
fi
}
"$@" 2>&1 | tee -a "$log_file" \
>(serial_sink /dev/ttyS0) \
>(serial_sink /dev/ttyS1)
exit "${PIPESTATUS[0]}"

View File

@@ -22,24 +22,61 @@ fi
log "module dir: $NVIDIA_KO_DIR"
ls "$NVIDIA_KO_DIR"/*.ko 2>/dev/null | sed 's/^/ /' || true
# Some kernels expose backlight helper symbols only after loading `video`.
modprobe video >/dev/null 2>&1 && log "loaded helper module: video" || log "helper module unavailable: video"
cmdline_param() {
key="$1"
for token in $(cat /proc/cmdline 2>/dev/null); do
case "$token" in
"$key"=*)
echo "${token#*=}"
return 0
;;
esac
done
return 1
}
# Load modules via insmod (direct load — no depmod needed)
for mod in nvidia nvidia-modeset nvidia-uvm; do
nvidia_mode="$(cmdline_param bee.nvidia.mode || true)"
if [ -z "$nvidia_mode" ]; then
nvidia_mode="normal"
fi
log "boot mode: $nvidia_mode"
load_module() {
mod="$1"
shift
ko="$NVIDIA_KO_DIR/${mod}.ko"
[ -f "$ko" ] || ko="$NVIDIA_KO_DIR/${mod//-/_}.ko"
if [ -f "$ko" ]; then
if insmod "$ko"; then
log "loaded: $mod"
else
log "WARN: failed to load: $mod"
dmesg | tail -n 5 | sed 's/^/ dmesg: /' || true
fi
else
if [ ! -f "$ko" ]; then
log "WARN: not found: $ko"
return 1
fi
done
if insmod "$ko" "$@"; then
log "loaded: $mod $*"
return 0
fi
log "WARN: failed to load: $mod"
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
return 1
}
case "$nvidia_mode" in
normal|full)
if ! load_module nvidia; then
exit 1
fi
load_module nvidia-modeset || true
load_module nvidia-uvm || true
;;
gsp-off|safe|*)
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
# conservative path for platforms where full boot-time GSP init is unstable.
if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
exit 1
fi
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
;;
esac
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
@@ -61,8 +98,6 @@ if [ -n "$uvm_major" ]; then
&& log "created /dev/nvidia-uvm (major $uvm_major)" \
|| log "WARN: /dev/nvidia-uvm already exists"
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true
else
log "WARN: nvidia-uvm not in /proc/devices"
fi
log "done"