iso: improve burn-in, export, and live boot

This commit is contained in:
Mikhail Chusavitin
2026-03-26 18:56:19 +03:00
parent 67a215c66f
commit fc5c2019aa
23 changed files with 1706 additions and 168 deletions

View File

@@ -231,8 +231,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) { func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportLatestAudit(target) path, err := a.ExportLatestAudit(target)
body := "Audit exported." body := "Audit export failed."
if path != "" { if err == nil {
body = "Audit exported."
}
if err == nil && path != "" {
body = "Audit exported to " + path body = "Audit exported to " + path
} }
return ActionResult{Title: "Export audit", Body: body}, err return ActionResult{Title: "Export audit", Body: body}, err
@@ -249,8 +252,11 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) { func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportSupportBundle(target) path, err := a.ExportSupportBundle(target)
body := "Support bundle exported. USB target unmounted and safe to remove." body := "Support bundle export failed."
if path != "" { if err == nil {
body = "Support bundle exported. USB target unmounted and safe to remove."
}
if err == nil && path != "" {
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove." body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
} }
return ActionResult{Title: "Export support bundle", Body: body}, err return ActionResult{Title: "Export support bundle", Body: body}, err

View File

@@ -470,6 +470,41 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
} }
} }
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
oldExportDir := DefaultExportDir
DefaultExportDir = tmp
t.Cleanup(func() { DefaultExportDir = oldExportDir })
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
t.Fatalf("write bee-audit.json: %v", err)
}
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
t.Fatalf("write bee-audit.log: %v", err)
}
a := &App{
exports: fakeExports{
exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
},
},
}
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
if err == nil {
t.Fatal("expected export error")
}
if contains(result.Body, "exported to") {
t.Fatalf("body should not claim success:\n%s", result.Body)
}
if result.Body != "Support bundle export failed." {
t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
}
}
func TestRunNvidiaAcceptancePackResult(t *testing.T) { func TestRunNvidiaAcceptancePackResult(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -11,8 +11,48 @@ import (
var exportExecCommand = exec.Command var exportExecCommand = exec.Command
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
msg := strings.TrimSpace(raw)
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
}
if msg == "" {
return err
}
return fmt.Errorf("%s: %w", msg, err)
}
func removableTargetReadOnly(fields map[string]string) bool {
if fields["RO"] == "1" {
return true
}
switch strings.ToLower(strings.TrimSpace(fields["FSTYPE"])) {
case "iso9660", "squashfs":
return true
default:
return false
}
}
func ensureWritableMountpoint(mountpoint string) error {
probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
if err != nil {
return fmt.Errorf("target filesystem is not writable: %w", err)
}
name := probe.Name()
if closeErr := probe.Close(); closeErr != nil {
_ = os.Remove(name)
return closeErr
}
if err := os.Remove(name); err != nil {
return err
}
return nil
}
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) { func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output() raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -36,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
} }
} }
} }
if !removable || fields["FSTYPE"] == "" { if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
continue continue
} }
@@ -72,7 +112,7 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
} }
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil { if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
_ = os.Remove(mountpoint) _ = os.Remove(mountpoint)
return string(raw), err return "", formatMountTargetError(target, string(raw), err)
} }
mountedHere = true mountedHere = true
mounted = true mounted = true
@@ -95,6 +135,10 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
} }
}() }()
if err := ensureWritableMountpoint(mountpoint); err != nil {
return "", err
}
filename := filepath.Base(src) filename := filepath.Base(src)
dst = filepath.Join(mountpoint, filename) dst = filepath.Join(mountpoint, filename)
data, err := os.ReadFile(src) data, err := os.ReadFile(src)

View File

@@ -4,12 +4,11 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"strings"
"testing" "testing"
) )
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) { func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Parallel()
tmp := t.TempDir() tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz") src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt") mountpoint := filepath.Join(tmp, "mnt")
@@ -54,3 +53,60 @@ func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls) t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
} }
} }
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt")
if err := os.MkdirAll(mountpoint, 0755); err != nil {
t.Fatalf("mkdir mountpoint: %v", err)
}
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
t.Fatalf("write src: %v", err)
}
if err := os.Chmod(mountpoint, 0555); err != nil {
t.Fatalf("chmod mountpoint: %v", err)
}
oldExec := exportExecCommand
exportExecCommand = func(name string, args ...string) *exec.Cmd {
return exec.Command("sh", "-c", "exit 0")
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
_, err := s.ExportFileToTarget(src, RemovableTarget{
Device: "/dev/sdb1",
Mountpoint: mountpoint,
})
if err == nil {
t.Fatal("expected error for non-writable mountpoint")
}
if !strings.Contains(err.Error(), "target filesystem is not writable") {
t.Fatalf("err=%q want writable message", err)
}
}
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
oldExec := exportExecCommand
lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
exportExecCommand = func(name string, args ...string) *exec.Cmd {
cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
return cmd
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
targets, err := s.ListRemovableTargets()
if err != nil {
t.Fatalf("ListRemovableTargets error: %v", err)
}
if len(targets) != 1 {
t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
}
if got := targets[0].Device; got != "/dev/sdb1" {
t.Fatalf("device=%q want /dev/sdb1", got)
}
}

View File

@@ -151,8 +151,10 @@ func (m model) confirmCancelTarget() screen {
switch m.pendingAction { switch m.pendingAction {
case actionExportBundle: case actionExportBundle:
return screenExportTargets return screenExportTargets
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress: case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
return screenHealthCheck return screenHealthCheck
case actionRunFanStress:
return screenBurnInTests
default: default:
return screenMain return screenMain
} }
@@ -165,9 +167,9 @@ func hcFanStressOpts(hcMode int, application interface {
// Phase durations per mode: [baseline, load1, pause, load2] // Phase durations per mode: [baseline, load1, pause, load2]
type durations struct{ baseline, load1, pause, load2 int } type durations struct{ baseline, load1, pause, load2 int }
modes := [3]durations{ modes := [3]durations{
{30, 120, 30, 120}, // Quick: ~5 min total {30, 120, 30, 120}, // Quick: ~5 min total
{60, 300, 60, 300}, // Standard: ~12 min total {60, 300, 60, 300}, // Standard: ~12 min total
{60, 600, 120, 600}, // Express: ~24 min total {60, 600, 120, 600}, // Express: ~24 min total
} }
if hcMode < 0 || hcMode >= len(modes) { if hcMode < 0 || hcMode >= len(modes) {
hcMode = 0 hcMode = 0

View File

@@ -0,0 +1,117 @@
package tui
import (
"fmt"
"strings"
tea "github.com/charmbracelet/bubbletea"
)
const (
burnCurGPUStress = 0
burnCurModeQuick = 1
burnCurModeStd = 2
burnCurModeExpr = 3
burnCurRun = 4
burnCurTotal = 5
)
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
m.screen = screenBurnInTests
m.cursor = 0
if !m.burnInitialized {
m.burnMode = 0
m.burnCursor = 0
m.burnInitialized = true
}
return m, nil
}
func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
switch msg.String() {
case "up", "k":
if m.burnCursor > 0 {
m.burnCursor--
}
case "down", "j":
if m.burnCursor < burnCurTotal-1 {
m.burnCursor++
}
case " ":
switch m.burnCursor {
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
m.burnMode = m.burnCursor - burnCurModeQuick
}
case "enter":
switch m.burnCursor {
case burnCurGPUStress, burnCurRun:
return m.burnRunSelected()
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
m.burnMode = m.burnCursor - burnCurModeQuick
}
case "f", "F", "r", "R":
return m.burnRunSelected()
case "1":
m.burnMode = 0
case "2":
m.burnMode = 1
case "3":
m.burnMode = 2
case "esc":
m.screen = screenMain
m.cursor = 1
case "q", "ctrl+c":
return m, tea.Quit
}
return m, nil
}
func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
return m.hcRunFanStress()
}
func renderBurnInTests(m model) string {
var b strings.Builder
fmt.Fprintln(&b, "BURN-IN TESTS")
fmt.Fprintln(&b)
fmt.Fprintln(&b, " Stress tests:")
fmt.Fprintln(&b)
pfx := " "
if m.burnCursor == burnCurGPUStress {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
fmt.Fprintln(&b)
fmt.Fprintln(&b, " Mode:")
modes := []struct{ label, key string }{
{"Quick", "1"},
{"Standard", "2"},
{"Express", "3"},
}
for i, mode := range modes {
pfx := " "
if m.burnCursor == burnCurModeQuick+i {
pfx = "> "
}
radio := "( )"
if m.burnMode == i {
radio = "(*)"
}
fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key)
}
fmt.Fprintln(&b)
pfx = " "
if m.burnCursor == burnCurRun {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
fmt.Fprintln(&b)
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
return b.String()
}

View File

@@ -4,7 +4,12 @@ import tea "github.com/charmbracelet/bubbletea"
func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) { func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
if len(m.targets) == 0 { if len(m.targets) == 0 {
return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain) return m, resultCmd(
"Export support bundle",
"No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list.",
nil,
screenMain,
)
} }
target := m.targets[m.cursor] target := m.targets[m.cursor]
m.selectedTarget = &target m.selectedTarget = &target

View File

@@ -21,17 +21,16 @@ const (
// Cursor positions in Health Check screen. // Cursor positions in Health Check screen.
const ( const (
hcCurGPU = 0 hcCurGPU = 0
hcCurMemory = 1 hcCurMemory = 1
hcCurStorage = 2 hcCurStorage = 2
hcCurCPU = 3 hcCurCPU = 3
hcCurSelectAll = 4 hcCurSelectAll = 4
hcCurModeQuick = 5 hcCurModeQuick = 5
hcCurModeStd = 6 hcCurModeStd = 6
hcCurModeExpr = 7 hcCurModeExpr = 7
hcCurRunAll = 8 hcCurRunAll = 8
hcCurFanStress = 9 hcCurTotal = 9
hcCurTotal = 10
) )
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds. // hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
@@ -86,8 +85,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
m.hcMode = m.hcCursor - hcCurModeQuick m.hcMode = m.hcCursor - hcCurModeQuick
case hcCurRunAll: case hcCurRunAll:
return m.hcRunAll() return m.hcRunAll()
case hcCurFanStress:
return m.hcRunFanStress()
} }
case "g", "G": case "g", "G":
return m.hcRunSingle(hcGPU) return m.hcRunSingle(hcGPU)
@@ -99,8 +96,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m.hcRunSingle(hcCPU) return m.hcRunSingle(hcCPU)
case "r", "R": case "r", "R":
return m.hcRunAll() return m.hcRunAll()
case "f", "F":
return m.hcRunFanStress()
case "a", "A": case "a", "A":
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3] allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
for i := range m.hcSel { for i := range m.hcSel {
@@ -160,7 +155,7 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart. // startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) { func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
opts := hcFanStressOpts(m.hcMode, m.app) opts := hcFanStressOpts(m.burnMode, m.app)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
m.gpuStressCancel = cancel m.gpuStressCancel = cancel
@@ -197,7 +192,8 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
m.gpuStressCancel = nil m.gpuStressCancel = nil
} }
m.gpuStressAborted = true m.gpuStressAborted = true
m.screen = screenHealthCheck m.screen = screenBurnInTests
m.burnCursor = burnCurGPUStress
m.cursor = 0 m.cursor = 0
case "ctrl+c": case "ctrl+c":
return m, tea.Quit return m, tea.Quit
@@ -380,16 +376,8 @@ func renderHealthCheck(m model) string {
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx) fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
} }
{
pfx := " "
if m.hcCursor == hcCurFanStress {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
}
fmt.Fprintln(&b) fmt.Fprintln(&b)
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────") fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] gpu stress [Esc] back") fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
return b.String() return b.String()
} }

View File

@@ -8,7 +8,9 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
switch m.cursor { switch m.cursor {
case 0: // Health Check case 0: // Health Check
return m.enterHealthCheck() return m.enterHealthCheck()
case 1: // Export support bundle case 1: // Burn-in tests
return m.enterBurnInTests()
case 2: // Export support bundle
m.pendingAction = actionExportBundle m.pendingAction = actionExportBundle
m.busy = true m.busy = true
m.busyTitle = "Export support bundle" m.busyTitle = "Export support bundle"
@@ -16,11 +18,11 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
targets, err := m.app.ListRemovableTargets() targets, err := m.app.ListRemovableTargets()
return exportTargetsMsg{targets: targets, err: err} return exportTargetsMsg{targets: targets, err: err}
} }
case 2: // Settings case 3: // Settings
m.screen = screenSettings m.screen = screenSettings
m.cursor = 0 m.cursor = 0
return m, nil return m, nil
case 3: // Exit case 4: // Exit
return m, tea.Quit return m, tea.Quit
} }
return m, nil return m, nil

View File

@@ -54,9 +54,10 @@ func TestUpdateMainMenuEnterActions(t *testing.T) {
wantCmd bool wantCmd bool
}{ }{
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true}, {name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true},
{name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true}, {name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests, wantCmd: true},
{name: "settings", cursor: 2, wantScreen: screenSettings, wantCmd: true}, {name: "export", cursor: 2, wantScreen: screenMain, wantBusy: true, wantCmd: true},
{name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true}, {name: "settings", cursor: 3, wantScreen: screenSettings, wantCmd: true},
{name: "exit", cursor: 4, wantScreen: screenMain, wantCmd: true},
} }
for _, test := range tests { for _, test := range tests {
@@ -115,7 +116,8 @@ func TestMainMenuSimpleTransitions(t *testing.T) {
wantScreen screen wantScreen screen
}{ }{
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck}, {name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
{name: "settings", cursor: 2, wantScreen: screenSettings}, {name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests},
{name: "settings", cursor: 3, wantScreen: screenSettings},
} }
for _, test := range tests { for _, test := range tests {
@@ -146,7 +148,7 @@ func TestMainMenuExportSetsBusy(t *testing.T) {
t.Parallel() t.Parallel()
m := newTestModel() m := newTestModel()
m.cursor = 1 // Export support bundle m.cursor = 2 // Export support bundle
next, cmd := m.handleMainMenu() next, cmd := m.handleMainMenu()
got := next.(model) got := next.(model)
@@ -163,12 +165,13 @@ func TestMainViewRendersTwoColumns(t *testing.T) {
t.Parallel() t.Parallel()
m := newTestModel() m := newTestModel()
m.cursor = 1 m.cursor = 2
view := m.View() view := m.View()
for _, want := range []string{ for _, want := range []string{
"bee", "bee",
"Health Check", "Health Check",
"Burn-in tests",
"> Export support bundle", "> Export support bundle",
"Settings", "Settings",
"Exit", "Exit",
@@ -400,6 +403,11 @@ func TestConfirmCancelTarget(t *testing.T) {
t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck) t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck)
} }
m.pendingAction = actionRunFanStress
if got := m.confirmCancelTarget(); got != screenBurnInTests {
t.Fatalf("fan stress cancel target=%q want %q", got, screenBurnInTests)
}
m.pendingAction = actionNone m.pendingAction = actionNone
if got := m.confirmCancelTarget(); got != screenMain { if got := m.confirmCancelTarget(); got != screenMain {
t.Fatalf("default cancel target=%q want %q", got, screenMain) t.Fatalf("default cancel target=%q want %q", got, screenMain)
@@ -439,6 +447,68 @@ func TestViewBusyStateUsesBusyTitle(t *testing.T) {
} }
} }
func TestBurnInTestsEscReturnsToMain(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenBurnInTests
m.burnCursor = 3
next, _ := m.updateBurnInTests(tea.KeyMsg{Type: tea.KeyEsc})
got := next.(model)
if got.screen != screenMain {
t.Fatalf("screen=%q want %q", got.screen, screenMain)
}
if got.cursor != 1 {
t.Fatalf("cursor=%d want 1", got.cursor)
}
}
func TestBurnInTestsRunOpensConfirm(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenBurnInTests
m.burnInitialized = true
m.burnMode = 2
next, _ := m.burnRunSelected()
got := next.(model)
if got.screen != screenConfirm {
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
}
if got.pendingAction != actionRunFanStress {
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunFanStress)
}
if got.cursor != 0 {
t.Fatalf("cursor=%d want 0", got.cursor)
}
}
func TestViewBurnInTestsRendersGPUStressEntry(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenBurnInTests
view := m.View()
for _, want := range []string{
"BURN-IN TESTS",
"GPU PLATFORM STRESS TEST",
"Quick",
"Standard",
"Express",
"[ RUN SELECTED [R] ]",
} {
if !strings.Contains(view, want) {
t.Fatalf("view missing %q\nview:\n%s", want, view)
}
}
}
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) { func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
t.Parallel() t.Parallel()
@@ -528,7 +598,7 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
for _, want := range []string{ for _, want := range []string{
"Export support bundle", "Export support bundle",
"Select removable filesystem", "Select writable removable filesystem (read-only/boot media hidden)",
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee", "> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
} { } {
if !strings.Contains(view, want) { if !strings.Contains(view, want) {
@@ -537,6 +607,32 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
} }
} }
func TestExportTargetsMsgEmptyShowsHiddenBootMediaHint(t *testing.T) {
t.Parallel()
m := newTestModel()
m.busy = true
m.busyTitle = "Export support bundle"
next, _ := m.Update(exportTargetsMsg{})
got := next.(model)
if got.screen != screenOutput {
t.Fatalf("screen=%q want %q", got.screen, screenOutput)
}
if got.title != "Export support bundle" {
t.Fatalf("title=%q want %q", got.title, "Export support bundle")
}
for _, want := range []string{
"No writable removable filesystems found.",
"Read-only or boot media are hidden from this list.",
} {
if !strings.Contains(got.body, want) {
t.Fatalf("body missing %q\nbody:\n%s", want, got.body)
}
}
}
func TestViewStaticFormRendersFields(t *testing.T) { func TestViewStaticFormRendersFields(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -16,6 +16,7 @@ type screen string
const ( const (
screenMain screen = "main" screenMain screen = "main"
screenHealthCheck screen = "health_check" screenHealthCheck screen = "health_check"
screenBurnInTests screen = "burn_in_tests"
screenSettings screen = "settings" screenSettings screen = "settings"
screenNetwork screen = "network" screenNetwork screen = "network"
screenInterfacePick screen = "interface_pick" screenInterfacePick screen = "interface_pick"
@@ -41,8 +42,8 @@ const (
actionRunMemorySAT actionKind = "run_memory_sat" actionRunMemorySAT actionKind = "run_memory_sat"
actionRunStorageSAT actionKind = "run_storage_sat" actionRunStorageSAT actionKind = "run_storage_sat"
actionRunCPUSAT actionKind = "run_cpu_sat" actionRunCPUSAT actionKind = "run_cpu_sat"
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat" actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
actionRunFanStress actionKind = "run_fan_stress" actionRunFanStress actionKind = "run_fan_stress"
) )
type model struct { type model struct {
@@ -84,6 +85,11 @@ type model struct {
hcCursor int hcCursor int
hcInitialized bool hcInitialized bool
// Burn-in tests screen
burnMode int
burnCursor int
burnInitialized bool
// NVIDIA SAT setup // NVIDIA SAT setup
nvidiaGPUs []platform.NvidiaGPU nvidiaGPUs []platform.NvidiaGPU
nvidiaGPUSel []bool nvidiaGPUSel []bool
@@ -97,9 +103,9 @@ type model struct {
// GPU Platform Stress Test running // GPU Platform Stress Test running
gpuStressCancel func() gpuStressCancel func()
gpuStressAborted bool gpuStressAborted bool
gpuLiveRows []platform.GPUMetricRow gpuLiveRows []platform.GPUMetricRow
gpuLiveIndices []int gpuLiveIndices []int
gpuLiveStart time.Time gpuLiveStart time.Time
// SAT verbose progress (CPU / Memory / Storage / AMD GPU) // SAT verbose progress (CPU / Memory / Storage / AMD GPU)
progressLines []string progressLines []string
@@ -132,6 +138,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
screen: screenMain, screen: screenMain,
mainMenu: []string{ mainMenu: []string{
"Health Check", "Health Check",
"Burn-in tests",
"Export support bundle", "Export support bundle",
"Settings", "Settings",
"Exit", "Exit",
@@ -201,7 +208,7 @@ func (m model) confirmBody() (string, string) {
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"} modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" + return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
"Monitors fans, temps, power — detects throttling.\n" + "Monitors fans, temps, power — detects throttling.\n" +
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed." "Mode: " + modes[m.burnMode] + "\n\nAll NVIDIA GPUs will be stressed."
default: default:
return "Confirm", "Proceed?" return "Confirm", "Proceed?"
} }

View File

@@ -101,6 +101,13 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.screen = screenOutput m.screen = screenOutput
return m, m.refreshSnapshotCmd() return m, m.refreshSnapshotCmd()
} }
if len(msg.targets) == 0 {
m.title = "Export support bundle"
m.body = "No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list."
m.prevScreen = screenMain
m.screen = screenOutput
return m, m.refreshSnapshotCmd()
}
m.targets = msg.targets m.targets = msg.targets
m.screen = screenExportTargets m.screen = screenExportTargets
m.cursor = 0 m.cursor = 0
@@ -117,7 +124,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.gpuStressCancel() m.gpuStressCancel()
m.gpuStressCancel = nil m.gpuStressCancel = nil
} }
m.prevScreen = screenHealthCheck m.prevScreen = screenBurnInTests
m.screen = screenOutput m.screen = screenOutput
m.title = msg.title m.title = msg.title
if msg.err != nil { if msg.err != nil {
@@ -179,6 +186,8 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m.updateMain(msg) return m.updateMain(msg)
case screenHealthCheck: case screenHealthCheck:
return m.updateHealthCheck(msg) return m.updateHealthCheck(msg)
case screenBurnInTests:
return m.updateBurnInTests(msg)
case screenSettings: case screenSettings:
return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu) return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu)
case screenNetwork: case screenNetwork:

View File

@@ -57,6 +57,8 @@ func (m model) View() string {
body = renderTwoColumnMain(m) body = renderTwoColumnMain(m)
case screenHealthCheck: case screenHealthCheck:
body = renderHealthCheck(m) body = renderHealthCheck(m)
case screenBurnInTests:
body = renderBurnInTests(m)
case screenSettings: case screenSettings:
body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor) body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
case screenNetwork: case screenNetwork:
@@ -66,7 +68,12 @@ func (m model) View() string {
case screenServiceAction: case screenServiceAction:
body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor) body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
case screenExportTargets: case screenExportTargets:
body = renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor) body = renderMenu(
"Export support bundle",
"Select writable removable filesystem (read-only/boot media hidden)",
renderTargetItems(m.targets),
m.cursor,
)
case screenInterfacePick: case screenInterfacePick:
body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor) body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
case screenStaticForm: case screenStaticForm:

View File

@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.
## Boot sequence (single ISO) ## Boot sequence (single ISO)
The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
`systemd` boot order: `systemd` boot order:
``` ```
@@ -25,6 +27,7 @@ local-fs.target
``` ```
**Critical invariants:** **Critical invariants:**
- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`. - OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking. - `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`. - `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
@@ -71,24 +74,39 @@ build-in-container.sh [--authorized-keys /path/to/keys]
d. build kernel modules against Debian headers d. build kernel modules against Debian headers
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
f. cache in `dist/nvidia-<version>-<kver>/` f. cache in `dist/nvidia-<version>-<kver>/`
7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/` 7. `build-cublas.sh`:
8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi` a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/` b. verify packages against repo `Packages.gz`
10. write staged `/etc/bee-release` (versions + git commit) c. extract headers for `bee-gpu-stress` build
11. patch staged `motd` with build metadata d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
12. copy `iso/builder/` into a temporary live-build workdir under `dist/` 8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
13. sync staged overlay into workdir `config/includes.chroot/` 9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
14. run `lb config && lb build` inside the privileged builder container 10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
12. write staged `/etc/bee-release` (versions + git commit)
13. patch staged `motd` with build metadata
14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
15. sync staged overlay into workdir `config/includes.chroot/`
16. run `lb config && lb build` inside the privileged builder container
``` ```
Build host notes:
- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
**Critical invariants:** **Critical invariants:**
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places: - `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build 1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
2. `auto/config``linux-image-${DEBIAN_KERNEL_ABI}` in the ISO 2. `auto/config``linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`. - NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay. - The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean. - The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly. - Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
## Post-boot smoke test ## Post-boot smoke test
@@ -131,10 +149,15 @@ Current validation state:
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal. Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
Acceptance flows: Acceptance flows:
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress` - `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
- `bee sat memory``memtester` archive - `bee sat memory``memtester` archive
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported - `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`) - SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
- Ada / Hopper: add `fp8`
- Blackwell+: add `fp4`
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
- Runtime overrides: - Runtime overrides:
- `BEE_GPU_STRESS_SECONDS` - `BEE_GPU_STRESS_SECONDS`
- `BEE_GPU_STRESS_SIZE_MB` - `BEE_GPU_STRESS_SIZE_MB`

View File

@@ -21,7 +21,8 @@ Fills gaps where Redfish/logpile is blind:
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID - Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
- Machine-readable health summary derived from collector verdicts - Machine-readable health summary derived from collector verdicts
- Operator-triggered acceptance tests for NVIDIA, memory, and storage - Operator-triggered acceptance tests for NVIDIA, memory, and storage
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress` - NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
- Automatic boot audit with operator-facing local console and SSH access - Automatic boot audit with operator-facing local console and SSH access
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi` - NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
- SSH access (OpenSSH) always available for inspection and debugging - SSH access (OpenSSH) always available for inspection and debugging
@@ -69,6 +70,7 @@ Fills gaps where Redfish/logpile is blind:
| SSH | OpenSSH server | | SSH | OpenSSH server |
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers | | NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` | | NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
| Builder | Debian 12 host/VM or Debian 12 container image | | Builder | Debian 12 host/VM or Debian 12 container image |
## Operator UX ## Operator UX
@@ -78,6 +80,7 @@ Fills gaps where Redfish/logpile is blind:
- The TUI itself executes privileged actions as `root` via `sudo -n` - The TUI itself executes privileged actions as `root` via `sudo -n`
- SSH remains available independently of the local console path - SSH remains available independently of the local console path
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging - VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
## Runtime split ## Runtime split
@@ -85,6 +88,7 @@ Fills gaps where Redfish/logpile is blind:
- Live-ISO-only responsibilities stay in `iso/` integration code - Live-ISO-only responsibilities stay in `iso/` integration code
- Live ISO launches the Go CLI with `--runtime livecd` - Live ISO launches the Go CLI with `--runtime livecd`
- Local/manual runs use `--runtime auto` or `--runtime local` - Local/manual runs use `--runtime auto` or `--runtime local`
- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup
## Key paths ## Key paths

58
iso/README.md Normal file
View File

@@ -0,0 +1,58 @@
# ISO Build
`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
## Requirements
- Docker Desktop or another Docker-compatible container runtime
- Privileged containers enabled
- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
## Build On macOS
From the repository root:
```sh
sh iso/builder/build-in-container.sh
```
The script defaults to `linux/amd64` builder containers, so it works on:
- Intel Mac
- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
You do not need to pass `--platform` manually for normal ISO builds.
## Useful Options
Build with explicit SSH keys baked into the ISO:
```sh
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
```
Rebuild the builder image:
```sh
sh iso/builder/build-in-container.sh --rebuild-image
```
Use a custom cache directory:
```sh
sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
```
## Notes
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
- Override the container platform only if you know why:
```sh
BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
```
- The shipped ISO is still `amd64`.
- Output ISO artifacts are written under `dist/`.

View File

@@ -4,5 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
NCCL_VERSION=2.28.9-1 NCCL_VERSION=2.28.9-1
NCCL_CUDA_VERSION=13.0 NCCL_CUDA_VERSION=13.0
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
CUBLAS_VERSION=13.0.2.14-1
CUDA_USERSPACE_VERSION=13.0.96-1
GO_VERSION=1.24.0 GO_VERSION=1.24.0
AUDIT_VERSION=1.0.0 AUDIT_VERSION=1.0.0

View File

@@ -32,6 +32,6 @@ lb config noauto \
--memtest none \ --memtest none \
--iso-volume "EASY-BEE" \ --iso-volume "EASY-BEE" \
--iso-application "EASY-BEE" \ --iso-application "EASY-BEE" \
--bootappend-live "boot=live components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --bootappend-live "boot=live toram components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
--apt-recommends false \ --apt-recommends false \
"${@}" "${@}"

File diff suppressed because it is too large Load Diff

170
iso/builder/build-cublas.sh Normal file
View File

@@ -0,0 +1,170 @@
#!/bin/sh
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
#
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
# verifies them against Packages.gz, and extracts the small subset we need:
# - headers for compiling bee-gpu-stress against cuBLASLt
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
set -e
CUBLAS_VERSION="$1"
CUDA_USERSPACE_VERSION="$2"
CUDA_SERIES="$3"
DIST_DIR="$4"
[ -n "$CUBLAS_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
[ -n "$CUDA_SERIES" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="
if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
&& [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then
echo "=== cuBLAS cached, skipping download ==="
echo "cache: $CACHE_DIR"
exit 0
fi
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"
echo "=== downloading Packages.gz ==="
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
lookup_pkg() {
pkg="$1"
ver="$2"
gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
/^Package: / { cur_pkg=$2 }
/^Version: / { cur_ver=$2 }
/^Filename: / { cur_file=$2 }
/^SHA256: / { cur_sha=$2 }
/^$/ {
if (cur_pkg == pkg && cur_ver == ver) {
print cur_file " " cur_sha
exit
}
cur_pkg=""; cur_ver=""; cur_file=""; cur_sha=""
}
END {
if (cur_pkg == pkg && cur_ver == ver) {
print cur_file " " cur_sha
}
}'
}
download_verified_pkg() {
pkg="$1"
ver="$2"
meta="$(lookup_pkg "$pkg" "$ver")"
[ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}"; exit 1; }
repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
[ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}"; exit 1; }
[ -n "$repo_sha" ] || { echo "ERROR: package sha missing for ${pkg}"; exit 1; }
out="${DOWNLOAD_CACHE_DIR}/$(basename "$repo_file")"
if [ -f "$out" ]; then
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
if [ "$actual_sha" = "$repo_sha" ]; then
echo "=== using cached $(basename "$repo_file") ==="
printf '%s\n' "$out"
return 0
fi
echo "=== removing stale $(basename "$repo_file") (sha256 mismatch) ==="
rm -f "$out"
fi
echo "=== downloading $(basename "$repo_file") ==="
wget --show-progress -O "$out" "${REPO_BASE}/$(basename "$repo_file")"
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
if [ "$actual_sha" != "$repo_sha" ]; then
echo "ERROR: sha256 mismatch for $(basename "$repo_file")"
echo " expected: $repo_sha"
echo " actual: $actual_sha"
rm -f "$out"
exit 1
fi
echo "sha256 OK: $(basename "$repo_file")"
printf '%s\n' "$out"
}
extract_deb() {
deb="$1"
dst="$2"
mkdir -p "$dst"
(
cd "$dst"
ar x "$deb"
data_tar=$(ls data.tar.* 2>/dev/null | head -1)
[ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; }
tar xf "$data_tar"
)
}
copy_headers() {
from="$1"
if [ -d "${from}/usr/include" ]; then
cp -a "${from}/usr/include/." "${CACHE_DIR}/include/"
fi
}
copy_libs() {
from="$1"
find "$from" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
\( -type f -o -type l \) -exec cp -a {} "${CACHE_DIR}/lib/" \;
}
make_links() {
base="$1"
versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1)
[ -n "$versioned" ] || return 0
soname=$(printf '%s\n' "$versioned" | sed -E "s#.*/(${base}\.so\.[0-9]+).*#\\1#")
target=$(basename "$versioned")
ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true
}
TMP_DIR=$(mktemp -d)
trap 'rm -rf "$TMP_DIR"' EXIT INT TERM
CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"
copy_headers "${TMP_DIR}/cublas-dev"
copy_headers "${TMP_DIR}/cudart-dev"
copy_libs "${TMP_DIR}/cublas-rt"
copy_libs "${TMP_DIR}/cudart-rt"
make_links "libcublas"
make_links "libcublasLt"
make_links "libcudart"
[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted"; exit 1; }
[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; }
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; }
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; }
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; }
echo "=== cuBLAS extraction complete ==="
echo "cache: $CACHE_DIR"
echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
echo "libs: $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"

View File

@@ -7,6 +7,7 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
BUILDER_DIR="${REPO_ROOT}/iso/builder" BUILDER_DIR="${REPO_ROOT}/iso/builder"
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}" CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}" IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}" CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
AUTH_KEYS="" AUTH_KEYS=""
REBUILD_IMAGE=0 REBUILD_IMAGE=0
@@ -40,6 +41,13 @@ if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
exit 1 exit 1
fi fi
PLATFORM_OS="${BUILDER_PLATFORM%/*}"
PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
exit 1
fi
if [ -n "$AUTH_KEYS" ]; then if [ -n "$AUTH_KEYS" ]; then
[ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; } [ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")" AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
@@ -56,17 +64,35 @@ mkdir -p \
IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}" IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then image_matches_platform() {
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)"
[ "$actual_platform" = "${BUILDER_PLATFORM}" ]
}
NEED_BUILD_IMAGE=0
if [ "$REBUILD_IMAGE" = "1" ]; then
NEED_BUILD_IMAGE=1
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
NEED_BUILD_IMAGE=1
elif ! image_matches_platform; then
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
NEED_BUILD_IMAGE=1
fi
if [ "$NEED_BUILD_IMAGE" = "1" ]; then
"$CONTAINER_TOOL" build \ "$CONTAINER_TOOL" build \
--platform "${BUILDER_PLATFORM}" \
--build-arg GO_VERSION="${GO_VERSION}" \ --build-arg GO_VERSION="${GO_VERSION}" \
-t "${IMAGE_REF}" \ -t "${IMAGE_REF}" \
"${BUILDER_DIR}" "${BUILDER_DIR}"
else else
echo "=== using existing builder image ${IMAGE_REF} ===" echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
fi fi
set -- \ set -- \
run --rm --privileged \ run --rm --privileged \
--platform "${BUILDER_PLATFORM}" \
-v "${REPO_ROOT}:/work" \ -v "${REPO_ROOT}:/work" \
-v "${CACHE_DIR}:/cache" \ -v "${CACHE_DIR}:/cache" \
-e BEE_CONTAINER_BUILD=1 \ -e BEE_CONTAINER_BUILD=1 \
@@ -80,6 +106,7 @@ set -- \
if [ -n "$AUTH_KEYS" ]; then if [ -n "$AUTH_KEYS" ]; then
set -- run --rm --privileged \ set -- run --rm --privileged \
--platform "${BUILDER_PLATFORM}" \
-v "${REPO_ROOT}:/work" \ -v "${REPO_ROOT}:/work" \
-v "${CACHE_DIR}:/cache" \ -v "${CACHE_DIR}:/cache" \
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \ -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \

View File

@@ -159,6 +159,16 @@ else
echo "=== bee binary up to date, skipping build ===" echo "=== bee binary up to date, skipping build ==="
fi fi
echo ""
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
sh "${BUILDER_DIR}/build-cublas.sh" \
"${CUBLAS_VERSION}" \
"${CUDA_USERSPACE_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}"
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
GPU_STRESS_NEED_BUILD=1 GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
GPU_STRESS_NEED_BUILD=0 GPU_STRESS_NEED_BUILD=0
@@ -167,6 +177,7 @@ fi
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
echo "=== building bee-gpu-stress ===" echo "=== building bee-gpu-stress ==="
gcc -O2 -s -Wall -Wextra \ gcc -O2 -s -Wall -Wextra \
-I"${CUBLAS_CACHE}/include" \
-o "$GPU_STRESS_BIN" \ -o "$GPU_STRESS_BIN" \
"${BUILDER_DIR}/bee-gpu-stress.c" \ "${BUILDER_DIR}/bee-gpu-stress.c" \
-ldl -ldl
@@ -283,6 +294,10 @@ NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ===" echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# --- embed build metadata --- # --- embed build metadata ---
mkdir -p "${OVERLAY_STAGE_DIR}/etc" mkdir -p "${OVERLAY_STAGE_DIR}/etc"
BUILD_DATE="$(date +%Y-%m-%d)" BUILD_DATE="$(date +%Y-%m-%d)"
@@ -297,6 +312,8 @@ DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION} NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
NCCL_VERSION=${NCCL_VERSION} NCCL_VERSION=${NCCL_VERSION}
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION} NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
CUBLAS_VERSION=${CUBLAS_VERSION}
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
EOF EOF
# Patch motd with build info # Patch motd with build info

View File

@@ -20,6 +20,7 @@ openssh-server
# Filesystem support for USB export targets # Filesystem support for USB export targets
exfatprogs exfatprogs
exfat-fuse
ntfs-3g ntfs-3g
# Utilities # Utilities