diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 6e4d842..7418811 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -231,8 +231,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) { path, err := a.ExportLatestAudit(target) - body := "Audit exported." - if path != "" { + body := "Audit export failed." + if err == nil { + body = "Audit exported." + } + if err == nil && path != "" { body = "Audit exported to " + path } return ActionResult{Title: "Export audit", Body: body}, err @@ -249,8 +252,11 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) { path, err := a.ExportSupportBundle(target) - body := "Support bundle exported. USB target unmounted and safe to remove." - if path != "" { + body := "Support bundle export failed." + if err == nil { + body = "Support bundle exported. USB target unmounted and safe to remove." + } + if err == nil && path != "" { body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove." } return ActionResult{Title: "Export support bundle", Body: body}, err diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index b606005..0af6708 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -470,6 +470,41 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) { } } +func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) { + t.Parallel() + + tmp := t.TempDir() + oldExportDir := DefaultExportDir + DefaultExportDir = tmp + t.Cleanup(func() { DefaultExportDir = oldExportDir }) + + if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil { + t.Fatalf("write bee-audit.json: %v", err) + } + if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil { + t.Fatalf("write bee-audit.log: %v", err) + } + + a := &App{ + exports: fakeExports{ + exportToTargetFn: func(string, platform.RemovableTarget) (string, error) { + return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build") + }, + }, + } + + result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"}) + if err == nil { + t.Fatal("expected export error") + } + if contains(result.Body, "exported to") { + t.Fatalf("body should not claim success:\n%s", result.Body) + } + if result.Body != "Support bundle export failed." { + t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.") + } +} + func TestRunNvidiaAcceptancePackResult(t *testing.T) { t.Parallel() diff --git a/audit/internal/platform/export.go b/audit/internal/platform/export.go index 5ded3fa..3030642 100644 --- a/audit/internal/platform/export.go +++ b/audit/internal/platform/export.go @@ -11,8 +11,48 @@ import ( var exportExecCommand = exec.Command +func formatMountTargetError(target RemovableTarget, raw string, err error) error { + msg := strings.TrimSpace(raw) + fstype := strings.ToLower(strings.TrimSpace(target.FSType)) + if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") { + return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err) + } + if msg == "" { + return err + } + return fmt.Errorf("%s: %w", msg, err) +} + +func removableTargetReadOnly(fields map[string]string) bool { + if fields["RO"] == "1" { + return true + } + switch strings.ToLower(strings.TrimSpace(fields["FSTYPE"])) { + case "iso9660", "squashfs": + return true + default: + return false + } +} + +func ensureWritableMountpoint(mountpoint string) error { + probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*") + if err != nil { + return fmt.Errorf("target filesystem is not writable: %w", err) + } + name := probe.Name() + if closeErr := probe.Close(); closeErr != nil { + _ = os.Remove(name) + return closeErr + } + if err := os.Remove(name); err != nil { + return err + } + return nil +} + func (s *System) ListRemovableTargets() ([]RemovableTarget, error) { - raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output() + raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output() if err != nil { return nil, err } @@ -36,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) { } } } - if !removable || fields["FSTYPE"] == "" { + if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) { continue } @@ -72,7 +112,7 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str } if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil { _ = os.Remove(mountpoint) - return string(raw), err + return "", formatMountTargetError(target, string(raw), err) } mountedHere = true mounted = true @@ -95,6 +135,10 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str } }() + if err := ensureWritableMountpoint(mountpoint); err != nil { + return "", err + } + filename := filepath.Base(src) dst = filepath.Join(mountpoint, filename) data, err := os.ReadFile(src) diff --git a/audit/internal/platform/export_test.go b/audit/internal/platform/export_test.go index 35df2b7..54da24f 100644 --- a/audit/internal/platform/export_test.go +++ b/audit/internal/platform/export_test.go @@ -4,12 +4,11 @@ import ( "os" "os/exec" "path/filepath" + "strings" "testing" ) func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) { - t.Parallel() - tmp := t.TempDir() src := filepath.Join(tmp, "bundle.tar.gz") mountpoint := filepath.Join(tmp, "mnt") @@ -54,3 +53,60 @@ func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) { t.Fatalf("expected umount %q call, got %#v", mountpoint, calls) } } + +func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) { + tmp := t.TempDir() + src := filepath.Join(tmp, "bundle.tar.gz") + mountpoint := filepath.Join(tmp, "mnt") + if err := os.MkdirAll(mountpoint, 0755); err != nil { + t.Fatalf("mkdir mountpoint: %v", err) + } + if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil { + t.Fatalf("write src: %v", err) + } + if err := os.Chmod(mountpoint, 0555); err != nil { + t.Fatalf("chmod mountpoint: %v", err) + } + + oldExec := exportExecCommand + exportExecCommand = func(name string, args ...string) *exec.Cmd { + return exec.Command("sh", "-c", "exit 0") + } + t.Cleanup(func() { exportExecCommand = oldExec }) + + s := &System{} + _, err := s.ExportFileToTarget(src, RemovableTarget{ + Device: "/dev/sdb1", + Mountpoint: mountpoint, + }) + if err == nil { + t.Fatal("expected error for non-writable mountpoint") + } + if !strings.Contains(err.Error(), "target filesystem is not writable") { + t.Fatalf("err=%q want writable message", err) + } +} + +func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) { + oldExec := exportExecCommand + lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL="" +NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""` + exportExecCommand = func(name string, args ...string) *exec.Cmd { + cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"") + cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut) + return cmd + } + t.Cleanup(func() { exportExecCommand = oldExec }) + + s := &System{} + targets, err := s.ListRemovableTargets() + if err != nil { + t.Fatalf("ListRemovableTargets error: %v", err) + } + if len(targets) != 1 { + t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets) + } + if got := targets[0].Device; got != "/dev/sdb1" { + t.Fatalf("device=%q want /dev/sdb1", got) + } +} diff --git a/audit/internal/tui/forms.go b/audit/internal/tui/forms.go index 6836668..936d5c5 100644 --- a/audit/internal/tui/forms.go +++ b/audit/internal/tui/forms.go @@ -151,8 +151,10 @@ func (m model) confirmCancelTarget() screen { switch m.pendingAction { case actionExportBundle: return screenExportTargets - case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress: + case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT: return screenHealthCheck + case actionRunFanStress: + return screenBurnInTests default: return screenMain } @@ -165,9 +167,9 @@ func hcFanStressOpts(hcMode int, application interface { // Phase durations per mode: [baseline, load1, pause, load2] type durations struct{ baseline, load1, pause, load2 int } modes := [3]durations{ - {30, 120, 30, 120}, // Quick: ~5 min total - {60, 300, 60, 300}, // Standard: ~12 min total - {60, 600, 120, 600}, // Express: ~24 min total + {30, 120, 30, 120}, // Quick: ~5 min total + {60, 300, 60, 300}, // Standard: ~12 min total + {60, 600, 120, 600}, // Express: ~24 min total } if hcMode < 0 || hcMode >= len(modes) { hcMode = 0 diff --git a/audit/internal/tui/screen_burn_in.go b/audit/internal/tui/screen_burn_in.go new file mode 100644 index 0000000..7013962 --- /dev/null +++ b/audit/internal/tui/screen_burn_in.go @@ -0,0 +1,117 @@ +package tui + +import ( + "fmt" + "strings" + + tea "github.com/charmbracelet/bubbletea" +) + +const ( + burnCurGPUStress = 0 + burnCurModeQuick = 1 + burnCurModeStd = 2 + burnCurModeExpr = 3 + burnCurRun = 4 + burnCurTotal = 5 +) + +func (m model) enterBurnInTests() (tea.Model, tea.Cmd) { + m.screen = screenBurnInTests + m.cursor = 0 + if !m.burnInitialized { + m.burnMode = 0 + m.burnCursor = 0 + m.burnInitialized = true + } + return m, nil +} + +func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) { + switch msg.String() { + case "up", "k": + if m.burnCursor > 0 { + m.burnCursor-- + } + case "down", "j": + if m.burnCursor < burnCurTotal-1 { + m.burnCursor++ + } + case " ": + switch m.burnCursor { + case burnCurModeQuick, burnCurModeStd, burnCurModeExpr: + m.burnMode = m.burnCursor - burnCurModeQuick + } + case "enter": + switch m.burnCursor { + case burnCurGPUStress, burnCurRun: + return m.burnRunSelected() + case burnCurModeQuick, burnCurModeStd, burnCurModeExpr: + m.burnMode = m.burnCursor - burnCurModeQuick + } + case "f", "F", "r", "R": + return m.burnRunSelected() + case "1": + m.burnMode = 0 + case "2": + m.burnMode = 1 + case "3": + m.burnMode = 2 + case "esc": + m.screen = screenMain + m.cursor = 1 + case "q", "ctrl+c": + return m, tea.Quit + } + return m, nil +} + +func (m model) burnRunSelected() (tea.Model, tea.Cmd) { + return m.hcRunFanStress() +} + +func renderBurnInTests(m model) string { + var b strings.Builder + + fmt.Fprintln(&b, "BURN-IN TESTS") + fmt.Fprintln(&b) + fmt.Fprintln(&b, " Stress tests:") + fmt.Fprintln(&b) + + pfx := " " + if m.burnCursor == burnCurGPUStress { + pfx = "> " + } + fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx) + + fmt.Fprintln(&b) + fmt.Fprintln(&b, " Mode:") + modes := []struct{ label, key string }{ + {"Quick", "1"}, + {"Standard", "2"}, + {"Express", "3"}, + } + for i, mode := range modes { + pfx := " " + if m.burnCursor == burnCurModeQuick+i { + pfx = "> " + } + radio := "( )" + if m.burnMode == i { + radio = "(*)" + } + fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key) + } + + fmt.Fprintln(&b) + pfx = " " + if m.burnCursor == burnCurRun { + pfx = "> " + } + fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx) + + fmt.Fprintln(&b) + fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────") + fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back") + return b.String() +} diff --git a/audit/internal/tui/screen_export.go b/audit/internal/tui/screen_export.go index 5624a12..4df5b3b 100644 --- a/audit/internal/tui/screen_export.go +++ b/audit/internal/tui/screen_export.go @@ -4,7 +4,12 @@ import tea "github.com/charmbracelet/bubbletea" func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) { if len(m.targets) == 0 { - return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain) + return m, resultCmd( + "Export support bundle", + "No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list.", + nil, + screenMain, + ) } target := m.targets[m.cursor] m.selectedTarget = &target diff --git a/audit/internal/tui/screen_health_check.go b/audit/internal/tui/screen_health_check.go index d2a09be..9f0def1 100644 --- a/audit/internal/tui/screen_health_check.go +++ b/audit/internal/tui/screen_health_check.go @@ -21,17 +21,16 @@ const ( // Cursor positions in Health Check screen. const ( - hcCurGPU = 0 - hcCurMemory = 1 - hcCurStorage = 2 - hcCurCPU = 3 - hcCurSelectAll = 4 - hcCurModeQuick = 5 - hcCurModeStd = 6 - hcCurModeExpr = 7 - hcCurRunAll = 8 - hcCurFanStress = 9 - hcCurTotal = 10 + hcCurGPU = 0 + hcCurMemory = 1 + hcCurStorage = 2 + hcCurCPU = 3 + hcCurSelectAll = 4 + hcCurModeQuick = 5 + hcCurModeStd = 6 + hcCurModeExpr = 7 + hcCurRunAll = 8 + hcCurTotal = 9 ) // hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds. @@ -86,8 +85,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) { m.hcMode = m.hcCursor - hcCurModeQuick case hcCurRunAll: return m.hcRunAll() - case hcCurFanStress: - return m.hcRunFanStress() } case "g", "G": return m.hcRunSingle(hcGPU) @@ -99,8 +96,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) { return m.hcRunSingle(hcCPU) case "r", "R": return m.hcRunAll() - case "f", "F": - return m.hcRunFanStress() case "a", "A": allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3] for i := range m.hcSel { @@ -160,7 +155,7 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) { // startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart. func (m model) startGPUStressTest() (tea.Model, tea.Cmd) { - opts := hcFanStressOpts(m.hcMode, m.app) + opts := hcFanStressOpts(m.burnMode, m.app) ctx, cancel := context.WithCancel(context.Background()) m.gpuStressCancel = cancel @@ -197,7 +192,8 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) { m.gpuStressCancel = nil } m.gpuStressAborted = true - m.screen = screenHealthCheck + m.screen = screenBurnInTests + m.burnCursor = burnCurGPUStress m.cursor = 0 case "ctrl+c": return m, tea.Quit @@ -380,16 +376,8 @@ func renderHealthCheck(m model) string { fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx) } - { - pfx := " " - if m.hcCursor == hcCurFanStress { - pfx = "> " - } - fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx) - } - fmt.Fprintln(&b) fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────") - fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] gpu stress [Esc] back") + fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back") return b.String() } diff --git a/audit/internal/tui/screen_main.go b/audit/internal/tui/screen_main.go index a330eb6..50f6187 100644 --- a/audit/internal/tui/screen_main.go +++ b/audit/internal/tui/screen_main.go @@ -8,7 +8,9 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) { switch m.cursor { case 0: // Health Check return m.enterHealthCheck() - case 1: // Export support bundle + case 1: // Burn-in tests + return m.enterBurnInTests() + case 2: // Export support bundle m.pendingAction = actionExportBundle m.busy = true m.busyTitle = "Export support bundle" @@ -16,11 +18,11 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) { targets, err := m.app.ListRemovableTargets() return exportTargetsMsg{targets: targets, err: err} } - case 2: // Settings + case 3: // Settings m.screen = screenSettings m.cursor = 0 return m, nil - case 3: // Exit + case 4: // Exit return m, tea.Quit } return m, nil diff --git a/audit/internal/tui/tui_test.go b/audit/internal/tui/tui_test.go index 5e02faa..b71b451 100644 --- a/audit/internal/tui/tui_test.go +++ b/audit/internal/tui/tui_test.go @@ -54,9 +54,10 @@ func TestUpdateMainMenuEnterActions(t *testing.T) { wantCmd bool }{ {name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true}, - {name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true}, - {name: "settings", cursor: 2, wantScreen: screenSettings, wantCmd: true}, - {name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true}, + {name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests, wantCmd: true}, + {name: "export", cursor: 2, wantScreen: screenMain, wantBusy: true, wantCmd: true}, + {name: "settings", cursor: 3, wantScreen: screenSettings, wantCmd: true}, + {name: "exit", cursor: 4, wantScreen: screenMain, wantCmd: true}, } for _, test := range tests { @@ -115,7 +116,8 @@ func TestMainMenuSimpleTransitions(t *testing.T) { wantScreen screen }{ {name: "health_check", cursor: 0, wantScreen: screenHealthCheck}, - {name: "settings", cursor: 2, wantScreen: screenSettings}, + {name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests}, + {name: "settings", cursor: 3, wantScreen: screenSettings}, } for _, test := range tests { @@ -146,7 +148,7 @@ func TestMainMenuExportSetsBusy(t *testing.T) { t.Parallel() m := newTestModel() - m.cursor = 1 // Export support bundle + m.cursor = 2 // Export support bundle next, cmd := m.handleMainMenu() got := next.(model) @@ -163,12 +165,13 @@ func TestMainViewRendersTwoColumns(t *testing.T) { t.Parallel() m := newTestModel() - m.cursor = 1 + m.cursor = 2 view := m.View() for _, want := range []string{ "bee", "Health Check", + "Burn-in tests", "> Export support bundle", "Settings", "Exit", @@ -400,6 +403,11 @@ func TestConfirmCancelTarget(t *testing.T) { t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck) } + m.pendingAction = actionRunFanStress + if got := m.confirmCancelTarget(); got != screenBurnInTests { + t.Fatalf("fan stress cancel target=%q want %q", got, screenBurnInTests) + } + m.pendingAction = actionNone if got := m.confirmCancelTarget(); got != screenMain { t.Fatalf("default cancel target=%q want %q", got, screenMain) @@ -439,6 +447,68 @@ func TestViewBusyStateUsesBusyTitle(t *testing.T) { } } +func TestBurnInTestsEscReturnsToMain(t *testing.T) { + t.Parallel() + + m := newTestModel() + m.screen = screenBurnInTests + m.burnCursor = 3 + + next, _ := m.updateBurnInTests(tea.KeyMsg{Type: tea.KeyEsc}) + got := next.(model) + + if got.screen != screenMain { + t.Fatalf("screen=%q want %q", got.screen, screenMain) + } + if got.cursor != 1 { + t.Fatalf("cursor=%d want 1", got.cursor) + } +} + +func TestBurnInTestsRunOpensConfirm(t *testing.T) { + t.Parallel() + + m := newTestModel() + m.screen = screenBurnInTests + m.burnInitialized = true + m.burnMode = 2 + + next, _ := m.burnRunSelected() + got := next.(model) + + if got.screen != screenConfirm { + t.Fatalf("screen=%q want %q", got.screen, screenConfirm) + } + if got.pendingAction != actionRunFanStress { + t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunFanStress) + } + if got.cursor != 0 { + t.Fatalf("cursor=%d want 0", got.cursor) + } +} + +func TestViewBurnInTestsRendersGPUStressEntry(t *testing.T) { + t.Parallel() + + m := newTestModel() + m.screen = screenBurnInTests + + view := m.View() + + for _, want := range []string{ + "BURN-IN TESTS", + "GPU PLATFORM STRESS TEST", + "Quick", + "Standard", + "Express", + "[ RUN SELECTED [R] ]", + } { + if !strings.Contains(view, want) { + t.Fatalf("view missing %q\nview:\n%s", want, view) + } + } +} + func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) { t.Parallel() @@ -528,7 +598,7 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) { for _, want := range []string{ "Export support bundle", - "Select removable filesystem", + "Select writable removable filesystem (read-only/boot media hidden)", "> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee", } { if !strings.Contains(view, want) { @@ -537,6 +607,32 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) { } } +func TestExportTargetsMsgEmptyShowsHiddenBootMediaHint(t *testing.T) { + t.Parallel() + + m := newTestModel() + m.busy = true + m.busyTitle = "Export support bundle" + + next, _ := m.Update(exportTargetsMsg{}) + got := next.(model) + + if got.screen != screenOutput { + t.Fatalf("screen=%q want %q", got.screen, screenOutput) + } + if got.title != "Export support bundle" { + t.Fatalf("title=%q want %q", got.title, "Export support bundle") + } + for _, want := range []string{ + "No writable removable filesystems found.", + "Read-only or boot media are hidden from this list.", + } { + if !strings.Contains(got.body, want) { + t.Fatalf("body missing %q\nbody:\n%s", want, got.body) + } + } +} + func TestViewStaticFormRendersFields(t *testing.T) { t.Parallel() diff --git a/audit/internal/tui/types.go b/audit/internal/tui/types.go index 9c10778..3f0c148 100644 --- a/audit/internal/tui/types.go +++ b/audit/internal/tui/types.go @@ -16,6 +16,7 @@ type screen string const ( screenMain screen = "main" screenHealthCheck screen = "health_check" + screenBurnInTests screen = "burn_in_tests" screenSettings screen = "settings" screenNetwork screen = "network" screenInterfacePick screen = "interface_pick" @@ -41,8 +42,8 @@ const ( actionRunMemorySAT actionKind = "run_memory_sat" actionRunStorageSAT actionKind = "run_storage_sat" actionRunCPUSAT actionKind = "run_cpu_sat" - actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat" - actionRunFanStress actionKind = "run_fan_stress" + actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat" + actionRunFanStress actionKind = "run_fan_stress" ) type model struct { @@ -84,6 +85,11 @@ type model struct { hcCursor int hcInitialized bool + // Burn-in tests screen + burnMode int + burnCursor int + burnInitialized bool + // NVIDIA SAT setup nvidiaGPUs []platform.NvidiaGPU nvidiaGPUSel []bool @@ -97,9 +103,9 @@ type model struct { // GPU Platform Stress Test running gpuStressCancel func() gpuStressAborted bool - gpuLiveRows []platform.GPUMetricRow - gpuLiveIndices []int - gpuLiveStart time.Time + gpuLiveRows []platform.GPUMetricRow + gpuLiveIndices []int + gpuLiveStart time.Time // SAT verbose progress (CPU / Memory / Storage / AMD GPU) progressLines []string @@ -132,6 +138,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model { screen: screenMain, mainMenu: []string{ "Health Check", + "Burn-in tests", "Export support bundle", "Settings", "Exit", @@ -201,7 +208,7 @@ func (m model) confirmBody() (string, string) { modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"} return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" + "Monitors fans, temps, power — detects throttling.\n" + - "Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed." + "Mode: " + modes[m.burnMode] + "\n\nAll NVIDIA GPUs will be stressed." default: return "Confirm", "Proceed?" } diff --git a/audit/internal/tui/update.go b/audit/internal/tui/update.go index ecea4af..ce7d79f 100644 --- a/audit/internal/tui/update.go +++ b/audit/internal/tui/update.go @@ -101,6 +101,13 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { m.screen = screenOutput return m, m.refreshSnapshotCmd() } + if len(msg.targets) == 0 { + m.title = "Export support bundle" + m.body = "No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list." + m.prevScreen = screenMain + m.screen = screenOutput + return m, m.refreshSnapshotCmd() + } m.targets = msg.targets m.screen = screenExportTargets m.cursor = 0 @@ -117,7 +124,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { m.gpuStressCancel() m.gpuStressCancel = nil } - m.prevScreen = screenHealthCheck + m.prevScreen = screenBurnInTests m.screen = screenOutput m.title = msg.title if msg.err != nil { @@ -179,6 +186,8 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) { return m.updateMain(msg) case screenHealthCheck: return m.updateHealthCheck(msg) + case screenBurnInTests: + return m.updateBurnInTests(msg) case screenSettings: return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu) case screenNetwork: diff --git a/audit/internal/tui/view.go b/audit/internal/tui/view.go index 62b0b67..487ee03 100644 --- a/audit/internal/tui/view.go +++ b/audit/internal/tui/view.go @@ -57,6 +57,8 @@ func (m model) View() string { body = renderTwoColumnMain(m) case screenHealthCheck: body = renderHealthCheck(m) + case screenBurnInTests: + body = renderBurnInTests(m) case screenSettings: body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor) case screenNetwork: @@ -66,7 +68,12 @@ func (m model) View() string { case screenServiceAction: body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor) case screenExportTargets: - body = renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor) + body = renderMenu( + "Export support bundle", + "Select writable removable filesystem (read-only/boot media hidden)", + renderTargetItems(m.targets), + m.cursor, + ) case screenInterfacePick: body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor) case screenStaticForm: diff --git a/bible-local/architecture/runtime-flows.md b/bible-local/architecture/runtime-flows.md index 8c795d6..e0305d1 100644 --- a/bible-local/architecture/runtime-flows.md +++ b/bible-local/architecture/runtime-flows.md @@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available. ## Boot sequence (single ISO) +The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable. + `systemd` boot order: ``` @@ -25,6 +27,7 @@ local-fs.target ``` **Critical invariants:** +- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot. - OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`. - `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking. - `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`. @@ -71,24 +74,39 @@ build-in-container.sh [--authorized-keys /path/to/keys] d. build kernel modules against Debian headers e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache f. cache in `dist/nvidia--/` - 7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/` - 8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi` - 9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/` - 10. write staged `/etc/bee-release` (versions + git commit) - 11. patch staged `motd` with build metadata - 12. copy `iso/builder/` into a temporary live-build workdir under `dist/` - 13. sync staged overlay into workdir `config/includes.chroot/` - 14. run `lb config && lb build` inside the privileged builder container + 7. `build-cublas.sh`: + a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo + b. verify packages against repo `Packages.gz` + c. extract headers for `bee-gpu-stress` build + d. cache userspace libs in `dist/cublas-+cuda/` + 8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers + 9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/` + 10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi` + 11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/` + 12. write staged `/etc/bee-release` (versions + git commit) + 13. patch staged `motd` with build metadata + 14. copy `iso/builder/` into a temporary live-build workdir under `dist/` + 15. sync staged overlay into workdir `config/includes.chroot/` + 16. run `lb config && lb build` inside the privileged builder container ``` +Build host notes: +- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon. +- Override with `BEE_BUILDER_PLATFORM=` only if you intentionally need a different container platform. +- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically. + **Critical invariants:** - `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places: 1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build 2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO - NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules//extra/`. +- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers. +- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot. - The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay. - The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean. - Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly. +- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`. +- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state. ## Post-boot smoke test @@ -131,10 +149,15 @@ Current validation state: Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal. Acceptance flows: -- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress` +- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress` - `bee sat memory` → `memtester` archive - `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported - SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`) +- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path: + - Ampere: `fp16` + `fp32`/TF32 tensor-core load + - Ada / Hopper: add `fp8` + - Blackwell+: add `fp4` + - PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes - Runtime overrides: - `BEE_GPU_STRESS_SECONDS` - `BEE_GPU_STRESS_SIZE_MB` diff --git a/bible-local/architecture/system-overview.md b/bible-local/architecture/system-overview.md index 960db4a..92d1e82 100644 --- a/bible-local/architecture/system-overview.md +++ b/bible-local/architecture/system-overview.md @@ -21,7 +21,8 @@ Fills gaps where Redfish/logpile is blind: - Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID - Machine-readable health summary derived from collector verdicts - Operator-triggered acceptance tests for NVIDIA, memory, and storage -- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress` +- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress` +- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable - Automatic boot audit with operator-facing local console and SSH access - NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi` - SSH access (OpenSSH) always available for inspection and debugging @@ -69,6 +70,7 @@ Fills gaps where Redfish/logpile is blind: | SSH | OpenSSH server | | NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers | | NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` | +| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback | | Builder | Debian 12 host/VM or Debian 12 container image | ## Operator UX @@ -78,6 +80,7 @@ Fills gaps where Redfish/logpile is blind: - The TUI itself executes privileged actions as `root` via `sudo -n` - SSH remains available independently of the local console path - VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging +- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries ## Runtime split @@ -85,6 +88,7 @@ Fills gaps where Redfish/logpile is blind: - Live-ISO-only responsibilities stay in `iso/` integration code - Live ISO launches the Go CLI with `--runtime livecd` - Local/manual runs use `--runtime auto` or `--runtime local` +- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup ## Key paths diff --git a/iso/README.md b/iso/README.md new file mode 100644 index 0000000..31792b7 --- /dev/null +++ b/iso/README.md @@ -0,0 +1,58 @@ +# ISO Build + +`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`. + +## Requirements + +- Docker Desktop or another Docker-compatible container runtime +- Privileged containers enabled +- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages + +## Build On macOS + +From the repository root: + +```sh +sh iso/builder/build-in-container.sh +``` + +The script defaults to `linux/amd64` builder containers, so it works on: + +- Intel Mac +- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM + +You do not need to pass `--platform` manually for normal ISO builds. + +## Useful Options + +Build with explicit SSH keys baked into the ISO: + +```sh +sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub +``` + +Rebuild the builder image: + +```sh +sh iso/builder/build-in-container.sh --rebuild-image +``` + +Use a custom cache directory: + +```sh +sh iso/builder/build-in-container.sh --cache-dir /path/to/cache +``` + +## Notes + +- The builder image is automatically rebuilt if the local tag exists for the wrong architecture. +- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present. +- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI. +- Override the container platform only if you know why: + +```sh +BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh +``` + +- The shipped ISO is still `amd64`. +- Output ISO artifacts are written under `dist/`. diff --git a/iso/builder/VERSIONS b/iso/builder/VERSIONS index bf48d6e..db45732 100644 --- a/iso/builder/VERSIONS +++ b/iso/builder/VERSIONS @@ -4,5 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01 NCCL_VERSION=2.28.9-1 NCCL_CUDA_VERSION=13.0 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186 +CUBLAS_VERSION=13.0.2.14-1 +CUDA_USERSPACE_VERSION=13.0.96-1 GO_VERSION=1.24.0 AUDIT_VERSION=1.0.0 diff --git a/iso/builder/auto/config b/iso/builder/auto/config index a0845cb..bf1d7cc 100755 --- a/iso/builder/auto/config +++ b/iso/builder/auto/config @@ -32,6 +32,6 @@ lb config noauto \ --memtest none \ --iso-volume "EASY-BEE" \ --iso-application "EASY-BEE" \ - --bootappend-live "boot=live components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ + --bootappend-live "boot=live toram components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --apt-recommends false \ "${@}" diff --git a/iso/builder/bee-gpu-stress.c b/iso/builder/bee-gpu-stress.c index b11f41b..2d0c212 100644 --- a/iso/builder/bee-gpu-stress.c +++ b/iso/builder/bee-gpu-stress.c @@ -1,12 +1,25 @@ #define _POSIX_C_SOURCE 200809L #include +#include +#include #include #include #include #include #include +#if defined(__has_include) +#if __has_include() +#include +#define HAVE_CUBLASLT_HEADERS 1 +#else +#define HAVE_CUBLASLT_HEADERS 0 +#endif +#else +#define HAVE_CUBLASLT_HEADERS 0 +#endif + typedef int CUdevice; typedef uint64_t CUdeviceptr; typedef int CUresult; @@ -16,6 +29,8 @@ typedef void *CUfunction; typedef void *CUstream; #define CU_SUCCESS 0 +#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75 +#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76 static const char *ptx_source = ".version 6.0\n" @@ -60,11 +75,13 @@ typedef CUresult (*cuInit_fn)(unsigned int); typedef CUresult (*cuDeviceGetCount_fn)(int *); typedef CUresult (*cuDeviceGet_fn)(CUdevice *, int); typedef CUresult (*cuDeviceGetName_fn)(char *, int, CUdevice); +typedef CUresult (*cuDeviceGetAttribute_fn)(int *, int, CUdevice); typedef CUresult (*cuCtxCreate_fn)(CUcontext *, unsigned int, CUdevice); typedef CUresult (*cuCtxDestroy_fn)(CUcontext); typedef CUresult (*cuCtxSynchronize_fn)(void); typedef CUresult (*cuMemAlloc_fn)(CUdeviceptr *, size_t); typedef CUresult (*cuMemFree_fn)(CUdeviceptr); +typedef CUresult (*cuMemsetD8_fn)(CUdeviceptr, unsigned char, size_t); typedef CUresult (*cuMemcpyHtoD_fn)(CUdeviceptr, const void *, size_t); typedef CUresult (*cuMemcpyDtoH_fn)(void *, CUdeviceptr, size_t); typedef CUresult (*cuModuleLoadDataEx_fn)(CUmodule *, const void *, unsigned int, void *, void *); @@ -89,11 +106,13 @@ struct cuda_api { cuDeviceGetCount_fn cuDeviceGetCount; cuDeviceGet_fn cuDeviceGet; cuDeviceGetName_fn cuDeviceGetName; + cuDeviceGetAttribute_fn cuDeviceGetAttribute; cuCtxCreate_fn cuCtxCreate; cuCtxDestroy_fn cuCtxDestroy; cuCtxSynchronize_fn cuCtxSynchronize; cuMemAlloc_fn cuMemAlloc; cuMemFree_fn cuMemFree; + cuMemsetD8_fn cuMemsetD8; cuMemcpyHtoD_fn cuMemcpyHtoD; cuMemcpyDtoH_fn cuMemcpyDtoH; cuModuleLoadDataEx_fn cuModuleLoadDataEx; @@ -103,6 +122,17 @@ struct cuda_api { cuGetErrorString_fn cuGetErrorString; }; +struct stress_report { + char backend[32]; + char device[128]; + int cc_major; + int cc_minor; + int buffer_mb; + unsigned long iterations; + uint64_t checksum; + char details[1024]; +}; + static int load_symbol(void *lib, const char *name, void **out) { *out = dlsym(lib, name); return *out != NULL; @@ -119,11 +149,13 @@ static int load_cuda(struct cuda_api *api) { load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) && load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) && load_symbol(api->lib, "cuDeviceGetName", (void **)&api->cuDeviceGetName) && + load_symbol(api->lib, "cuDeviceGetAttribute", (void **)&api->cuDeviceGetAttribute) && load_symbol(api->lib, "cuCtxCreate_v2", (void **)&api->cuCtxCreate) && load_symbol(api->lib, "cuCtxDestroy_v2", (void **)&api->cuCtxDestroy) && load_symbol(api->lib, "cuCtxSynchronize", (void **)&api->cuCtxSynchronize) && load_symbol(api->lib, "cuMemAlloc_v2", (void **)&api->cuMemAlloc) && load_symbol(api->lib, "cuMemFree_v2", (void **)&api->cuMemFree) && + load_symbol(api->lib, "cuMemsetD8_v2", (void **)&api->cuMemsetD8) && load_symbol(api->lib, "cuMemcpyHtoD_v2", (void **)&api->cuMemcpyHtoD) && load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) && load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) && @@ -161,6 +193,905 @@ static double now_seconds(void) { return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0); } +#if HAVE_CUBLASLT_HEADERS +static size_t round_down_size(size_t value, size_t multiple) { + if (multiple == 0 || value < multiple) { + return value; + } + return value - (value % multiple); +} +#endif + +static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) { + int cc_major = 0; + int cc_minor = 0; + if (!check_rc(api, + "cuDeviceGetAttribute(major)", + api->cuDeviceGetAttribute(&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev))) { + return 0; + } + if (!check_rc(api, + "cuDeviceGetAttribute(minor)", + api->cuDeviceGetAttribute(&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev))) { + return 0; + } + *major = cc_major; + *minor = cc_minor; + return 1; +} + +#if HAVE_CUBLASLT_HEADERS +static void append_detail(char *buf, size_t cap, const char *fmt, ...) { + size_t len = strlen(buf); + if (len >= cap) { + return; + } + va_list ap; + va_start(ap, fmt); + vsnprintf(buf + len, cap - len, fmt, ap); + va_end(ap); +} +#endif + +static int run_ptx_fallback(struct cuda_api *api, + CUdevice dev, + const char *device_name, + int cc_major, + int cc_minor, + int seconds, + int size_mb, + struct stress_report *report) { + CUcontext ctx = NULL; + CUdeviceptr device_mem = 0; + CUmodule module = NULL; + CUfunction kernel = NULL; + uint32_t sample[256]; + uint32_t words = 0; + unsigned long iterations = 0; + + memset(report, 0, sizeof(*report)); + snprintf(report->backend, sizeof(report->backend), "driver-ptx"); + snprintf(report->device, sizeof(report->device), "%s", device_name); + report->cc_major = cc_major; + report->cc_minor = cc_minor; + report->buffer_mb = size_mb; + + if (!check_rc(api, "cuCtxCreate", api->cuCtxCreate(&ctx, 0, dev))) { + return 0; + } + + size_t bytes = (size_t)size_mb * 1024u * 1024u; + if (bytes < 4u * 1024u * 1024u) { + bytes = 4u * 1024u * 1024u; + } + if (bytes > (size_t)1024u * 1024u * 1024u) { + bytes = (size_t)1024u * 1024u * 1024u; + } + words = (uint32_t)(bytes / sizeof(uint32_t)); + + if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem, bytes))) { + api->cuCtxDestroy(ctx); + return 0; + } + if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem, 0, bytes))) { + api->cuMemFree(device_mem); + api->cuCtxDestroy(ctx); + return 0; + } + if (!check_rc(api, + "cuModuleLoadDataEx", + api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) { + api->cuMemFree(device_mem); + api->cuCtxDestroy(ctx); + return 0; + } + if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) { + api->cuMemFree(device_mem); + api->cuCtxDestroy(ctx); + return 0; + } + + unsigned int threads = 256; + unsigned int blocks = (unsigned int)((words + threads - 1) / threads); + uint32_t rounds = 1024; + void *params[] = {&device_mem, &words, &rounds}; + + double start = now_seconds(); + double deadline = start + (double)seconds; + while (now_seconds() < deadline) { + if (!check_rc(api, + "cuLaunchKernel", + api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) { + api->cuMemFree(device_mem); + api->cuCtxDestroy(ctx); + return 0; + } + iterations++; + } + + if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) { + api->cuMemFree(device_mem); + api->cuCtxDestroy(ctx); + return 0; + } + if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem, sizeof(sample)))) { + api->cuMemFree(device_mem); + api->cuCtxDestroy(ctx); + return 0; + } + + for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) { + report->checksum += sample[i]; + } + report->iterations = iterations; + snprintf(report->details, + sizeof(report->details), + "profile_int32_fallback=OK iterations=%lu\n", + iterations); + + api->cuMemFree(device_mem); + api->cuCtxDestroy(ctx); + return 1; +} + +#if HAVE_CUBLASLT_HEADERS +typedef cublasStatus_t (*cublasLtCreate_fn)(cublasLtHandle_t *); +typedef cublasStatus_t (*cublasLtDestroy_fn)(cublasLtHandle_t); +typedef cublasStatus_t (*cublasLtMatmulDescCreate_fn)(cublasLtMatmulDesc_t *, + cublasComputeType_t, + cudaDataType_t); +typedef cublasStatus_t (*cublasLtMatmulDescDestroy_fn)(cublasLtMatmulDesc_t); +typedef cublasStatus_t (*cublasLtMatmulDescSetAttribute_fn)(cublasLtMatmulDesc_t, + cublasLtMatmulDescAttributes_t, + const void *, + size_t); +typedef cublasStatus_t (*cublasLtMatrixLayoutCreate_fn)(cublasLtMatrixLayout_t *, + cudaDataType_t, + uint64_t, + uint64_t, + int64_t); +typedef cublasStatus_t (*cublasLtMatrixLayoutDestroy_fn)(cublasLtMatrixLayout_t); +typedef cublasStatus_t (*cublasLtMatmulPreferenceCreate_fn)(cublasLtMatmulPreference_t *); +typedef cublasStatus_t (*cublasLtMatmulPreferenceDestroy_fn)(cublasLtMatmulPreference_t); +typedef cublasStatus_t (*cublasLtMatmulPreferenceSetAttribute_fn)(cublasLtMatmulPreference_t, + cublasLtMatmulPreferenceAttributes_t, + const void *, + size_t); +typedef cublasStatus_t (*cublasLtMatmulAlgoGetHeuristic_fn)( + cublasLtHandle_t, + cublasLtMatmulDesc_t, + cublasLtMatrixLayout_t, + cublasLtMatrixLayout_t, + cublasLtMatrixLayout_t, + cublasLtMatrixLayout_t, + cublasLtMatmulPreference_t, + int, + cublasLtMatmulHeuristicResult_t *, + int *); +typedef cublasStatus_t (*cublasLtMatmul_fn)(cublasLtHandle_t, + cublasLtMatmulDesc_t, + const void *, + const void *, + cublasLtMatrixLayout_t, + const void *, + cublasLtMatrixLayout_t, + const void *, + const void *, + cublasLtMatrixLayout_t, + void *, + cublasLtMatrixLayout_t, + const cublasLtMatmulAlgo_t *, + void *, + size_t, + cudaStream_t); + +struct cublaslt_api { + void *lib; + cublasLtCreate_fn cublasLtCreate; + cublasLtDestroy_fn cublasLtDestroy; + cublasLtMatmulDescCreate_fn cublasLtMatmulDescCreate; + cublasLtMatmulDescDestroy_fn cublasLtMatmulDescDestroy; + cublasLtMatmulDescSetAttribute_fn cublasLtMatmulDescSetAttribute; + cublasLtMatrixLayoutCreate_fn cublasLtMatrixLayoutCreate; + cublasLtMatrixLayoutDestroy_fn cublasLtMatrixLayoutDestroy; + cublasLtMatmulPreferenceCreate_fn cublasLtMatmulPreferenceCreate; + cublasLtMatmulPreferenceDestroy_fn cublasLtMatmulPreferenceDestroy; + cublasLtMatmulPreferenceSetAttribute_fn cublasLtMatmulPreferenceSetAttribute; + cublasLtMatmulAlgoGetHeuristic_fn cublasLtMatmulAlgoGetHeuristic; + cublasLtMatmul_fn cublasLtMatmul; +}; + +struct profile_desc { + const char *name; + const char *block_label; + int min_cc; + int enabled; + int needs_scalar_scale; + int needs_block_scale; + int min_multiple; + cudaDataType_t a_type; + cudaDataType_t b_type; + cudaDataType_t c_type; + cudaDataType_t d_type; + cublasComputeType_t compute_type; +}; + +struct prepared_profile { + struct profile_desc desc; + cublasLtMatmulDesc_t op_desc; + cublasLtMatrixLayout_t a_layout; + cublasLtMatrixLayout_t b_layout; + cublasLtMatrixLayout_t c_layout; + cublasLtMatrixLayout_t d_layout; + cublasLtMatmulPreference_t preference; + cublasLtMatmulHeuristicResult_t heuristic; + CUdeviceptr a_dev; + CUdeviceptr b_dev; + CUdeviceptr c_dev; + CUdeviceptr d_dev; + CUdeviceptr a_scale_dev; + CUdeviceptr b_scale_dev; + CUdeviceptr workspace_dev; + size_t workspace_size; + uint64_t m; + uint64_t n; + uint64_t k; + unsigned long iterations; + int ready; +}; + +static const struct profile_desc k_profiles[] = { + { + "fp32_tf32", + "fp32", + 80, + 1, + 0, + 0, + 128, + CUDA_R_32F, + CUDA_R_32F, + CUDA_R_32F, + CUDA_R_32F, + CUBLAS_COMPUTE_32F_FAST_TF32, + }, + { + "fp16_tensor", + "fp16", + 80, + 1, + 0, + 0, + 128, + CUDA_R_16F, + CUDA_R_16F, + CUDA_R_16F, + CUDA_R_16F, + CUBLAS_COMPUTE_32F_FAST_16F, + }, + { + "fp8_e4m3", + "fp8", + 89, + 1, + 1, + 0, + 128, + CUDA_R_8F_E4M3, + CUDA_R_8F_E4M3, + CUDA_R_16BF, + CUDA_R_16BF, + CUBLAS_COMPUTE_32F, + }, + { + "fp8_e5m2", + "fp8", + 89, + 1, + 1, + 0, + 128, + CUDA_R_8F_E5M2, + CUDA_R_8F_E5M2, + CUDA_R_16BF, + CUDA_R_16BF, + CUBLAS_COMPUTE_32F, + }, +#if defined(CUDA_R_4F_E2M1) && defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3) + { + "fp4_e2m1", + "fp4", + 100, + 1, + 0, + 1, + 128, + CUDA_R_4F_E2M1, + CUDA_R_4F_E2M1, + CUDA_R_16BF, + CUDA_R_16BF, + CUBLAS_COMPUTE_32F, + }, +#endif +}; + +static int load_cublaslt(struct cublaslt_api *api) { + memset(api, 0, sizeof(*api)); + api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL); + if (!api->lib) { + api->lib = dlopen("libcublasLt.so", RTLD_NOW | RTLD_LOCAL); + } + if (!api->lib) { + return 0; + } + return + load_symbol(api->lib, "cublasLtCreate", (void **)&api->cublasLtCreate) && + load_symbol(api->lib, "cublasLtDestroy", (void **)&api->cublasLtDestroy) && + load_symbol(api->lib, "cublasLtMatmulDescCreate", (void **)&api->cublasLtMatmulDescCreate) && + load_symbol(api->lib, "cublasLtMatmulDescDestroy", (void **)&api->cublasLtMatmulDescDestroy) && + load_symbol(api->lib, + "cublasLtMatmulDescSetAttribute", + (void **)&api->cublasLtMatmulDescSetAttribute) && + load_symbol(api->lib, "cublasLtMatrixLayoutCreate", (void **)&api->cublasLtMatrixLayoutCreate) && + load_symbol(api->lib, "cublasLtMatrixLayoutDestroy", (void **)&api->cublasLtMatrixLayoutDestroy) && + load_symbol(api->lib, + "cublasLtMatmulPreferenceCreate", + (void **)&api->cublasLtMatmulPreferenceCreate) && + load_symbol(api->lib, + "cublasLtMatmulPreferenceDestroy", + (void **)&api->cublasLtMatmulPreferenceDestroy) && + load_symbol(api->lib, + "cublasLtMatmulPreferenceSetAttribute", + (void **)&api->cublasLtMatmulPreferenceSetAttribute) && + load_symbol(api->lib, + "cublasLtMatmulAlgoGetHeuristic", + (void **)&api->cublasLtMatmulAlgoGetHeuristic) && + load_symbol(api->lib, "cublasLtMatmul", (void **)&api->cublasLtMatmul); +} + +static const char *cublas_status_text(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + default: + return "CUBLAS_STATUS_UNKNOWN"; + } +} + +static int check_cublas(const char *step, cublasStatus_t status) { + if (status == CUBLAS_STATUS_SUCCESS) { + return 1; + } + fprintf(stderr, "%s failed: %s (%d)\n", step, cublas_status_text(status), (int)status); + return 0; +} + +static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) { + switch (type) { + case CUDA_R_32F: + return (size_t)(elements * 4u); + case CUDA_R_16F: + case CUDA_R_16BF: + return (size_t)(elements * 2u); + case CUDA_R_8F_E4M3: + case CUDA_R_8F_E5M2: + return (size_t)(elements); +#if defined(CUDA_R_4F_E2M1) + case CUDA_R_4F_E2M1: + return (size_t)((elements + 1u) / 2u); +#endif + default: + return (size_t)(elements * 4u); + } +} + +static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) { + uint64_t row_tiles = (rows + 127u) / 128u; + uint64_t col_tiles = (cols + 63u) / 64u; + return (size_t)(row_tiles * col_tiles * 128u); +} + +static uint64_t choose_square_dim(size_t budget_bytes, size_t bytes_per_cell, int multiple) { + double approx = sqrt((double)budget_bytes / (double)bytes_per_cell); + uint64_t dim = (uint64_t)approx; + if (dim < (uint64_t)multiple) { + dim = (uint64_t)multiple; + } + dim = (uint64_t)round_down_size((size_t)dim, (size_t)multiple); + if (dim < (uint64_t)multiple) { + dim = (uint64_t)multiple; + } + if (dim > 8192u) { + dim = 8192u; + } + return dim; +} + +static int device_upload(struct cuda_api *cuda, CUdeviceptr dev, const void *src, size_t bytes) { + return check_rc(cuda, "cuMemcpyHtoD", cuda->cuMemcpyHtoD(dev, src, bytes)); +} + +static int alloc_filled(struct cuda_api *cuda, CUdeviceptr *ptr, size_t bytes, unsigned char pattern) { + if (!check_rc(cuda, "cuMemAlloc", cuda->cuMemAlloc(ptr, bytes))) { + return 0; + } + if (!check_rc(cuda, "cuMemsetD8", cuda->cuMemsetD8(*ptr, pattern, bytes))) { + cuda->cuMemFree(*ptr); + *ptr = 0; + return 0; + } + return 1; +} + +static size_t profile_scale_bytes(const struct profile_desc *desc, uint64_t m, uint64_t n, uint64_t k) { + size_t bytes = 0; + if (desc->needs_scalar_scale) { + bytes += 2u * sizeof(float); + } +#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3) + if (desc->needs_block_scale) { + bytes += fp4_scale_bytes(k, m); + bytes += fp4_scale_bytes(k, n); + } +#else + (void)m; + (void)n; + (void)k; +#endif + return bytes; +} + +static void destroy_profile(struct cublaslt_api *cublas, struct cuda_api *cuda, struct prepared_profile *profile) { + if (profile->workspace_dev) { + cuda->cuMemFree(profile->workspace_dev); + } + if (profile->a_scale_dev) { + cuda->cuMemFree(profile->a_scale_dev); + } + if (profile->b_scale_dev) { + cuda->cuMemFree(profile->b_scale_dev); + } + if (profile->d_dev) { + cuda->cuMemFree(profile->d_dev); + } + if (profile->c_dev) { + cuda->cuMemFree(profile->c_dev); + } + if (profile->b_dev) { + cuda->cuMemFree(profile->b_dev); + } + if (profile->a_dev) { + cuda->cuMemFree(profile->a_dev); + } + if (profile->preference) { + cublas->cublasLtMatmulPreferenceDestroy(profile->preference); + } + if (profile->d_layout) { + cublas->cublasLtMatrixLayoutDestroy(profile->d_layout); + } + if (profile->c_layout) { + cublas->cublasLtMatrixLayoutDestroy(profile->c_layout); + } + if (profile->b_layout) { + cublas->cublasLtMatrixLayoutDestroy(profile->b_layout); + } + if (profile->a_layout) { + cublas->cublasLtMatrixLayoutDestroy(profile->a_layout); + } + if (profile->op_desc) { + cublas->cublasLtMatmulDescDestroy(profile->op_desc); + } + memset(profile, 0, sizeof(*profile)); +} + +static int prepare_profile(struct cublaslt_api *cublas, + cublasLtHandle_t handle, + struct cuda_api *cuda, + const struct profile_desc *desc, + size_t profile_budget_bytes, + struct prepared_profile *out) { + memset(out, 0, sizeof(*out)); + out->desc = *desc; + + size_t bytes_per_cell = 0; + bytes_per_cell += bytes_for_elements(desc->a_type, 1); + bytes_per_cell += bytes_for_elements(desc->b_type, 1); + bytes_per_cell += bytes_for_elements(desc->c_type, 1); + bytes_per_cell += bytes_for_elements(desc->d_type, 1); + if (bytes_per_cell == 0) { + return 0; + } + + uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple); + out->m = dim; + out->n = dim; + out->k = dim; + + size_t desired_workspace = profile_budget_bytes / 8u; + if (desired_workspace > 32u * 1024u * 1024u) { + desired_workspace = 32u * 1024u * 1024u; + } + desired_workspace = round_down_size(desired_workspace, 256u); + + size_t a_bytes = 0; + size_t b_bytes = 0; + size_t c_bytes = 0; + size_t d_bytes = 0; + size_t scale_bytes = 0; + while (1) { + a_bytes = bytes_for_elements(desc->a_type, out->k * out->m); + b_bytes = bytes_for_elements(desc->b_type, out->k * out->n); + c_bytes = bytes_for_elements(desc->c_type, out->m * out->n); + d_bytes = bytes_for_elements(desc->d_type, out->m * out->n); + scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k); + + size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes; + if (matrix_bytes <= profile_budget_bytes) { + size_t remaining = profile_budget_bytes - matrix_bytes; + out->workspace_size = desired_workspace; + if (out->workspace_size > remaining) { + out->workspace_size = round_down_size(remaining, 256u); + } + break; + } + + if (out->m <= (uint64_t)desc->min_multiple) { + return 0; + } + out->m -= (uint64_t)desc->min_multiple; + out->n = out->m; + out->k = out->m; + } + + if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) || + !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) || + !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) || + !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) { + destroy_profile(cublas, cuda, out); + return 0; + } + + if (!check_cublas("cublasLtMatmulDescCreate", + cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) { + destroy_profile(cublas, cuda, out); + return 0; + } + + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + if (!check_cublas("set TRANSA", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_TRANSA, + &transa, + sizeof(transa))) || + !check_cublas("set TRANSB", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_TRANSB, + &transb, + sizeof(transb)))) { + destroy_profile(cublas, cuda, out); + return 0; + } + + if (desc->needs_scalar_scale) { + float one = 1.0f; + if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) || + !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) { + destroy_profile(cublas, cuda, out); + return 0; + } + if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) || + !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) { + destroy_profile(cublas, cuda, out); + return 0; + } + void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev; + void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev; + if (!check_cublas("set A scale ptr", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, + &a_scale_ptr, + sizeof(a_scale_ptr))) || + !check_cublas("set B scale ptr", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, + &b_scale_ptr, + sizeof(b_scale_ptr)))) { + destroy_profile(cublas, cuda, out); + return 0; + } + } + +#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3) + if (desc->needs_block_scale) { + size_t a_scale_bytes = fp4_scale_bytes(out->k, out->m); + size_t b_scale_bytes = fp4_scale_bytes(out->k, out->n); + if (!alloc_filled(cuda, &out->a_scale_dev, a_scale_bytes, 0x11) || + !alloc_filled(cuda, &out->b_scale_dev, b_scale_bytes, 0x11)) { + destroy_profile(cublas, cuda, out); + return 0; + } + cublasLtMatmulMatrixScale_t scale_mode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3; + void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev; + void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev; + if (!check_cublas("set A scale mode", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_A_SCALE_MODE, + &scale_mode, + sizeof(scale_mode))) || + !check_cublas("set B scale mode", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_B_SCALE_MODE, + &scale_mode, + sizeof(scale_mode))) || + !check_cublas("set A block scale ptr", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, + &a_scale_ptr, + sizeof(a_scale_ptr))) || + !check_cublas("set B block scale ptr", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, + &b_scale_ptr, + sizeof(b_scale_ptr)))) { + destroy_profile(cublas, cuda, out); + return 0; + } + } +#endif + + if (!check_cublas("create A layout", + cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) || + !check_cublas("create B layout", + cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) || + !check_cublas("create C layout", + cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) || + !check_cublas("create D layout", + cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) { + destroy_profile(cublas, cuda, out); + return 0; + } + + if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) { + destroy_profile(cublas, cuda, out); + return 0; + } + + if (out->workspace_size > 0) { + if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) { + destroy_profile(cublas, cuda, out); + return 0; + } + } + + if (!check_cublas("set workspace", + cublas->cublasLtMatmulPreferenceSetAttribute( + out->preference, + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &out->workspace_size, + sizeof(out->workspace_size)))) { + destroy_profile(cublas, cuda, out); + return 0; + } + + int found = 0; + if (!check_cublas("heuristic", + cublas->cublasLtMatmulAlgoGetHeuristic(handle, + out->op_desc, + out->a_layout, + out->b_layout, + out->c_layout, + out->d_layout, + out->preference, + 1, + &out->heuristic, + &found))) { + destroy_profile(cublas, cuda, out); + return 0; + } + if (found <= 0) { + destroy_profile(cublas, cuda, out); + return 0; + } + + out->ready = 1; + return 1; +} + +static int run_cublas_profile(cublasLtHandle_t handle, + struct cublaslt_api *cublas, + struct prepared_profile *profile) { + float alpha = 1.0f; + float beta = 0.0f; + return check_cublas(profile->desc.name, + cublas->cublasLtMatmul(handle, + profile->op_desc, + &alpha, + (const void *)(uintptr_t)profile->a_dev, + profile->a_layout, + (const void *)(uintptr_t)profile->b_dev, + profile->b_layout, + &beta, + (const void *)(uintptr_t)profile->c_dev, + profile->c_layout, + (void *)(uintptr_t)profile->d_dev, + profile->d_layout, + &profile->heuristic.algo, + (void *)(uintptr_t)profile->workspace_dev, + profile->workspace_size, + (cudaStream_t)0)); +} + +static int run_cublaslt_stress(struct cuda_api *cuda, + CUdevice dev, + const char *device_name, + int cc_major, + int cc_minor, + int seconds, + int size_mb, + struct stress_report *report) { + struct cublaslt_api cublas; + struct prepared_profile prepared[sizeof(k_profiles) / sizeof(k_profiles[0])]; + cublasLtHandle_t handle = NULL; + CUcontext ctx = NULL; + uint16_t sample[256]; + int cc = cc_major * 10 + cc_minor; + int planned = 0; + int active = 0; + + memset(report, 0, sizeof(*report)); + snprintf(report->backend, sizeof(report->backend), "cublasLt"); + snprintf(report->device, sizeof(report->device), "%s", device_name); + report->cc_major = cc_major; + report->cc_minor = cc_minor; + report->buffer_mb = size_mb; + + if (!load_cublaslt(&cublas)) { + snprintf(report->details, sizeof(report->details), "cublasLt=unavailable\n"); + return 0; + } + if (!check_rc(cuda, "cuCtxCreate", cuda->cuCtxCreate(&ctx, 0, dev))) { + return 0; + } + if (!check_cublas("cublasLtCreate", cublas.cublasLtCreate(&handle))) { + cuda->cuCtxDestroy(ctx); + return 0; + } + + for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) { + if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) { + planned++; + } + } + if (planned <= 0) { + snprintf(report->details, sizeof(report->details), "cublasLt_profiles=unsupported\n"); + cublas.cublasLtDestroy(handle); + cuda->cuCtxDestroy(ctx); + return 0; + } + + size_t total_budget = (size_t)size_mb * 1024u * 1024u; + if (total_budget < (size_t)planned * 4u * 1024u * 1024u) { + total_budget = (size_t)planned * 4u * 1024u * 1024u; + } + size_t per_profile_budget = total_budget / (size_t)planned; + if (per_profile_budget < 4u * 1024u * 1024u) { + per_profile_budget = 4u * 1024u * 1024u; + } + + for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) { + const struct profile_desc *desc = &k_profiles[i]; + if (!(desc->enabled && cc >= desc->min_cc)) { + append_detail(report->details, + sizeof(report->details), + "%s=SKIPPED cc<%d\n", + desc->name, + desc->min_cc); + continue; + } + if (prepare_profile(&cublas, handle, cuda, desc, per_profile_budget, &prepared[i])) { + active++; + append_detail(report->details, + sizeof(report->details), + "%s=READY dim=%llux%llux%llu block=%s\n", + desc->name, + (unsigned long long)prepared[i].m, + (unsigned long long)prepared[i].n, + (unsigned long long)prepared[i].k, + desc->block_label); + } else { + append_detail(report->details, sizeof(report->details), "%s=SKIPPED unsupported\n", desc->name); + } + } + + if (active <= 0) { + cublas.cublasLtDestroy(handle); + cuda->cuCtxDestroy(ctx); + return 0; + } + + double deadline = now_seconds() + (double)seconds; + while (now_seconds() < deadline) { + for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) { + if (!prepared[i].ready) { + continue; + } + if (!run_cublas_profile(handle, &cublas, &prepared[i])) { + append_detail(report->details, + sizeof(report->details), + "%s=FAILED runtime\n", + prepared[i].desc.name); + for (size_t j = 0; j < sizeof(prepared) / sizeof(prepared[0]); j++) { + destroy_profile(&cublas, cuda, &prepared[j]); + } + cublas.cublasLtDestroy(handle); + cuda->cuCtxDestroy(ctx); + return 0; + } + prepared[i].iterations++; + report->iterations++; + if (now_seconds() >= deadline) { + break; + } + } + } + + if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) { + for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) { + destroy_profile(&cublas, cuda, &prepared[i]); + } + cublas.cublasLtDestroy(handle); + cuda->cuCtxDestroy(ctx); + return 0; + } + + for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) { + if (!prepared[i].ready) { + continue; + } + append_detail(report->details, + sizeof(report->details), + "%s_iterations=%lu\n", + prepared[i].desc.name, + prepared[i].iterations); + } + + for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) { + if (prepared[i].ready) { + if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) { + for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) { + report->checksum += sample[j]; + } + } + break; + } + } + + for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) { + destroy_profile(&cublas, cuda, &prepared[i]); + } + cublas.cublasLtDestroy(handle); + cuda->cuCtxDestroy(ctx); + return 1; +} +#endif + int main(int argc, char **argv) { int seconds = 5; int size_mb = 64; @@ -181,20 +1112,20 @@ int main(int argc, char **argv) { size_mb = 64; } - struct cuda_api api; - if (!load_cuda(&api)) { + struct cuda_api cuda; + if (!load_cuda(&cuda)) { fprintf(stderr, "failed to load libcuda.so.1 or required Driver API symbols\n"); return 1; } - load_symbol(api.lib, "cuGetErrorName", (void **)&api.cuGetErrorName); - load_symbol(api.lib, "cuGetErrorString", (void **)&api.cuGetErrorString); + load_symbol(cuda.lib, "cuGetErrorName", (void **)&cuda.cuGetErrorName); + load_symbol(cuda.lib, "cuGetErrorString", (void **)&cuda.cuGetErrorString); - if (!check_rc(&api, "cuInit", api.cuInit(0))) { + if (!check_rc(&cuda, "cuInit", cuda.cuInit(0))) { return 1; } int count = 0; - if (!check_rc(&api, "cuDeviceGetCount", api.cuDeviceGetCount(&count))) { + if (!check_rc(&cuda, "cuDeviceGetCount", cuda.cuDeviceGetCount(&count))) { return 1; } if (count <= 0) { @@ -203,112 +1134,43 @@ int main(int argc, char **argv) { } CUdevice dev = 0; - if (!check_rc(&api, "cuDeviceGet", api.cuDeviceGet(&dev, 0))) { + if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) { return 1; } + char name[128] = {0}; - if (!check_rc(&api, "cuDeviceGetName", api.cuDeviceGetName(name, (int)sizeof(name), dev))) { + if (!check_rc(&cuda, "cuDeviceGetName", cuda.cuDeviceGetName(name, (int)sizeof(name), dev))) { return 1; } - CUcontext ctx = NULL; - if (!check_rc(&api, "cuCtxCreate", api.cuCtxCreate(&ctx, 0, dev))) { + int cc_major = 0; + int cc_minor = 0; + if (!query_compute_capability(&cuda, dev, &cc_major, &cc_minor)) { return 1; } - size_t bytes = (size_t)size_mb * 1024 * 1024; - uint32_t words = (uint32_t)(bytes / sizeof(uint32_t)); - if (words < 1024) { - words = 1024; - bytes = (size_t)words * sizeof(uint32_t); - } + struct stress_report report; + int ok = 0; - uint32_t *host = (uint32_t *)malloc(bytes); - if (!host) { - fprintf(stderr, "malloc failed\n"); - api.cuCtxDestroy(ctx); - return 1; - } - for (uint32_t i = 0; i < words; i++) { - host[i] = i ^ 0x12345678u; - } - - CUdeviceptr device_mem = 0; - if (!check_rc(&api, "cuMemAlloc", api.cuMemAlloc(&device_mem, bytes))) { - free(host); - api.cuCtxDestroy(ctx); - return 1; - } - if (!check_rc(&api, "cuMemcpyHtoD", api.cuMemcpyHtoD(device_mem, host, bytes))) { - api.cuMemFree(device_mem); - free(host); - api.cuCtxDestroy(ctx); - return 1; - } - - CUmodule module = NULL; - if (!check_rc(&api, "cuModuleLoadDataEx", api.cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) { - api.cuMemFree(device_mem); - free(host); - api.cuCtxDestroy(ctx); - return 1; - } - - CUfunction kernel = NULL; - if (!check_rc(&api, "cuModuleGetFunction", api.cuModuleGetFunction(&kernel, module, "burn"))) { - api.cuMemFree(device_mem); - free(host); - api.cuCtxDestroy(ctx); - return 1; - } - - unsigned int threads = 256; - unsigned int blocks = (words + threads - 1) / threads; - uint32_t rounds = 256; - void *params[] = {&device_mem, &words, &rounds}; - - double start = now_seconds(); - double deadline = start + (double)seconds; - unsigned long iterations = 0; - while (now_seconds() < deadline) { - if (!check_rc(&api, "cuLaunchKernel", - api.cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) { - api.cuMemFree(device_mem); - free(host); - api.cuCtxDestroy(ctx); +#if HAVE_CUBLASLT_HEADERS + ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report); +#endif + if (!ok) { + if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) { return 1; } - iterations++; } - if (!check_rc(&api, "cuCtxSynchronize", api.cuCtxSynchronize())) { - api.cuMemFree(device_mem); - free(host); - api.cuCtxDestroy(ctx); - return 1; + printf("device=%s\n", report.device); + printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor); + printf("backend=%s\n", report.backend); + printf("duration_s=%d\n", seconds); + printf("buffer_mb=%d\n", report.buffer_mb); + printf("iterations=%lu\n", report.iterations); + printf("checksum=%llu\n", (unsigned long long)report.checksum); + if (report.details[0] != '\0') { + printf("%s", report.details); } - if (!check_rc(&api, "cuMemcpyDtoH", api.cuMemcpyDtoH(host, device_mem, bytes))) { - api.cuMemFree(device_mem); - free(host); - api.cuCtxDestroy(ctx); - return 1; - } - - uint64_t checksum = 0; - for (uint32_t i = 0; i < words; i += words / 256 ? words / 256 : 1) { - checksum += host[i]; - } - - double elapsed = now_seconds() - start; - printf("device=%s\n", name); - printf("duration_s=%.2f\n", elapsed); - printf("buffer_mb=%d\n", size_mb); - printf("iterations=%lu\n", iterations); - printf("checksum=%llu\n", (unsigned long long)checksum); printf("status=OK\n"); - - api.cuMemFree(device_mem); - free(host); - api.cuCtxDestroy(ctx); return 0; } diff --git a/iso/builder/build-cublas.sh b/iso/builder/build-cublas.sh new file mode 100644 index 0000000..dc9aab6 --- /dev/null +++ b/iso/builder/build-cublas.sh @@ -0,0 +1,170 @@ +#!/bin/sh +# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress. +# +# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64), +# verifies them against Packages.gz, and extracts the small subset we need: +# - headers for compiling bee-gpu-stress against cuBLASLt +# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO + +set -e + +CUBLAS_VERSION="$1" +CUDA_USERSPACE_VERSION="$2" +CUDA_SERIES="$3" +DIST_DIR="$4" + +[ -n "$CUBLAS_VERSION" ] || { echo "usage: $0 "; exit 1; } +[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 "; exit 1; } +[ -n "$CUDA_SERIES" ] || { echo "usage: $0 "; exit 1; } +[ -n "$DIST_DIR" ] || { echo "usage: $0 "; exit 1; } + +CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-') +REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64" +CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}" +CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}" +DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads" +PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz" + +echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ===" + +if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \ + && [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then + echo "=== cuBLAS cached, skipping download ===" + echo "cache: $CACHE_DIR" + exit 0 +fi + +mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib" + +echo "=== downloading Packages.gz ===" +wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz" + +lookup_pkg() { + pkg="$1" + ver="$2" + gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" ' + /^Package: / { cur_pkg=$2 } + /^Version: / { cur_ver=$2 } + /^Filename: / { cur_file=$2 } + /^SHA256: / { cur_sha=$2 } + /^$/ { + if (cur_pkg == pkg && cur_ver == ver) { + print cur_file " " cur_sha + exit + } + cur_pkg=""; cur_ver=""; cur_file=""; cur_sha="" + } + END { + if (cur_pkg == pkg && cur_ver == ver) { + print cur_file " " cur_sha + } + }' +} + +download_verified_pkg() { + pkg="$1" + ver="$2" + + meta="$(lookup_pkg "$pkg" "$ver")" + [ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}"; exit 1; } + + repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')" + repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')" + [ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}"; exit 1; } + [ -n "$repo_sha" ] || { echo "ERROR: package sha missing for ${pkg}"; exit 1; } + + out="${DOWNLOAD_CACHE_DIR}/$(basename "$repo_file")" + if [ -f "$out" ]; then + actual_sha="$(sha256sum "$out" | awk '{print $1}')" + if [ "$actual_sha" = "$repo_sha" ]; then + echo "=== using cached $(basename "$repo_file") ===" + printf '%s\n' "$out" + return 0 + fi + echo "=== removing stale $(basename "$repo_file") (sha256 mismatch) ===" + rm -f "$out" + fi + + echo "=== downloading $(basename "$repo_file") ===" + wget --show-progress -O "$out" "${REPO_BASE}/$(basename "$repo_file")" + + actual_sha="$(sha256sum "$out" | awk '{print $1}')" + if [ "$actual_sha" != "$repo_sha" ]; then + echo "ERROR: sha256 mismatch for $(basename "$repo_file")" + echo " expected: $repo_sha" + echo " actual: $actual_sha" + rm -f "$out" + exit 1 + fi + echo "sha256 OK: $(basename "$repo_file")" + printf '%s\n' "$out" +} + +extract_deb() { + deb="$1" + dst="$2" + mkdir -p "$dst" + ( + cd "$dst" + ar x "$deb" + data_tar=$(ls data.tar.* 2>/dev/null | head -1) + [ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; } + tar xf "$data_tar" + ) +} + +copy_headers() { + from="$1" + if [ -d "${from}/usr/include" ]; then + cp -a "${from}/usr/include/." "${CACHE_DIR}/include/" + fi +} + +copy_libs() { + from="$1" + find "$from" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \ + \( -type f -o -type l \) -exec cp -a {} "${CACHE_DIR}/lib/" \; +} + +make_links() { + base="$1" + versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1) + [ -n "$versioned" ] || return 0 + soname=$(printf '%s\n' "$versioned" | sed -E "s#.*/(${base}\.so\.[0-9]+).*#\\1#") + target=$(basename "$versioned") + ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true + ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true +} + +TMP_DIR=$(mktemp -d) +trap 'rm -rf "$TMP_DIR"' EXIT INT TERM + +CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}") +CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}") +CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}") +CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}") + +extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt" +extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev" +extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt" +extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev" + +copy_headers "${TMP_DIR}/cublas-dev" +copy_headers "${TMP_DIR}/cudart-dev" +copy_libs "${TMP_DIR}/cublas-rt" +copy_libs "${TMP_DIR}/cudart-rt" + +make_links "libcublas" +make_links "libcublasLt" +make_links "libcudart" + +[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted"; exit 1; } +[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; } +[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; } +[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; } +[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; } + +echo "=== cuBLAS extraction complete ===" +echo "cache: $CACHE_DIR" +echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)" +echo "libs: $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)" diff --git a/iso/builder/build-in-container.sh b/iso/builder/build-in-container.sh index 26940bc..2b2ef70 100755 --- a/iso/builder/build-in-container.sh +++ b/iso/builder/build-in-container.sh @@ -7,6 +7,7 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" BUILDER_DIR="${REPO_ROOT}/iso/builder" CONTAINER_TOOL="${CONTAINER_TOOL:-docker}" IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}" +BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}" CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}" AUTH_KEYS="" REBUILD_IMAGE=0 @@ -40,6 +41,13 @@ if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then exit 1 fi +PLATFORM_OS="${BUILDER_PLATFORM%/*}" +PLATFORM_ARCH="${BUILDER_PLATFORM#*/}" +if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then + echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2 + exit 1 +fi + if [ -n "$AUTH_KEYS" ]; then [ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; } AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")" @@ -56,17 +64,35 @@ mkdir -p \ IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}" -if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then +image_matches_platform() { + actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)" + [ "$actual_platform" = "${BUILDER_PLATFORM}" ] +} + +NEED_BUILD_IMAGE=0 +if [ "$REBUILD_IMAGE" = "1" ]; then + NEED_BUILD_IMAGE=1 +elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then + NEED_BUILD_IMAGE=1 +elif ! image_matches_platform; then + actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)" + echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ===" + NEED_BUILD_IMAGE=1 +fi + +if [ "$NEED_BUILD_IMAGE" = "1" ]; then "$CONTAINER_TOOL" build \ + --platform "${BUILDER_PLATFORM}" \ --build-arg GO_VERSION="${GO_VERSION}" \ -t "${IMAGE_REF}" \ "${BUILDER_DIR}" else - echo "=== using existing builder image ${IMAGE_REF} ===" + echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ===" fi set -- \ run --rm --privileged \ + --platform "${BUILDER_PLATFORM}" \ -v "${REPO_ROOT}:/work" \ -v "${CACHE_DIR}:/cache" \ -e BEE_CONTAINER_BUILD=1 \ @@ -80,6 +106,7 @@ set -- \ if [ -n "$AUTH_KEYS" ]; then set -- run --rm --privileged \ + --platform "${BUILDER_PLATFORM}" \ -v "${REPO_ROOT}:/work" \ -v "${CACHE_DIR}:/cache" \ -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \ diff --git a/iso/builder/build.sh b/iso/builder/build.sh index a5611fa..5ec4120 100755 --- a/iso/builder/build.sh +++ b/iso/builder/build.sh @@ -159,6 +159,16 @@ else echo "=== bee binary up to date, skipping build ===" fi +echo "" +echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ===" +sh "${BUILDER_DIR}/build-cublas.sh" \ + "${CUBLAS_VERSION}" \ + "${CUDA_USERSPACE_VERSION}" \ + "${NCCL_CUDA_VERSION}" \ + "${DIST_DIR}" + +CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}" + GPU_STRESS_NEED_BUILD=1 if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then GPU_STRESS_NEED_BUILD=0 @@ -167,6 +177,7 @@ fi if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then echo "=== building bee-gpu-stress ===" gcc -O2 -s -Wall -Wextra \ + -I"${CUBLAS_CACHE}/include" \ -o "$GPU_STRESS_BIN" \ "${BUILDER_DIR}/bee-gpu-stress.c" \ -ldl @@ -283,6 +294,10 @@ NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ===" +# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path +cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" +echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ===" + # --- embed build metadata --- mkdir -p "${OVERLAY_STAGE_DIR}/etc" BUILD_DATE="$(date +%Y-%m-%d)" @@ -297,6 +312,8 @@ DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI} NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION} NCCL_VERSION=${NCCL_VERSION} NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION} +CUBLAS_VERSION=${CUBLAS_VERSION} +CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION} EOF # Patch motd with build info diff --git a/iso/builder/config/package-lists/bee.list.chroot b/iso/builder/config/package-lists/bee.list.chroot index 1d9ee0e..654f796 100644 --- a/iso/builder/config/package-lists/bee.list.chroot +++ b/iso/builder/config/package-lists/bee.list.chroot @@ -20,6 +20,7 @@ openssh-server # Filesystem support for USB export targets exfatprogs +exfat-fuse ntfs-3g # Utilities