Compare commits

..

4 Commits

Author SHA1 Message Date
Mikhail Chusavitin
fc5c2019aa iso: improve burn-in, export, and live boot 2026-03-26 18:56:19 +03:00
Mikhail Chusavitin
67a215c66f fix(iso): route kernel logs to tty2, keep tty1 clean for TUI
console=tty0 sent kernel messages to the active VT (tty1), overwriting
the TUI. Changed to console=tty2 so kernel logs land on a dedicated
console. tty1 is now clean; operator can press Alt+F2 to inspect kernel
messages and Alt+F3 for an extra shell.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 17:40:44 +03:00
Mikhail Chusavitin
8b4bfdf5ad feat(tui): live GPU chart during stress test, full VRAM allocation
- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop.
  nvidia-smi is polled every second; up to 60 data points per GPU kept.
  All three metrics (Usage %, Temp °C, Power W) drawn on a single plot,
  each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM
  minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 17:37:20 +03:00
Mikhail Chusavitin
0a52a4f3ba fix(iso): restore loglevel=7 on VGA console for crash visibility
loglevel=3 was hiding all kernel messages on tty0/ttyS0 except errors.
Machine crashes (panics, driver oops, module failures) were silent on VGA.

Restored loglevel=7 so kernel messages up to debug are printed to both
tty0 (VGA) and ttyS0 (SOL). Journald MaxLevelConsole reduced to info
(was debug) to reduce noise on SOL while keeping it useful.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 11:19:07 +03:00
27 changed files with 1928 additions and 193 deletions

View File

@@ -231,8 +231,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportLatestAudit(target)
body := "Audit exported."
if path != "" {
body := "Audit export failed."
if err == nil {
body = "Audit exported."
}
if err == nil && path != "" {
body = "Audit exported to " + path
}
return ActionResult{Title: "Export audit", Body: body}, err
@@ -249,8 +252,11 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportSupportBundle(target)
body := "Support bundle exported. USB target unmounted and safe to remove."
if path != "" {
body := "Support bundle export failed."
if err == nil {
body = "Support bundle exported. USB target unmounted and safe to remove."
}
if err == nil && path != "" {
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
}
return ActionResult{Title: "Export support bundle", Body: body}, err

View File

@@ -470,6 +470,41 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
}
}
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
oldExportDir := DefaultExportDir
DefaultExportDir = tmp
t.Cleanup(func() { DefaultExportDir = oldExportDir })
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
t.Fatalf("write bee-audit.json: %v", err)
}
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
t.Fatalf("write bee-audit.log: %v", err)
}
a := &App{
exports: fakeExports{
exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
},
},
}
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
if err == nil {
t.Fatal("expected export error")
}
if contains(result.Body, "exported to") {
t.Fatalf("body should not claim success:\n%s", result.Body)
}
if result.Body != "Support bundle export failed." {
t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
}
}
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
t.Parallel()

View File

@@ -11,8 +11,48 @@ import (
var exportExecCommand = exec.Command
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
msg := strings.TrimSpace(raw)
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
}
if msg == "" {
return err
}
return fmt.Errorf("%s: %w", msg, err)
}
func removableTargetReadOnly(fields map[string]string) bool {
if fields["RO"] == "1" {
return true
}
switch strings.ToLower(strings.TrimSpace(fields["FSTYPE"])) {
case "iso9660", "squashfs":
return true
default:
return false
}
}
func ensureWritableMountpoint(mountpoint string) error {
probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
if err != nil {
return fmt.Errorf("target filesystem is not writable: %w", err)
}
name := probe.Name()
if closeErr := probe.Close(); closeErr != nil {
_ = os.Remove(name)
return closeErr
}
if err := os.Remove(name); err != nil {
return err
}
return nil
}
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
if err != nil {
return nil, err
}
@@ -36,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
}
}
}
if !removable || fields["FSTYPE"] == "" {
if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
continue
}
@@ -72,7 +112,7 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
}
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
_ = os.Remove(mountpoint)
return string(raw), err
return "", formatMountTargetError(target, string(raw), err)
}
mountedHere = true
mounted = true
@@ -95,6 +135,10 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
}
}()
if err := ensureWritableMountpoint(mountpoint); err != nil {
return "", err
}
filename := filepath.Base(src)
dst = filepath.Join(mountpoint, filename)
data, err := os.ReadFile(src)

View File

@@ -4,12 +4,11 @@ import (
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
)
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt")
@@ -54,3 +53,60 @@ func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
}
}
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt")
if err := os.MkdirAll(mountpoint, 0755); err != nil {
t.Fatalf("mkdir mountpoint: %v", err)
}
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
t.Fatalf("write src: %v", err)
}
if err := os.Chmod(mountpoint, 0555); err != nil {
t.Fatalf("chmod mountpoint: %v", err)
}
oldExec := exportExecCommand
exportExecCommand = func(name string, args ...string) *exec.Cmd {
return exec.Command("sh", "-c", "exit 0")
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
_, err := s.ExportFileToTarget(src, RemovableTarget{
Device: "/dev/sdb1",
Mountpoint: mountpoint,
})
if err == nil {
t.Fatal("expected error for non-writable mountpoint")
}
if !strings.Contains(err.Error(), "target filesystem is not writable") {
t.Fatalf("err=%q want writable message", err)
}
}
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
oldExec := exportExecCommand
lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
exportExecCommand = func(name string, args ...string) *exec.Cmd {
cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
return cmd
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
targets, err := s.ListRemovableTargets()
if err != nil {
t.Fatalf("ListRemovableTargets error: %v", err)
}
if len(targets) != 1 {
t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
}
if got := targets[0].Device; got != "/dev/sdb1" {
t.Fatalf("device=%q want /dev/sdb1", got)
}
}

View File

@@ -69,6 +69,11 @@ func parseGPUFloat(s string) float64 {
return v
}
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
return sampleGPUMetrics(gpuIndices)
}
// WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer
@@ -370,6 +375,162 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
return strings.TrimRight(b.String(), "\n")
}
// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU.
// Each series is normalised to its own minmax and drawn in a different colour.
// chartWidth controls the width of the plot area (Y-axis label uses 5 extra chars).
func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string {
if chartWidth < 20 {
chartWidth = 70
}
const chartHeight = 14
seen := make(map[int]bool)
var order []int
gpuMap := make(map[int][]GPUMetricRow)
for _, r := range rows {
if !seen[r.GPUIndex] {
seen[r.GPUIndex] = true
order = append(order, r.GPUIndex)
}
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
}
type seriesDef struct {
label string
color string
unit string
fn func(GPUMetricRow) float64
}
defs := []seriesDef{
{"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }},
{"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }},
{"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }},
}
var b strings.Builder
for _, gpuIdx := range order {
gr := gpuMap[gpuIdx]
if len(gr) == 0 {
continue
}
elapsed := gr[len(gr)-1].ElapsedSec
// Build value slices for each series.
type seriesData struct {
seriesDef
vals []float64
mn float64
mx float64
}
var series []seriesData
for _, d := range defs {
vals := extractGPUField(gr, d.fn)
mn, mx := gpuMinMax(vals)
if mn == mx {
mx = mn + 1
}
series = append(series, seriesData{d, vals, mn, mx})
}
// Shared character grid: row 0 = top (max), row chartHeight = bottom (min).
type cell struct {
ch rune
color string
}
grid := make([][]cell, chartHeight+1)
for r := range grid {
grid[r] = make([]cell, chartWidth)
for c := range grid[r] {
grid[r][c] = cell{' ', ""}
}
}
// Plot each series onto the shared grid.
for _, s := range series {
w := chartWidth
if len(s.vals) < w {
w = len(s.vals)
}
data := gpuDownsample(s.vals, w)
prevRow := -1
for x, v := range data {
row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight)))
if row < 0 {
row = 0
}
if row > chartHeight {
row = chartHeight
}
if prevRow < 0 || prevRow == row {
grid[row][x] = cell{'─', s.color}
} else {
lo, hi := prevRow, row
if lo > hi {
lo, hi = hi, lo
}
for y := lo + 1; y < hi; y++ {
grid[y][x] = cell{'│', s.color}
}
if prevRow < row {
grid[prevRow][x] = cell{'╮', s.color}
grid[row][x] = cell{'╰', s.color}
} else {
grid[prevRow][x] = cell{'╯', s.color}
grid[row][x] = cell{'╭', s.color}
}
}
prevRow = row
}
}
// Render: Y axis + data rows.
fmt.Fprintf(&b, "GPU %d (%.0fs) each series normalised to its range\n", gpuIdx, elapsed)
for r := 0; r <= chartHeight; r++ {
// Y axis label: 100% at top, 50% in middle, 0% at bottom.
switch r {
case 0:
fmt.Fprintf(&b, "%4s┤", "100%")
case chartHeight / 2:
fmt.Fprintf(&b, "%4s┤", "50%")
case chartHeight:
fmt.Fprintf(&b, "%4s┤", "0%")
default:
fmt.Fprintf(&b, "%4s│", "")
}
for c := 0; c < chartWidth; c++ {
cl := grid[r][c]
if cl.color != "" {
b.WriteString(cl.color)
b.WriteRune(cl.ch)
b.WriteString(ansiReset)
} else {
b.WriteRune(' ')
}
}
b.WriteRune('\n')
}
// Bottom axis.
b.WriteString(" └")
b.WriteString(strings.Repeat("─", chartWidth))
b.WriteRune('\n')
// Legend with current (last) values.
b.WriteString(" ")
for i, s := range series {
last := s.vals[len(s.vals)-1]
b.WriteString(s.color)
fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit)
b.WriteString(ansiReset)
if i < len(series)-1 {
b.WriteString(" ")
}
}
b.WriteRune('\n')
}
return strings.TrimRight(b.String(), "\n")
}
// renderLineChart draws a single time-series line chart using box-drawing characters.
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
func renderLineChart(vals []float64, color, caption string, height, width int) string {

View File

@@ -151,8 +151,10 @@ func (m model) confirmCancelTarget() screen {
switch m.pendingAction {
case actionExportBundle:
return screenExportTargets
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
return screenHealthCheck
case actionRunFanStress:
return screenBurnInTests
default:
return screenMain
}
@@ -165,9 +167,9 @@ func hcFanStressOpts(hcMode int, application interface {
// Phase durations per mode: [baseline, load1, pause, load2]
type durations struct{ baseline, load1, pause, load2 int }
modes := [3]durations{
{30, 120, 30, 120}, // Quick: ~5 min total
{60, 300, 60, 300}, // Standard: ~12 min total
{60, 600, 120, 600}, // Express: ~24 min total
{30, 120, 30, 120}, // Quick: ~5 min total
{60, 300, 60, 300}, // Standard: ~12 min total
{60, 600, 120, 600}, // Express: ~24 min total
}
if hcMode < 0 || hcMode >= len(modes) {
hcMode = 0
@@ -182,12 +184,13 @@ func hcFanStressOpts(hcMode int, application interface {
}
}
// Use minimum GPU memory size to fit all GPUs.
// Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
sizeMB := 64
if gpus, err := application.ListNvidiaGPUs(); err == nil {
for _, g := range gpus {
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
free := g.MemoryMB - 512
if free > 0 && (sizeMB == 64 || free < sizeMB) {
sizeMB = free
}
}
}

View File

@@ -50,3 +50,8 @@ type gpuStressDoneMsg struct {
body string
err error
}
type gpuLiveTickMsg struct {
rows []platform.GPUMetricRow
indices []int
}

View File

@@ -0,0 +1,117 @@
package tui
import (
"fmt"
"strings"
tea "github.com/charmbracelet/bubbletea"
)
const (
burnCurGPUStress = 0
burnCurModeQuick = 1
burnCurModeStd = 2
burnCurModeExpr = 3
burnCurRun = 4
burnCurTotal = 5
)
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
m.screen = screenBurnInTests
m.cursor = 0
if !m.burnInitialized {
m.burnMode = 0
m.burnCursor = 0
m.burnInitialized = true
}
return m, nil
}
func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
switch msg.String() {
case "up", "k":
if m.burnCursor > 0 {
m.burnCursor--
}
case "down", "j":
if m.burnCursor < burnCurTotal-1 {
m.burnCursor++
}
case " ":
switch m.burnCursor {
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
m.burnMode = m.burnCursor - burnCurModeQuick
}
case "enter":
switch m.burnCursor {
case burnCurGPUStress, burnCurRun:
return m.burnRunSelected()
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
m.burnMode = m.burnCursor - burnCurModeQuick
}
case "f", "F", "r", "R":
return m.burnRunSelected()
case "1":
m.burnMode = 0
case "2":
m.burnMode = 1
case "3":
m.burnMode = 2
case "esc":
m.screen = screenMain
m.cursor = 1
case "q", "ctrl+c":
return m, tea.Quit
}
return m, nil
}
func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
return m.hcRunFanStress()
}
func renderBurnInTests(m model) string {
var b strings.Builder
fmt.Fprintln(&b, "BURN-IN TESTS")
fmt.Fprintln(&b)
fmt.Fprintln(&b, " Stress tests:")
fmt.Fprintln(&b)
pfx := " "
if m.burnCursor == burnCurGPUStress {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
fmt.Fprintln(&b)
fmt.Fprintln(&b, " Mode:")
modes := []struct{ label, key string }{
{"Quick", "1"},
{"Standard", "2"},
{"Express", "3"},
}
for i, mode := range modes {
pfx := " "
if m.burnCursor == burnCurModeQuick+i {
pfx = "> "
}
radio := "( )"
if m.burnMode == i {
radio = "(*)"
}
fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key)
}
fmt.Fprintln(&b)
pfx = " "
if m.burnCursor == burnCurRun {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
fmt.Fprintln(&b)
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
return b.String()
}

View File

@@ -4,7 +4,12 @@ import tea "github.com/charmbracelet/bubbletea"
func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
if len(m.targets) == 0 {
return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain)
return m, resultCmd(
"Export support bundle",
"No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list.",
nil,
screenMain,
)
}
target := m.targets[m.cursor]
m.selectedTarget = &target

View File

@@ -3,8 +3,10 @@ package tui
import (
"context"
"fmt"
"os/exec"
"strings"
"time"
"bee/audit/internal/platform"
tea "github.com/charmbracelet/bubbletea"
)
@@ -19,17 +21,16 @@ const (
// Cursor positions in Health Check screen.
const (
hcCurGPU = 0
hcCurMemory = 1
hcCurStorage = 2
hcCurCPU = 3
hcCurSelectAll = 4
hcCurModeQuick = 5
hcCurModeStd = 6
hcCurModeExpr = 7
hcCurRunAll = 8
hcCurFanStress = 9
hcCurTotal = 10
hcCurGPU = 0
hcCurMemory = 1
hcCurStorage = 2
hcCurCPU = 3
hcCurSelectAll = 4
hcCurModeQuick = 5
hcCurModeStd = 6
hcCurModeExpr = 7
hcCurRunAll = 8
hcCurTotal = 9
)
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
@@ -84,8 +85,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
m.hcMode = m.hcCursor - hcCurModeQuick
case hcCurRunAll:
return m.hcRunAll()
case hcCurFanStress:
return m.hcRunFanStress()
}
case "g", "G":
return m.hcRunSingle(hcGPU)
@@ -97,8 +96,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m.hcRunSingle(hcCPU)
case "r", "R":
return m.hcRunAll()
case "f", "F":
return m.hcRunFanStress()
case "a", "A":
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
for i := range m.hcSel {
@@ -156,14 +153,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
return m, nil
}
// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
// nvtop occupies the full terminal as a live chart; the stress test runs in background.
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
opts := hcFanStressOpts(m.hcMode, m.app)
opts := hcFanStressOpts(m.burnMode, m.app)
ctx, cancel := context.WithCancel(context.Background())
m.gpuStressCancel = cancel
m.gpuStressAborted = false
m.gpuLiveRows = nil
m.gpuLiveIndices = opts.GPUIndices
m.gpuLiveStart = time.Now()
m.screen = screenGPUStressRunning
m.nvidiaSATCursor = 0
@@ -172,37 +171,29 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
}
nvtopPath, lookErr := exec.LookPath("nvtop")
if lookErr != nil {
return m, stressCmd
}
return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
}
return m, tea.Batch(
stressCmd,
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
return nvtopClosedMsg{}
}),
)
// pollGPULive samples nvidia-smi once after one second and returns a gpuLiveTickMsg.
// The update handler reschedules it to achieve continuous 1s polling.
func pollGPULive(indices []int) tea.Cmd {
return tea.Tick(time.Second, func(_ time.Time) tea.Msg {
rows, _ := platform.SampleGPUMetrics(indices)
return gpuLiveTickMsg{rows: rows, indices: indices}
})
}
// updateGPUStressRunning handles keys on the GPU stress running screen.
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
switch msg.String() {
case "o", "O":
nvtopPath, err := exec.LookPath("nvtop")
if err != nil {
return m, nil
}
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
return nvtopClosedMsg{}
})
case "a", "A":
if m.gpuStressCancel != nil {
m.gpuStressCancel()
m.gpuStressCancel = nil
}
m.gpuStressAborted = true
m.screen = screenHealthCheck
m.screen = screenBurnInTests
m.burnCursor = burnCurGPUStress
m.cursor = 0
case "ctrl+c":
return m, tea.Quit
@@ -210,8 +201,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m, nil
}
func renderGPUStressRunning() string {
return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
func renderGPUStressRunning(m model) string {
var b strings.Builder
fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
fmt.Fprintln(&b)
if len(m.gpuLiveRows) == 0 {
fmt.Fprintln(&b, "Collecting metrics...")
} else {
chartWidth := m.width - 8
if chartWidth < 40 {
chartWidth = 70
}
b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
}
fmt.Fprintln(&b)
b.WriteString("[a] Abort test [ctrl+c] quit")
return b.String()
}
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
@@ -371,16 +376,8 @@ func renderHealthCheck(m model) string {
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
}
{
pfx := " "
if m.hcCursor == hcCurFanStress {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
}
fmt.Fprintln(&b)
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] gpu stress [Esc] back")
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
return b.String()
}

View File

@@ -8,7 +8,9 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
switch m.cursor {
case 0: // Health Check
return m.enterHealthCheck()
case 1: // Export support bundle
case 1: // Burn-in tests
return m.enterBurnInTests()
case 2: // Export support bundle
m.pendingAction = actionExportBundle
m.busy = true
m.busyTitle = "Export support bundle"
@@ -16,11 +18,11 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
targets, err := m.app.ListRemovableTargets()
return exportTargetsMsg{targets: targets, err: err}
}
case 2: // Settings
case 3: // Settings
m.screen = screenSettings
m.cursor = 0
return m, nil
case 3: // Exit
case 4: // Exit
return m, tea.Quit
}
return m, nil

View File

@@ -54,9 +54,10 @@ func TestUpdateMainMenuEnterActions(t *testing.T) {
wantCmd bool
}{
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true},
{name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true},
{name: "settings", cursor: 2, wantScreen: screenSettings, wantCmd: true},
{name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true},
{name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests, wantCmd: true},
{name: "export", cursor: 2, wantScreen: screenMain, wantBusy: true, wantCmd: true},
{name: "settings", cursor: 3, wantScreen: screenSettings, wantCmd: true},
{name: "exit", cursor: 4, wantScreen: screenMain, wantCmd: true},
}
for _, test := range tests {
@@ -115,7 +116,8 @@ func TestMainMenuSimpleTransitions(t *testing.T) {
wantScreen screen
}{
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
{name: "settings", cursor: 2, wantScreen: screenSettings},
{name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests},
{name: "settings", cursor: 3, wantScreen: screenSettings},
}
for _, test := range tests {
@@ -146,7 +148,7 @@ func TestMainMenuExportSetsBusy(t *testing.T) {
t.Parallel()
m := newTestModel()
m.cursor = 1 // Export support bundle
m.cursor = 2 // Export support bundle
next, cmd := m.handleMainMenu()
got := next.(model)
@@ -163,12 +165,13 @@ func TestMainViewRendersTwoColumns(t *testing.T) {
t.Parallel()
m := newTestModel()
m.cursor = 1
m.cursor = 2
view := m.View()
for _, want := range []string{
"bee",
"Health Check",
"Burn-in tests",
"> Export support bundle",
"Settings",
"Exit",
@@ -400,6 +403,11 @@ func TestConfirmCancelTarget(t *testing.T) {
t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck)
}
m.pendingAction = actionRunFanStress
if got := m.confirmCancelTarget(); got != screenBurnInTests {
t.Fatalf("fan stress cancel target=%q want %q", got, screenBurnInTests)
}
m.pendingAction = actionNone
if got := m.confirmCancelTarget(); got != screenMain {
t.Fatalf("default cancel target=%q want %q", got, screenMain)
@@ -439,6 +447,68 @@ func TestViewBusyStateUsesBusyTitle(t *testing.T) {
}
}
func TestBurnInTestsEscReturnsToMain(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenBurnInTests
m.burnCursor = 3
next, _ := m.updateBurnInTests(tea.KeyMsg{Type: tea.KeyEsc})
got := next.(model)
if got.screen != screenMain {
t.Fatalf("screen=%q want %q", got.screen, screenMain)
}
if got.cursor != 1 {
t.Fatalf("cursor=%d want 1", got.cursor)
}
}
func TestBurnInTestsRunOpensConfirm(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenBurnInTests
m.burnInitialized = true
m.burnMode = 2
next, _ := m.burnRunSelected()
got := next.(model)
if got.screen != screenConfirm {
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
}
if got.pendingAction != actionRunFanStress {
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunFanStress)
}
if got.cursor != 0 {
t.Fatalf("cursor=%d want 0", got.cursor)
}
}
func TestViewBurnInTestsRendersGPUStressEntry(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenBurnInTests
view := m.View()
for _, want := range []string{
"BURN-IN TESTS",
"GPU PLATFORM STRESS TEST",
"Quick",
"Standard",
"Express",
"[ RUN SELECTED [R] ]",
} {
if !strings.Contains(view, want) {
t.Fatalf("view missing %q\nview:\n%s", want, view)
}
}
}
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
t.Parallel()
@@ -528,7 +598,7 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
for _, want := range []string{
"Export support bundle",
"Select removable filesystem",
"Select writable removable filesystem (read-only/boot media hidden)",
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
} {
if !strings.Contains(view, want) {
@@ -537,6 +607,32 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
}
}
func TestExportTargetsMsgEmptyShowsHiddenBootMediaHint(t *testing.T) {
t.Parallel()
m := newTestModel()
m.busy = true
m.busyTitle = "Export support bundle"
next, _ := m.Update(exportTargetsMsg{})
got := next.(model)
if got.screen != screenOutput {
t.Fatalf("screen=%q want %q", got.screen, screenOutput)
}
if got.title != "Export support bundle" {
t.Fatalf("title=%q want %q", got.title, "Export support bundle")
}
for _, want := range []string{
"No writable removable filesystems found.",
"Read-only or boot media are hidden from this list.",
} {
if !strings.Contains(got.body, want) {
t.Fatalf("body missing %q\nbody:\n%s", want, got.body)
}
}
}
func TestViewStaticFormRendersFields(t *testing.T) {
t.Parallel()

View File

@@ -16,6 +16,7 @@ type screen string
const (
screenMain screen = "main"
screenHealthCheck screen = "health_check"
screenBurnInTests screen = "burn_in_tests"
screenSettings screen = "settings"
screenNetwork screen = "network"
screenInterfacePick screen = "interface_pick"
@@ -41,8 +42,8 @@ const (
actionRunMemorySAT actionKind = "run_memory_sat"
actionRunStorageSAT actionKind = "run_storage_sat"
actionRunCPUSAT actionKind = "run_cpu_sat"
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
actionRunFanStress actionKind = "run_fan_stress"
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
actionRunFanStress actionKind = "run_fan_stress"
)
type model struct {
@@ -84,6 +85,11 @@ type model struct {
hcCursor int
hcInitialized bool
// Burn-in tests screen
burnMode int
burnCursor int
burnInitialized bool
// NVIDIA SAT setup
nvidiaGPUs []platform.NvidiaGPU
nvidiaGPUSel []bool
@@ -97,6 +103,9 @@ type model struct {
// GPU Platform Stress Test running
gpuStressCancel func()
gpuStressAborted bool
gpuLiveRows []platform.GPUMetricRow
gpuLiveIndices []int
gpuLiveStart time.Time
// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
progressLines []string
@@ -129,6 +138,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
screen: screenMain,
mainMenu: []string{
"Health Check",
"Burn-in tests",
"Export support bundle",
"Settings",
"Exit",
@@ -198,7 +208,7 @@ func (m model) confirmBody() (string, string) {
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
"Monitors fans, temps, power — detects throttling.\n" +
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
"Mode: " + modes[m.burnMode] + "\n\nAll NVIDIA GPUs will be stressed."
default:
return "Confirm", "Proceed?"
}

View File

@@ -3,6 +3,7 @@ package tui
import (
"fmt"
"strings"
"time"
tea "github.com/charmbracelet/bubbletea"
)
@@ -100,6 +101,13 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.screen = screenOutput
return m, m.refreshSnapshotCmd()
}
if len(msg.targets) == 0 {
m.title = "Export support bundle"
m.body = "No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list."
m.prevScreen = screenMain
m.screen = screenOutput
return m, m.refreshSnapshotCmd()
}
m.targets = msg.targets
m.screen = screenExportTargets
m.cursor = 0
@@ -116,7 +124,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.gpuStressCancel()
m.gpuStressCancel = nil
}
m.prevScreen = screenHealthCheck
m.prevScreen = screenBurnInTests
m.screen = screenOutput
m.title = msg.title
if msg.err != nil {
@@ -130,6 +138,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.body = msg.body
}
return m, m.refreshSnapshotCmd()
case gpuLiveTickMsg:
if m.screen == screenGPUStressRunning {
if len(msg.rows) > 0 {
elapsed := time.Since(m.gpuLiveStart).Seconds()
for i := range msg.rows {
msg.rows[i].ElapsedSec = elapsed
}
m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
n := max(1, len(msg.indices))
if len(m.gpuLiveRows) > 60*n {
m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
}
}
return m, pollGPULive(msg.indices)
}
return m, nil
case nvidiaSATDoneMsg:
if m.nvidiaSATAborted {
return m, nil
@@ -162,6 +186,8 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m.updateMain(msg)
case screenHealthCheck:
return m.updateHealthCheck(msg)
case screenBurnInTests:
return m.updateBurnInTests(msg)
case screenSettings:
return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu)
case screenNetwork:

View File

@@ -57,6 +57,8 @@ func (m model) View() string {
body = renderTwoColumnMain(m)
case screenHealthCheck:
body = renderHealthCheck(m)
case screenBurnInTests:
body = renderBurnInTests(m)
case screenSettings:
body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
case screenNetwork:
@@ -66,7 +68,12 @@ func (m model) View() string {
case screenServiceAction:
body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
case screenExportTargets:
body = renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
body = renderMenu(
"Export support bundle",
"Select writable removable filesystem (read-only/boot media hidden)",
renderTargetItems(m.targets),
m.cursor,
)
case screenInterfacePick:
body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
case screenStaticForm:
@@ -79,7 +86,7 @@ func (m model) View() string {
case screenNvidiaSATRunning:
body = renderNvidiaSATRunning()
case screenGPUStressRunning:
body = renderGPUStressRunning()
body = renderGPUStressRunning(m)
case screenOutput:
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
default:

View File

@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.
## Boot sequence (single ISO)
The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
`systemd` boot order:
```
@@ -25,6 +27,7 @@ local-fs.target
```
**Critical invariants:**
- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
@@ -71,24 +74,39 @@ build-in-container.sh [--authorized-keys /path/to/keys]
d. build kernel modules against Debian headers
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
f. cache in `dist/nvidia-<version>-<kver>/`
7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/`
10. write staged `/etc/bee-release` (versions + git commit)
11. patch staged `motd` with build metadata
12. copy `iso/builder/` into a temporary live-build workdir under `dist/`
13. sync staged overlay into workdir `config/includes.chroot/`
14. run `lb config && lb build` inside the privileged builder container
7. `build-cublas.sh`:
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
b. verify packages against repo `Packages.gz`
c. extract headers for `bee-gpu-stress` build
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
12. write staged `/etc/bee-release` (versions + git commit)
13. patch staged `motd` with build metadata
14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
15. sync staged overlay into workdir `config/includes.chroot/`
16. run `lb config && lb build` inside the privileged builder container
```
Build host notes:
- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
**Critical invariants:**
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
2. `auto/config``linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
## Post-boot smoke test
@@ -131,10 +149,15 @@ Current validation state:
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
Acceptance flows:
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
- `bee sat memory``memtester` archive
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
- Ada / Hopper: add `fp8`
- Blackwell+: add `fp4`
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
- Runtime overrides:
- `BEE_GPU_STRESS_SECONDS`
- `BEE_GPU_STRESS_SIZE_MB`

View File

@@ -21,7 +21,8 @@ Fills gaps where Redfish/logpile is blind:
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
- Machine-readable health summary derived from collector verdicts
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress`
- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
- Automatic boot audit with operator-facing local console and SSH access
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
- SSH access (OpenSSH) always available for inspection and debugging
@@ -69,6 +70,7 @@ Fills gaps where Redfish/logpile is blind:
| SSH | OpenSSH server |
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
| Builder | Debian 12 host/VM or Debian 12 container image |
## Operator UX
@@ -78,6 +80,7 @@ Fills gaps where Redfish/logpile is blind:
- The TUI itself executes privileged actions as `root` via `sudo -n`
- SSH remains available independently of the local console path
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
## Runtime split
@@ -85,6 +88,7 @@ Fills gaps where Redfish/logpile is blind:
- Live-ISO-only responsibilities stay in `iso/` integration code
- Live ISO launches the Go CLI with `--runtime livecd`
- Local/manual runs use `--runtime auto` or `--runtime local`
- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup
## Key paths

58
iso/README.md Normal file
View File

@@ -0,0 +1,58 @@
# ISO Build
`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
## Requirements
- Docker Desktop or another Docker-compatible container runtime
- Privileged containers enabled
- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
## Build On macOS
From the repository root:
```sh
sh iso/builder/build-in-container.sh
```
The script defaults to `linux/amd64` builder containers, so it works on:
- Intel Mac
- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
You do not need to pass `--platform` manually for normal ISO builds.
## Useful Options
Build with explicit SSH keys baked into the ISO:
```sh
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
```
Rebuild the builder image:
```sh
sh iso/builder/build-in-container.sh --rebuild-image
```
Use a custom cache directory:
```sh
sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
```
## Notes
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
- Override the container platform only if you know why:
```sh
BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
```
- The shipped ISO is still `amd64`.
- Output ISO artifacts are written under `dist/`.

View File

@@ -4,5 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
NCCL_VERSION=2.28.9-1
NCCL_CUDA_VERSION=13.0
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
CUBLAS_VERSION=13.0.2.14-1
CUDA_USERSPACE_VERSION=13.0.96-1
GO_VERSION=1.24.0
AUDIT_VERSION=1.0.0

View File

@@ -32,6 +32,6 @@ lb config noauto \
--memtest none \
--iso-volume "EASY-BEE" \
--iso-application "EASY-BEE" \
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
--bootappend-live "boot=live toram components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
--apt-recommends false \
"${@}"

File diff suppressed because it is too large Load Diff

170
iso/builder/build-cublas.sh Normal file
View File

@@ -0,0 +1,170 @@
#!/bin/sh
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
#
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
# verifies them against Packages.gz, and extracts the small subset we need:
# - headers for compiling bee-gpu-stress against cuBLASLt
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
set -e
CUBLAS_VERSION="$1"
CUDA_USERSPACE_VERSION="$2"
CUDA_SERIES="$3"
DIST_DIR="$4"
[ -n "$CUBLAS_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
[ -n "$CUDA_SERIES" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="
if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
&& [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then
echo "=== cuBLAS cached, skipping download ==="
echo "cache: $CACHE_DIR"
exit 0
fi
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"
echo "=== downloading Packages.gz ==="
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
lookup_pkg() {
pkg="$1"
ver="$2"
gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
/^Package: / { cur_pkg=$2 }
/^Version: / { cur_ver=$2 }
/^Filename: / { cur_file=$2 }
/^SHA256: / { cur_sha=$2 }
/^$/ {
if (cur_pkg == pkg && cur_ver == ver) {
print cur_file " " cur_sha
exit
}
cur_pkg=""; cur_ver=""; cur_file=""; cur_sha=""
}
END {
if (cur_pkg == pkg && cur_ver == ver) {
print cur_file " " cur_sha
}
}'
}
download_verified_pkg() {
pkg="$1"
ver="$2"
meta="$(lookup_pkg "$pkg" "$ver")"
[ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}"; exit 1; }
repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
[ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}"; exit 1; }
[ -n "$repo_sha" ] || { echo "ERROR: package sha missing for ${pkg}"; exit 1; }
out="${DOWNLOAD_CACHE_DIR}/$(basename "$repo_file")"
if [ -f "$out" ]; then
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
if [ "$actual_sha" = "$repo_sha" ]; then
echo "=== using cached $(basename "$repo_file") ==="
printf '%s\n' "$out"
return 0
fi
echo "=== removing stale $(basename "$repo_file") (sha256 mismatch) ==="
rm -f "$out"
fi
echo "=== downloading $(basename "$repo_file") ==="
wget --show-progress -O "$out" "${REPO_BASE}/$(basename "$repo_file")"
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
if [ "$actual_sha" != "$repo_sha" ]; then
echo "ERROR: sha256 mismatch for $(basename "$repo_file")"
echo " expected: $repo_sha"
echo " actual: $actual_sha"
rm -f "$out"
exit 1
fi
echo "sha256 OK: $(basename "$repo_file")"
printf '%s\n' "$out"
}
extract_deb() {
deb="$1"
dst="$2"
mkdir -p "$dst"
(
cd "$dst"
ar x "$deb"
data_tar=$(ls data.tar.* 2>/dev/null | head -1)
[ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; }
tar xf "$data_tar"
)
}
copy_headers() {
from="$1"
if [ -d "${from}/usr/include" ]; then
cp -a "${from}/usr/include/." "${CACHE_DIR}/include/"
fi
}
copy_libs() {
from="$1"
find "$from" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
\( -type f -o -type l \) -exec cp -a {} "${CACHE_DIR}/lib/" \;
}
make_links() {
base="$1"
versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1)
[ -n "$versioned" ] || return 0
soname=$(printf '%s\n' "$versioned" | sed -E "s#.*/(${base}\.so\.[0-9]+).*#\\1#")
target=$(basename "$versioned")
ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true
}
TMP_DIR=$(mktemp -d)
trap 'rm -rf "$TMP_DIR"' EXIT INT TERM
CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"
copy_headers "${TMP_DIR}/cublas-dev"
copy_headers "${TMP_DIR}/cudart-dev"
copy_libs "${TMP_DIR}/cublas-rt"
copy_libs "${TMP_DIR}/cudart-rt"
make_links "libcublas"
make_links "libcublasLt"
make_links "libcudart"
[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted"; exit 1; }
[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; }
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; }
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; }
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; }
echo "=== cuBLAS extraction complete ==="
echo "cache: $CACHE_DIR"
echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
echo "libs: $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"

View File

@@ -7,6 +7,7 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
BUILDER_DIR="${REPO_ROOT}/iso/builder"
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
AUTH_KEYS=""
REBUILD_IMAGE=0
@@ -40,6 +41,13 @@ if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
exit 1
fi
PLATFORM_OS="${BUILDER_PLATFORM%/*}"
PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
exit 1
fi
if [ -n "$AUTH_KEYS" ]; then
[ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
@@ -56,17 +64,35 @@ mkdir -p \
IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
image_matches_platform() {
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)"
[ "$actual_platform" = "${BUILDER_PLATFORM}" ]
}
NEED_BUILD_IMAGE=0
if [ "$REBUILD_IMAGE" = "1" ]; then
NEED_BUILD_IMAGE=1
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
NEED_BUILD_IMAGE=1
elif ! image_matches_platform; then
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
NEED_BUILD_IMAGE=1
fi
if [ "$NEED_BUILD_IMAGE" = "1" ]; then
"$CONTAINER_TOOL" build \
--platform "${BUILDER_PLATFORM}" \
--build-arg GO_VERSION="${GO_VERSION}" \
-t "${IMAGE_REF}" \
"${BUILDER_DIR}"
else
echo "=== using existing builder image ${IMAGE_REF} ==="
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
fi
set -- \
run --rm --privileged \
--platform "${BUILDER_PLATFORM}" \
-v "${REPO_ROOT}:/work" \
-v "${CACHE_DIR}:/cache" \
-e BEE_CONTAINER_BUILD=1 \
@@ -80,6 +106,7 @@ set -- \
if [ -n "$AUTH_KEYS" ]; then
set -- run --rm --privileged \
--platform "${BUILDER_PLATFORM}" \
-v "${REPO_ROOT}:/work" \
-v "${CACHE_DIR}:/cache" \
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \

View File

@@ -159,6 +159,16 @@ else
echo "=== bee binary up to date, skipping build ==="
fi
echo ""
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
sh "${BUILDER_DIR}/build-cublas.sh" \
"${CUBLAS_VERSION}" \
"${CUDA_USERSPACE_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}"
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
GPU_STRESS_NEED_BUILD=0
@@ -167,6 +177,7 @@ fi
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
echo "=== building bee-gpu-stress ==="
gcc -O2 -s -Wall -Wextra \
-I"${CUBLAS_CACHE}/include" \
-o "$GPU_STRESS_BIN" \
"${BUILDER_DIR}/bee-gpu-stress.c" \
-ldl
@@ -283,6 +294,10 @@ NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# --- embed build metadata ---
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
BUILD_DATE="$(date +%Y-%m-%d)"
@@ -297,6 +312,8 @@ DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
NCCL_VERSION=${NCCL_VERSION}
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
CUBLAS_VERSION=${CUBLAS_VERSION}
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
EOF
# Patch motd with build info

View File

@@ -20,6 +20,7 @@ openssh-server
# Filesystem support for USB export targets
exfatprogs
exfat-fuse
ntfs-3g
# Utilities

View File

@@ -17,4 +17,5 @@ if [ -z "${SSH_CONNECTION:-}" ] \
&& [ "$(tty 2>/dev/null)" = "/dev/tty1" ]; then
echo "Bee live environment ready."
echo "Run 'menu' to open the TUI."
echo "Kernel logs: Alt+F2 | Extra shell: Alt+F3"
fi

View File

@@ -1,4 +1,4 @@
[Journal]
ForwardToConsole=yes
TTYPath=/dev/ttyS0
MaxLevelConsole=debug
MaxLevelConsole=info