Compare commits

..

4 Commits

Author SHA1 Message Date
Mikhail Chusavitin
fc5c2019aa iso: improve burn-in, export, and live boot 2026-03-26 18:56:19 +03:00
Mikhail Chusavitin
67a215c66f fix(iso): route kernel logs to tty2, keep tty1 clean for TUI
console=tty0 sent kernel messages to the active VT (tty1), overwriting
the TUI. Changed to console=tty2 so kernel logs land on a dedicated
console. tty1 is now clean; operator can press Alt+F2 to inspect kernel
messages and Alt+F3 for an extra shell.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 17:40:44 +03:00
Mikhail Chusavitin
8b4bfdf5ad feat(tui): live GPU chart during stress test, full VRAM allocation
- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop.
  nvidia-smi is polled every second; up to 60 data points per GPU kept.
  All three metrics (Usage %, Temp °C, Power W) drawn on a single plot,
  each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM
  minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 17:37:20 +03:00
Mikhail Chusavitin
0a52a4f3ba fix(iso): restore loglevel=7 on VGA console for crash visibility
loglevel=3 was hiding all kernel messages on tty0/ttyS0 except errors.
Machine crashes (panics, driver oops, module failures) were silent on VGA.

Restored loglevel=7 so kernel messages up to debug are printed to both
tty0 (VGA) and ttyS0 (SOL). Journald MaxLevelConsole reduced to info
(was debug) to reduce noise on SOL while keeping it useful.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 11:19:07 +03:00
27 changed files with 1928 additions and 193 deletions

View File

@@ -231,8 +231,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) { func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportLatestAudit(target) path, err := a.ExportLatestAudit(target)
body := "Audit exported." body := "Audit export failed."
if path != "" { if err == nil {
body = "Audit exported."
}
if err == nil && path != "" {
body = "Audit exported to " + path body = "Audit exported to " + path
} }
return ActionResult{Title: "Export audit", Body: body}, err return ActionResult{Title: "Export audit", Body: body}, err
@@ -249,8 +252,11 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) { func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportSupportBundle(target) path, err := a.ExportSupportBundle(target)
body := "Support bundle exported. USB target unmounted and safe to remove." body := "Support bundle export failed."
if path != "" { if err == nil {
body = "Support bundle exported. USB target unmounted and safe to remove."
}
if err == nil && path != "" {
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove." body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
} }
return ActionResult{Title: "Export support bundle", Body: body}, err return ActionResult{Title: "Export support bundle", Body: body}, err

View File

@@ -470,6 +470,41 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
} }
} }
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
oldExportDir := DefaultExportDir
DefaultExportDir = tmp
t.Cleanup(func() { DefaultExportDir = oldExportDir })
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
t.Fatalf("write bee-audit.json: %v", err)
}
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
t.Fatalf("write bee-audit.log: %v", err)
}
a := &App{
exports: fakeExports{
exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
},
},
}
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
if err == nil {
t.Fatal("expected export error")
}
if contains(result.Body, "exported to") {
t.Fatalf("body should not claim success:\n%s", result.Body)
}
if result.Body != "Support bundle export failed." {
t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
}
}
func TestRunNvidiaAcceptancePackResult(t *testing.T) { func TestRunNvidiaAcceptancePackResult(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -11,8 +11,48 @@ import (
var exportExecCommand = exec.Command var exportExecCommand = exec.Command
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
msg := strings.TrimSpace(raw)
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
}
if msg == "" {
return err
}
return fmt.Errorf("%s: %w", msg, err)
}
func removableTargetReadOnly(fields map[string]string) bool {
if fields["RO"] == "1" {
return true
}
switch strings.ToLower(strings.TrimSpace(fields["FSTYPE"])) {
case "iso9660", "squashfs":
return true
default:
return false
}
}
func ensureWritableMountpoint(mountpoint string) error {
probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
if err != nil {
return fmt.Errorf("target filesystem is not writable: %w", err)
}
name := probe.Name()
if closeErr := probe.Close(); closeErr != nil {
_ = os.Remove(name)
return closeErr
}
if err := os.Remove(name); err != nil {
return err
}
return nil
}
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) { func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output() raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -36,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
} }
} }
} }
if !removable || fields["FSTYPE"] == "" { if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
continue continue
} }
@@ -72,7 +112,7 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
} }
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil { if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
_ = os.Remove(mountpoint) _ = os.Remove(mountpoint)
return string(raw), err return "", formatMountTargetError(target, string(raw), err)
} }
mountedHere = true mountedHere = true
mounted = true mounted = true
@@ -95,6 +135,10 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
} }
}() }()
if err := ensureWritableMountpoint(mountpoint); err != nil {
return "", err
}
filename := filepath.Base(src) filename := filepath.Base(src)
dst = filepath.Join(mountpoint, filename) dst = filepath.Join(mountpoint, filename)
data, err := os.ReadFile(src) data, err := os.ReadFile(src)

View File

@@ -4,12 +4,11 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"strings"
"testing" "testing"
) )
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) { func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Parallel()
tmp := t.TempDir() tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz") src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt") mountpoint := filepath.Join(tmp, "mnt")
@@ -54,3 +53,60 @@ func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls) t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
} }
} }
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
tmp := t.TempDir()
src := filepath.Join(tmp, "bundle.tar.gz")
mountpoint := filepath.Join(tmp, "mnt")
if err := os.MkdirAll(mountpoint, 0755); err != nil {
t.Fatalf("mkdir mountpoint: %v", err)
}
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
t.Fatalf("write src: %v", err)
}
if err := os.Chmod(mountpoint, 0555); err != nil {
t.Fatalf("chmod mountpoint: %v", err)
}
oldExec := exportExecCommand
exportExecCommand = func(name string, args ...string) *exec.Cmd {
return exec.Command("sh", "-c", "exit 0")
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
_, err := s.ExportFileToTarget(src, RemovableTarget{
Device: "/dev/sdb1",
Mountpoint: mountpoint,
})
if err == nil {
t.Fatal("expected error for non-writable mountpoint")
}
if !strings.Contains(err.Error(), "target filesystem is not writable") {
t.Fatalf("err=%q want writable message", err)
}
}
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
oldExec := exportExecCommand
lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
exportExecCommand = func(name string, args ...string) *exec.Cmd {
cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
return cmd
}
t.Cleanup(func() { exportExecCommand = oldExec })
s := &System{}
targets, err := s.ListRemovableTargets()
if err != nil {
t.Fatalf("ListRemovableTargets error: %v", err)
}
if len(targets) != 1 {
t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
}
if got := targets[0].Device; got != "/dev/sdb1" {
t.Fatalf("device=%q want /dev/sdb1", got)
}
}

View File

@@ -69,6 +69,11 @@ func parseGPUFloat(s string) float64 {
return v return v
} }
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
return sampleGPUMetrics(gpuIndices)
}
// WriteGPUMetricsCSV writes collected rows as a CSV file. // WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer var b bytes.Buffer
@@ -370,6 +375,162 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
return strings.TrimRight(b.String(), "\n") return strings.TrimRight(b.String(), "\n")
} }
// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU.
// Each series is normalised to its own minmax and drawn in a different colour.
// chartWidth controls the width of the plot area (Y-axis label uses 5 extra chars).
func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string {
if chartWidth < 20 {
chartWidth = 70
}
const chartHeight = 14
seen := make(map[int]bool)
var order []int
gpuMap := make(map[int][]GPUMetricRow)
for _, r := range rows {
if !seen[r.GPUIndex] {
seen[r.GPUIndex] = true
order = append(order, r.GPUIndex)
}
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
}
type seriesDef struct {
label string
color string
unit string
fn func(GPUMetricRow) float64
}
defs := []seriesDef{
{"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }},
{"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }},
{"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }},
}
var b strings.Builder
for _, gpuIdx := range order {
gr := gpuMap[gpuIdx]
if len(gr) == 0 {
continue
}
elapsed := gr[len(gr)-1].ElapsedSec
// Build value slices for each series.
type seriesData struct {
seriesDef
vals []float64
mn float64
mx float64
}
var series []seriesData
for _, d := range defs {
vals := extractGPUField(gr, d.fn)
mn, mx := gpuMinMax(vals)
if mn == mx {
mx = mn + 1
}
series = append(series, seriesData{d, vals, mn, mx})
}
// Shared character grid: row 0 = top (max), row chartHeight = bottom (min).
type cell struct {
ch rune
color string
}
grid := make([][]cell, chartHeight+1)
for r := range grid {
grid[r] = make([]cell, chartWidth)
for c := range grid[r] {
grid[r][c] = cell{' ', ""}
}
}
// Plot each series onto the shared grid.
for _, s := range series {
w := chartWidth
if len(s.vals) < w {
w = len(s.vals)
}
data := gpuDownsample(s.vals, w)
prevRow := -1
for x, v := range data {
row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight)))
if row < 0 {
row = 0
}
if row > chartHeight {
row = chartHeight
}
if prevRow < 0 || prevRow == row {
grid[row][x] = cell{'─', s.color}
} else {
lo, hi := prevRow, row
if lo > hi {
lo, hi = hi, lo
}
for y := lo + 1; y < hi; y++ {
grid[y][x] = cell{'│', s.color}
}
if prevRow < row {
grid[prevRow][x] = cell{'╮', s.color}
grid[row][x] = cell{'╰', s.color}
} else {
grid[prevRow][x] = cell{'╯', s.color}
grid[row][x] = cell{'╭', s.color}
}
}
prevRow = row
}
}
// Render: Y axis + data rows.
fmt.Fprintf(&b, "GPU %d (%.0fs) each series normalised to its range\n", gpuIdx, elapsed)
for r := 0; r <= chartHeight; r++ {
// Y axis label: 100% at top, 50% in middle, 0% at bottom.
switch r {
case 0:
fmt.Fprintf(&b, "%4s┤", "100%")
case chartHeight / 2:
fmt.Fprintf(&b, "%4s┤", "50%")
case chartHeight:
fmt.Fprintf(&b, "%4s┤", "0%")
default:
fmt.Fprintf(&b, "%4s│", "")
}
for c := 0; c < chartWidth; c++ {
cl := grid[r][c]
if cl.color != "" {
b.WriteString(cl.color)
b.WriteRune(cl.ch)
b.WriteString(ansiReset)
} else {
b.WriteRune(' ')
}
}
b.WriteRune('\n')
}
// Bottom axis.
b.WriteString(" └")
b.WriteString(strings.Repeat("─", chartWidth))
b.WriteRune('\n')
// Legend with current (last) values.
b.WriteString(" ")
for i, s := range series {
last := s.vals[len(s.vals)-1]
b.WriteString(s.color)
fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit)
b.WriteString(ansiReset)
if i < len(series)-1 {
b.WriteString(" ")
}
}
b.WriteRune('\n')
}
return strings.TrimRight(b.String(), "\n")
}
// renderLineChart draws a single time-series line chart using box-drawing characters. // renderLineChart draws a single time-series line chart using box-drawing characters.
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption. // Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
func renderLineChart(vals []float64, color, caption string, height, width int) string { func renderLineChart(vals []float64, color, caption string, height, width int) string {

View File

@@ -151,8 +151,10 @@ func (m model) confirmCancelTarget() screen {
switch m.pendingAction { switch m.pendingAction {
case actionExportBundle: case actionExportBundle:
return screenExportTargets return screenExportTargets
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress: case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
return screenHealthCheck return screenHealthCheck
case actionRunFanStress:
return screenBurnInTests
default: default:
return screenMain return screenMain
} }
@@ -182,12 +184,13 @@ func hcFanStressOpts(hcMode int, application interface {
} }
} }
// Use minimum GPU memory size to fit all GPUs. // Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
sizeMB := 64 sizeMB := 64
if gpus, err := application.ListNvidiaGPUs(); err == nil { if gpus, err := application.ListNvidiaGPUs(); err == nil {
for _, g := range gpus { for _, g := range gpus {
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) { free := g.MemoryMB - 512
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU if free > 0 && (sizeMB == 64 || free < sizeMB) {
sizeMB = free
} }
} }
} }

View File

@@ -50,3 +50,8 @@ type gpuStressDoneMsg struct {
body string body string
err error err error
} }
type gpuLiveTickMsg struct {
rows []platform.GPUMetricRow
indices []int
}

View File

@@ -0,0 +1,117 @@
package tui
import (
"fmt"
"strings"
tea "github.com/charmbracelet/bubbletea"
)
const (
burnCurGPUStress = 0
burnCurModeQuick = 1
burnCurModeStd = 2
burnCurModeExpr = 3
burnCurRun = 4
burnCurTotal = 5
)
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
m.screen = screenBurnInTests
m.cursor = 0
if !m.burnInitialized {
m.burnMode = 0
m.burnCursor = 0
m.burnInitialized = true
}
return m, nil
}
func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
switch msg.String() {
case "up", "k":
if m.burnCursor > 0 {
m.burnCursor--
}
case "down", "j":
if m.burnCursor < burnCurTotal-1 {
m.burnCursor++
}
case " ":
switch m.burnCursor {
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
m.burnMode = m.burnCursor - burnCurModeQuick
}
case "enter":
switch m.burnCursor {
case burnCurGPUStress, burnCurRun:
return m.burnRunSelected()
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
m.burnMode = m.burnCursor - burnCurModeQuick
}
case "f", "F", "r", "R":
return m.burnRunSelected()
case "1":
m.burnMode = 0
case "2":
m.burnMode = 1
case "3":
m.burnMode = 2
case "esc":
m.screen = screenMain
m.cursor = 1
case "q", "ctrl+c":
return m, tea.Quit
}
return m, nil
}
func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
return m.hcRunFanStress()
}
func renderBurnInTests(m model) string {
var b strings.Builder
fmt.Fprintln(&b, "BURN-IN TESTS")
fmt.Fprintln(&b)
fmt.Fprintln(&b, " Stress tests:")
fmt.Fprintln(&b)
pfx := " "
if m.burnCursor == burnCurGPUStress {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
fmt.Fprintln(&b)
fmt.Fprintln(&b, " Mode:")
modes := []struct{ label, key string }{
{"Quick", "1"},
{"Standard", "2"},
{"Express", "3"},
}
for i, mode := range modes {
pfx := " "
if m.burnCursor == burnCurModeQuick+i {
pfx = "> "
}
radio := "( )"
if m.burnMode == i {
radio = "(*)"
}
fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key)
}
fmt.Fprintln(&b)
pfx = " "
if m.burnCursor == burnCurRun {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
fmt.Fprintln(&b)
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
return b.String()
}

View File

@@ -4,7 +4,12 @@ import tea "github.com/charmbracelet/bubbletea"
func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) { func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
if len(m.targets) == 0 { if len(m.targets) == 0 {
return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain) return m, resultCmd(
"Export support bundle",
"No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list.",
nil,
screenMain,
)
} }
target := m.targets[m.cursor] target := m.targets[m.cursor]
m.selectedTarget = &target m.selectedTarget = &target

View File

@@ -3,8 +3,10 @@ package tui
import ( import (
"context" "context"
"fmt" "fmt"
"os/exec"
"strings" "strings"
"time"
"bee/audit/internal/platform"
tea "github.com/charmbracelet/bubbletea" tea "github.com/charmbracelet/bubbletea"
) )
@@ -28,8 +30,7 @@ const (
hcCurModeStd = 6 hcCurModeStd = 6
hcCurModeExpr = 7 hcCurModeExpr = 7
hcCurRunAll = 8 hcCurRunAll = 8
hcCurFanStress = 9 hcCurTotal = 9
hcCurTotal = 10
) )
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds. // hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
@@ -84,8 +85,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
m.hcMode = m.hcCursor - hcCurModeQuick m.hcMode = m.hcCursor - hcCurModeQuick
case hcCurRunAll: case hcCurRunAll:
return m.hcRunAll() return m.hcRunAll()
case hcCurFanStress:
return m.hcRunFanStress()
} }
case "g", "G": case "g", "G":
return m.hcRunSingle(hcGPU) return m.hcRunSingle(hcGPU)
@@ -97,8 +96,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m.hcRunSingle(hcCPU) return m.hcRunSingle(hcCPU)
case "r", "R": case "r", "R":
return m.hcRunAll() return m.hcRunAll()
case "f", "F":
return m.hcRunFanStress()
case "a", "A": case "a", "A":
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3] allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
for i := range m.hcSel { for i := range m.hcSel {
@@ -156,14 +153,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
return m, nil return m, nil
} }
// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently. // startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
// nvtop occupies the full terminal as a live chart; the stress test runs in background.
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) { func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
opts := hcFanStressOpts(m.hcMode, m.app) opts := hcFanStressOpts(m.burnMode, m.app)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
m.gpuStressCancel = cancel m.gpuStressCancel = cancel
m.gpuStressAborted = false m.gpuStressAborted = false
m.gpuLiveRows = nil
m.gpuLiveIndices = opts.GPUIndices
m.gpuLiveStart = time.Now()
m.screen = screenGPUStressRunning m.screen = screenGPUStressRunning
m.nvidiaSATCursor = 0 m.nvidiaSATCursor = 0
@@ -172,37 +171,29 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err} return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
} }
nvtopPath, lookErr := exec.LookPath("nvtop") return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
if lookErr != nil { }
return m, stressCmd
}
return m, tea.Batch( // pollGPULive samples nvidia-smi once after one second and returns a gpuLiveTickMsg.
stressCmd, // The update handler reschedules it to achieve continuous 1s polling.
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg { func pollGPULive(indices []int) tea.Cmd {
return nvtopClosedMsg{} return tea.Tick(time.Second, func(_ time.Time) tea.Msg {
}), rows, _ := platform.SampleGPUMetrics(indices)
) return gpuLiveTickMsg{rows: rows, indices: indices}
})
} }
// updateGPUStressRunning handles keys on the GPU stress running screen. // updateGPUStressRunning handles keys on the GPU stress running screen.
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) { func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
switch msg.String() { switch msg.String() {
case "o", "O":
nvtopPath, err := exec.LookPath("nvtop")
if err != nil {
return m, nil
}
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
return nvtopClosedMsg{}
})
case "a", "A": case "a", "A":
if m.gpuStressCancel != nil { if m.gpuStressCancel != nil {
m.gpuStressCancel() m.gpuStressCancel()
m.gpuStressCancel = nil m.gpuStressCancel = nil
} }
m.gpuStressAborted = true m.gpuStressAborted = true
m.screen = screenHealthCheck m.screen = screenBurnInTests
m.burnCursor = burnCurGPUStress
m.cursor = 0 m.cursor = 0
case "ctrl+c": case "ctrl+c":
return m, tea.Quit return m, tea.Quit
@@ -210,8 +201,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m, nil return m, nil
} }
func renderGPUStressRunning() string { func renderGPUStressRunning(m model) string {
return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n" var b strings.Builder
fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
fmt.Fprintln(&b)
if len(m.gpuLiveRows) == 0 {
fmt.Fprintln(&b, "Collecting metrics...")
} else {
chartWidth := m.width - 8
if chartWidth < 40 {
chartWidth = 70
}
b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
}
fmt.Fprintln(&b)
b.WriteString("[a] Abort test [ctrl+c] quit")
return b.String()
} }
func (m model) hcRunAll() (tea.Model, tea.Cmd) { func (m model) hcRunAll() (tea.Model, tea.Cmd) {
@@ -371,16 +376,8 @@ func renderHealthCheck(m model) string {
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx) fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
} }
{
pfx := " "
if m.hcCursor == hcCurFanStress {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
}
fmt.Fprintln(&b) fmt.Fprintln(&b)
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────") fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] gpu stress [Esc] back") fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
return b.String() return b.String()
} }

View File

@@ -8,7 +8,9 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
switch m.cursor { switch m.cursor {
case 0: // Health Check case 0: // Health Check
return m.enterHealthCheck() return m.enterHealthCheck()
case 1: // Export support bundle case 1: // Burn-in tests
return m.enterBurnInTests()
case 2: // Export support bundle
m.pendingAction = actionExportBundle m.pendingAction = actionExportBundle
m.busy = true m.busy = true
m.busyTitle = "Export support bundle" m.busyTitle = "Export support bundle"
@@ -16,11 +18,11 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
targets, err := m.app.ListRemovableTargets() targets, err := m.app.ListRemovableTargets()
return exportTargetsMsg{targets: targets, err: err} return exportTargetsMsg{targets: targets, err: err}
} }
case 2: // Settings case 3: // Settings
m.screen = screenSettings m.screen = screenSettings
m.cursor = 0 m.cursor = 0
return m, nil return m, nil
case 3: // Exit case 4: // Exit
return m, tea.Quit return m, tea.Quit
} }
return m, nil return m, nil

View File

@@ -54,9 +54,10 @@ func TestUpdateMainMenuEnterActions(t *testing.T) {
wantCmd bool wantCmd bool
}{ }{
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true}, {name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true},
{name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true}, {name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests, wantCmd: true},
{name: "settings", cursor: 2, wantScreen: screenSettings, wantCmd: true}, {name: "export", cursor: 2, wantScreen: screenMain, wantBusy: true, wantCmd: true},
{name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true}, {name: "settings", cursor: 3, wantScreen: screenSettings, wantCmd: true},
{name: "exit", cursor: 4, wantScreen: screenMain, wantCmd: true},
} }
for _, test := range tests { for _, test := range tests {
@@ -115,7 +116,8 @@ func TestMainMenuSimpleTransitions(t *testing.T) {
wantScreen screen wantScreen screen
}{ }{
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck}, {name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
{name: "settings", cursor: 2, wantScreen: screenSettings}, {name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests},
{name: "settings", cursor: 3, wantScreen: screenSettings},
} }
for _, test := range tests { for _, test := range tests {
@@ -146,7 +148,7 @@ func TestMainMenuExportSetsBusy(t *testing.T) {
t.Parallel() t.Parallel()
m := newTestModel() m := newTestModel()
m.cursor = 1 // Export support bundle m.cursor = 2 // Export support bundle
next, cmd := m.handleMainMenu() next, cmd := m.handleMainMenu()
got := next.(model) got := next.(model)
@@ -163,12 +165,13 @@ func TestMainViewRendersTwoColumns(t *testing.T) {
t.Parallel() t.Parallel()
m := newTestModel() m := newTestModel()
m.cursor = 1 m.cursor = 2
view := m.View() view := m.View()
for _, want := range []string{ for _, want := range []string{
"bee", "bee",
"Health Check", "Health Check",
"Burn-in tests",
"> Export support bundle", "> Export support bundle",
"Settings", "Settings",
"Exit", "Exit",
@@ -400,6 +403,11 @@ func TestConfirmCancelTarget(t *testing.T) {
t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck) t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck)
} }
m.pendingAction = actionRunFanStress
if got := m.confirmCancelTarget(); got != screenBurnInTests {
t.Fatalf("fan stress cancel target=%q want %q", got, screenBurnInTests)
}
m.pendingAction = actionNone m.pendingAction = actionNone
if got := m.confirmCancelTarget(); got != screenMain { if got := m.confirmCancelTarget(); got != screenMain {
t.Fatalf("default cancel target=%q want %q", got, screenMain) t.Fatalf("default cancel target=%q want %q", got, screenMain)
@@ -439,6 +447,68 @@ func TestViewBusyStateUsesBusyTitle(t *testing.T) {
} }
} }
func TestBurnInTestsEscReturnsToMain(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenBurnInTests
m.burnCursor = 3
next, _ := m.updateBurnInTests(tea.KeyMsg{Type: tea.KeyEsc})
got := next.(model)
if got.screen != screenMain {
t.Fatalf("screen=%q want %q", got.screen, screenMain)
}
if got.cursor != 1 {
t.Fatalf("cursor=%d want 1", got.cursor)
}
}
func TestBurnInTestsRunOpensConfirm(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenBurnInTests
m.burnInitialized = true
m.burnMode = 2
next, _ := m.burnRunSelected()
got := next.(model)
if got.screen != screenConfirm {
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
}
if got.pendingAction != actionRunFanStress {
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunFanStress)
}
if got.cursor != 0 {
t.Fatalf("cursor=%d want 0", got.cursor)
}
}
func TestViewBurnInTestsRendersGPUStressEntry(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenBurnInTests
view := m.View()
for _, want := range []string{
"BURN-IN TESTS",
"GPU PLATFORM STRESS TEST",
"Quick",
"Standard",
"Express",
"[ RUN SELECTED [R] ]",
} {
if !strings.Contains(view, want) {
t.Fatalf("view missing %q\nview:\n%s", want, view)
}
}
}
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) { func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
t.Parallel() t.Parallel()
@@ -528,7 +598,7 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
for _, want := range []string{ for _, want := range []string{
"Export support bundle", "Export support bundle",
"Select removable filesystem", "Select writable removable filesystem (read-only/boot media hidden)",
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee", "> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
} { } {
if !strings.Contains(view, want) { if !strings.Contains(view, want) {
@@ -537,6 +607,32 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
} }
} }
func TestExportTargetsMsgEmptyShowsHiddenBootMediaHint(t *testing.T) {
t.Parallel()
m := newTestModel()
m.busy = true
m.busyTitle = "Export support bundle"
next, _ := m.Update(exportTargetsMsg{})
got := next.(model)
if got.screen != screenOutput {
t.Fatalf("screen=%q want %q", got.screen, screenOutput)
}
if got.title != "Export support bundle" {
t.Fatalf("title=%q want %q", got.title, "Export support bundle")
}
for _, want := range []string{
"No writable removable filesystems found.",
"Read-only or boot media are hidden from this list.",
} {
if !strings.Contains(got.body, want) {
t.Fatalf("body missing %q\nbody:\n%s", want, got.body)
}
}
}
func TestViewStaticFormRendersFields(t *testing.T) { func TestViewStaticFormRendersFields(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -16,6 +16,7 @@ type screen string
const ( const (
screenMain screen = "main" screenMain screen = "main"
screenHealthCheck screen = "health_check" screenHealthCheck screen = "health_check"
screenBurnInTests screen = "burn_in_tests"
screenSettings screen = "settings" screenSettings screen = "settings"
screenNetwork screen = "network" screenNetwork screen = "network"
screenInterfacePick screen = "interface_pick" screenInterfacePick screen = "interface_pick"
@@ -84,6 +85,11 @@ type model struct {
hcCursor int hcCursor int
hcInitialized bool hcInitialized bool
// Burn-in tests screen
burnMode int
burnCursor int
burnInitialized bool
// NVIDIA SAT setup // NVIDIA SAT setup
nvidiaGPUs []platform.NvidiaGPU nvidiaGPUs []platform.NvidiaGPU
nvidiaGPUSel []bool nvidiaGPUSel []bool
@@ -97,6 +103,9 @@ type model struct {
// GPU Platform Stress Test running // GPU Platform Stress Test running
gpuStressCancel func() gpuStressCancel func()
gpuStressAborted bool gpuStressAborted bool
gpuLiveRows []platform.GPUMetricRow
gpuLiveIndices []int
gpuLiveStart time.Time
// SAT verbose progress (CPU / Memory / Storage / AMD GPU) // SAT verbose progress (CPU / Memory / Storage / AMD GPU)
progressLines []string progressLines []string
@@ -129,6 +138,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
screen: screenMain, screen: screenMain,
mainMenu: []string{ mainMenu: []string{
"Health Check", "Health Check",
"Burn-in tests",
"Export support bundle", "Export support bundle",
"Settings", "Settings",
"Exit", "Exit",
@@ -198,7 +208,7 @@ func (m model) confirmBody() (string, string) {
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"} modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" + return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
"Monitors fans, temps, power — detects throttling.\n" + "Monitors fans, temps, power — detects throttling.\n" +
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed." "Mode: " + modes[m.burnMode] + "\n\nAll NVIDIA GPUs will be stressed."
default: default:
return "Confirm", "Proceed?" return "Confirm", "Proceed?"
} }

View File

@@ -3,6 +3,7 @@ package tui
import ( import (
"fmt" "fmt"
"strings" "strings"
"time"
tea "github.com/charmbracelet/bubbletea" tea "github.com/charmbracelet/bubbletea"
) )
@@ -100,6 +101,13 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.screen = screenOutput m.screen = screenOutput
return m, m.refreshSnapshotCmd() return m, m.refreshSnapshotCmd()
} }
if len(msg.targets) == 0 {
m.title = "Export support bundle"
m.body = "No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list."
m.prevScreen = screenMain
m.screen = screenOutput
return m, m.refreshSnapshotCmd()
}
m.targets = msg.targets m.targets = msg.targets
m.screen = screenExportTargets m.screen = screenExportTargets
m.cursor = 0 m.cursor = 0
@@ -116,7 +124,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.gpuStressCancel() m.gpuStressCancel()
m.gpuStressCancel = nil m.gpuStressCancel = nil
} }
m.prevScreen = screenHealthCheck m.prevScreen = screenBurnInTests
m.screen = screenOutput m.screen = screenOutput
m.title = msg.title m.title = msg.title
if msg.err != nil { if msg.err != nil {
@@ -130,6 +138,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.body = msg.body m.body = msg.body
} }
return m, m.refreshSnapshotCmd() return m, m.refreshSnapshotCmd()
case gpuLiveTickMsg:
if m.screen == screenGPUStressRunning {
if len(msg.rows) > 0 {
elapsed := time.Since(m.gpuLiveStart).Seconds()
for i := range msg.rows {
msg.rows[i].ElapsedSec = elapsed
}
m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
n := max(1, len(msg.indices))
if len(m.gpuLiveRows) > 60*n {
m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
}
}
return m, pollGPULive(msg.indices)
}
return m, nil
case nvidiaSATDoneMsg: case nvidiaSATDoneMsg:
if m.nvidiaSATAborted { if m.nvidiaSATAborted {
return m, nil return m, nil
@@ -162,6 +186,8 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m.updateMain(msg) return m.updateMain(msg)
case screenHealthCheck: case screenHealthCheck:
return m.updateHealthCheck(msg) return m.updateHealthCheck(msg)
case screenBurnInTests:
return m.updateBurnInTests(msg)
case screenSettings: case screenSettings:
return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu) return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu)
case screenNetwork: case screenNetwork:

View File

@@ -57,6 +57,8 @@ func (m model) View() string {
body = renderTwoColumnMain(m) body = renderTwoColumnMain(m)
case screenHealthCheck: case screenHealthCheck:
body = renderHealthCheck(m) body = renderHealthCheck(m)
case screenBurnInTests:
body = renderBurnInTests(m)
case screenSettings: case screenSettings:
body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor) body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
case screenNetwork: case screenNetwork:
@@ -66,7 +68,12 @@ func (m model) View() string {
case screenServiceAction: case screenServiceAction:
body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor) body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
case screenExportTargets: case screenExportTargets:
body = renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor) body = renderMenu(
"Export support bundle",
"Select writable removable filesystem (read-only/boot media hidden)",
renderTargetItems(m.targets),
m.cursor,
)
case screenInterfacePick: case screenInterfacePick:
body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor) body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
case screenStaticForm: case screenStaticForm:
@@ -79,7 +86,7 @@ func (m model) View() string {
case screenNvidiaSATRunning: case screenNvidiaSATRunning:
body = renderNvidiaSATRunning() body = renderNvidiaSATRunning()
case screenGPUStressRunning: case screenGPUStressRunning:
body = renderGPUStressRunning() body = renderGPUStressRunning(m)
case screenOutput: case screenOutput:
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body)) body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
default: default:

View File

@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.
## Boot sequence (single ISO) ## Boot sequence (single ISO)
The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
`systemd` boot order: `systemd` boot order:
``` ```
@@ -25,6 +27,7 @@ local-fs.target
``` ```
**Critical invariants:** **Critical invariants:**
- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`. - OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking. - `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`. - `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
@@ -71,24 +74,39 @@ build-in-container.sh [--authorized-keys /path/to/keys]
d. build kernel modules against Debian headers d. build kernel modules against Debian headers
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
f. cache in `dist/nvidia-<version>-<kver>/` f. cache in `dist/nvidia-<version>-<kver>/`
7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/` 7. `build-cublas.sh`:
8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi` a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/` b. verify packages against repo `Packages.gz`
10. write staged `/etc/bee-release` (versions + git commit) c. extract headers for `bee-gpu-stress` build
11. patch staged `motd` with build metadata d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
12. copy `iso/builder/` into a temporary live-build workdir under `dist/` 8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
13. sync staged overlay into workdir `config/includes.chroot/` 9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
14. run `lb config && lb build` inside the privileged builder container 10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
12. write staged `/etc/bee-release` (versions + git commit)
13. patch staged `motd` with build metadata
14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
15. sync staged overlay into workdir `config/includes.chroot/`
16. run `lb config && lb build` inside the privileged builder container
``` ```
Build host notes:
- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
**Critical invariants:** **Critical invariants:**
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places: - `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build 1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
2. `auto/config``linux-image-${DEBIAN_KERNEL_ABI}` in the ISO 2. `auto/config``linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`. - NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay. - The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean. - The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly. - Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
## Post-boot smoke test ## Post-boot smoke test
@@ -131,10 +149,15 @@ Current validation state:
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal. Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
Acceptance flows: Acceptance flows:
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress` - `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
- `bee sat memory``memtester` archive - `bee sat memory``memtester` archive
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported - `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`) - SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
- Ada / Hopper: add `fp8`
- Blackwell+: add `fp4`
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
- Runtime overrides: - Runtime overrides:
- `BEE_GPU_STRESS_SECONDS` - `BEE_GPU_STRESS_SECONDS`
- `BEE_GPU_STRESS_SIZE_MB` - `BEE_GPU_STRESS_SIZE_MB`

View File

@@ -21,7 +21,8 @@ Fills gaps where Redfish/logpile is blind:
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID - Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
- Machine-readable health summary derived from collector verdicts - Machine-readable health summary derived from collector verdicts
- Operator-triggered acceptance tests for NVIDIA, memory, and storage - Operator-triggered acceptance tests for NVIDIA, memory, and storage
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress` - NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
- Automatic boot audit with operator-facing local console and SSH access - Automatic boot audit with operator-facing local console and SSH access
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi` - NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
- SSH access (OpenSSH) always available for inspection and debugging - SSH access (OpenSSH) always available for inspection and debugging
@@ -69,6 +70,7 @@ Fills gaps where Redfish/logpile is blind:
| SSH | OpenSSH server | | SSH | OpenSSH server |
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers | | NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` | | NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
| Builder | Debian 12 host/VM or Debian 12 container image | | Builder | Debian 12 host/VM or Debian 12 container image |
## Operator UX ## Operator UX
@@ -78,6 +80,7 @@ Fills gaps where Redfish/logpile is blind:
- The TUI itself executes privileged actions as `root` via `sudo -n` - The TUI itself executes privileged actions as `root` via `sudo -n`
- SSH remains available independently of the local console path - SSH remains available independently of the local console path
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging - VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
## Runtime split ## Runtime split
@@ -85,6 +88,7 @@ Fills gaps where Redfish/logpile is blind:
- Live-ISO-only responsibilities stay in `iso/` integration code - Live-ISO-only responsibilities stay in `iso/` integration code
- Live ISO launches the Go CLI with `--runtime livecd` - Live ISO launches the Go CLI with `--runtime livecd`
- Local/manual runs use `--runtime auto` or `--runtime local` - Local/manual runs use `--runtime auto` or `--runtime local`
- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup
## Key paths ## Key paths

58
iso/README.md Normal file
View File

@@ -0,0 +1,58 @@
# ISO Build
`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
## Requirements
- Docker Desktop or another Docker-compatible container runtime
- Privileged containers enabled
- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
## Build On macOS
From the repository root:
```sh
sh iso/builder/build-in-container.sh
```
The script defaults to `linux/amd64` builder containers, so it works on:
- Intel Mac
- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
You do not need to pass `--platform` manually for normal ISO builds.
## Useful Options
Build with explicit SSH keys baked into the ISO:
```sh
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
```
Rebuild the builder image:
```sh
sh iso/builder/build-in-container.sh --rebuild-image
```
Use a custom cache directory:
```sh
sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
```
## Notes
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
- Override the container platform only if you know why:
```sh
BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
```
- The shipped ISO is still `amd64`.
- Output ISO artifacts are written under `dist/`.

View File

@@ -4,5 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
NCCL_VERSION=2.28.9-1 NCCL_VERSION=2.28.9-1
NCCL_CUDA_VERSION=13.0 NCCL_CUDA_VERSION=13.0
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
CUBLAS_VERSION=13.0.2.14-1
CUDA_USERSPACE_VERSION=13.0.96-1
GO_VERSION=1.24.0 GO_VERSION=1.24.0
AUDIT_VERSION=1.0.0 AUDIT_VERSION=1.0.0

View File

@@ -32,6 +32,6 @@ lb config noauto \
--memtest none \ --memtest none \
--iso-volume "EASY-BEE" \ --iso-volume "EASY-BEE" \
--iso-application "EASY-BEE" \ --iso-application "EASY-BEE" \
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --bootappend-live "boot=live toram components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
--apt-recommends false \ --apt-recommends false \
"${@}" "${@}"

File diff suppressed because it is too large Load Diff

170
iso/builder/build-cublas.sh Normal file
View File

@@ -0,0 +1,170 @@
#!/bin/sh
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
#
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
# verifies them against Packages.gz, and extracts the small subset we need:
# - headers for compiling bee-gpu-stress against cuBLASLt
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
set -e
CUBLAS_VERSION="$1"
CUDA_USERSPACE_VERSION="$2"
CUDA_SERIES="$3"
DIST_DIR="$4"
[ -n "$CUBLAS_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
[ -n "$CUDA_SERIES" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="
if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
&& [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then
echo "=== cuBLAS cached, skipping download ==="
echo "cache: $CACHE_DIR"
exit 0
fi
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"
echo "=== downloading Packages.gz ==="
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
lookup_pkg() {
pkg="$1"
ver="$2"
gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
/^Package: / { cur_pkg=$2 }
/^Version: / { cur_ver=$2 }
/^Filename: / { cur_file=$2 }
/^SHA256: / { cur_sha=$2 }
/^$/ {
if (cur_pkg == pkg && cur_ver == ver) {
print cur_file " " cur_sha
exit
}
cur_pkg=""; cur_ver=""; cur_file=""; cur_sha=""
}
END {
if (cur_pkg == pkg && cur_ver == ver) {
print cur_file " " cur_sha
}
}'
}
download_verified_pkg() {
pkg="$1"
ver="$2"
meta="$(lookup_pkg "$pkg" "$ver")"
[ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}"; exit 1; }
repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
[ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}"; exit 1; }
[ -n "$repo_sha" ] || { echo "ERROR: package sha missing for ${pkg}"; exit 1; }
out="${DOWNLOAD_CACHE_DIR}/$(basename "$repo_file")"
if [ -f "$out" ]; then
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
if [ "$actual_sha" = "$repo_sha" ]; then
echo "=== using cached $(basename "$repo_file") ==="
printf '%s\n' "$out"
return 0
fi
echo "=== removing stale $(basename "$repo_file") (sha256 mismatch) ==="
rm -f "$out"
fi
echo "=== downloading $(basename "$repo_file") ==="
wget --show-progress -O "$out" "${REPO_BASE}/$(basename "$repo_file")"
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
if [ "$actual_sha" != "$repo_sha" ]; then
echo "ERROR: sha256 mismatch for $(basename "$repo_file")"
echo " expected: $repo_sha"
echo " actual: $actual_sha"
rm -f "$out"
exit 1
fi
echo "sha256 OK: $(basename "$repo_file")"
printf '%s\n' "$out"
}
extract_deb() {
deb="$1"
dst="$2"
mkdir -p "$dst"
(
cd "$dst"
ar x "$deb"
data_tar=$(ls data.tar.* 2>/dev/null | head -1)
[ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; }
tar xf "$data_tar"
)
}
copy_headers() {
from="$1"
if [ -d "${from}/usr/include" ]; then
cp -a "${from}/usr/include/." "${CACHE_DIR}/include/"
fi
}
copy_libs() {
from="$1"
find "$from" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
\( -type f -o -type l \) -exec cp -a {} "${CACHE_DIR}/lib/" \;
}
make_links() {
base="$1"
versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1)
[ -n "$versioned" ] || return 0
soname=$(printf '%s\n' "$versioned" | sed -E "s#.*/(${base}\.so\.[0-9]+).*#\\1#")
target=$(basename "$versioned")
ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true
}
TMP_DIR=$(mktemp -d)
trap 'rm -rf "$TMP_DIR"' EXIT INT TERM
CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"
copy_headers "${TMP_DIR}/cublas-dev"
copy_headers "${TMP_DIR}/cudart-dev"
copy_libs "${TMP_DIR}/cublas-rt"
copy_libs "${TMP_DIR}/cudart-rt"
make_links "libcublas"
make_links "libcublasLt"
make_links "libcudart"
[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted"; exit 1; }
[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; }
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; }
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; }
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; }
echo "=== cuBLAS extraction complete ==="
echo "cache: $CACHE_DIR"
echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
echo "libs: $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"

View File

@@ -7,6 +7,7 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
BUILDER_DIR="${REPO_ROOT}/iso/builder" BUILDER_DIR="${REPO_ROOT}/iso/builder"
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}" CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}" IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}" CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
AUTH_KEYS="" AUTH_KEYS=""
REBUILD_IMAGE=0 REBUILD_IMAGE=0
@@ -40,6 +41,13 @@ if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
exit 1 exit 1
fi fi
PLATFORM_OS="${BUILDER_PLATFORM%/*}"
PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
exit 1
fi
if [ -n "$AUTH_KEYS" ]; then if [ -n "$AUTH_KEYS" ]; then
[ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; } [ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")" AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
@@ -56,17 +64,35 @@ mkdir -p \
IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}" IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then image_matches_platform() {
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)"
[ "$actual_platform" = "${BUILDER_PLATFORM}" ]
}
NEED_BUILD_IMAGE=0
if [ "$REBUILD_IMAGE" = "1" ]; then
NEED_BUILD_IMAGE=1
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
NEED_BUILD_IMAGE=1
elif ! image_matches_platform; then
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
NEED_BUILD_IMAGE=1
fi
if [ "$NEED_BUILD_IMAGE" = "1" ]; then
"$CONTAINER_TOOL" build \ "$CONTAINER_TOOL" build \
--platform "${BUILDER_PLATFORM}" \
--build-arg GO_VERSION="${GO_VERSION}" \ --build-arg GO_VERSION="${GO_VERSION}" \
-t "${IMAGE_REF}" \ -t "${IMAGE_REF}" \
"${BUILDER_DIR}" "${BUILDER_DIR}"
else else
echo "=== using existing builder image ${IMAGE_REF} ===" echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
fi fi
set -- \ set -- \
run --rm --privileged \ run --rm --privileged \
--platform "${BUILDER_PLATFORM}" \
-v "${REPO_ROOT}:/work" \ -v "${REPO_ROOT}:/work" \
-v "${CACHE_DIR}:/cache" \ -v "${CACHE_DIR}:/cache" \
-e BEE_CONTAINER_BUILD=1 \ -e BEE_CONTAINER_BUILD=1 \
@@ -80,6 +106,7 @@ set -- \
if [ -n "$AUTH_KEYS" ]; then if [ -n "$AUTH_KEYS" ]; then
set -- run --rm --privileged \ set -- run --rm --privileged \
--platform "${BUILDER_PLATFORM}" \
-v "${REPO_ROOT}:/work" \ -v "${REPO_ROOT}:/work" \
-v "${CACHE_DIR}:/cache" \ -v "${CACHE_DIR}:/cache" \
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \ -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \

View File

@@ -159,6 +159,16 @@ else
echo "=== bee binary up to date, skipping build ===" echo "=== bee binary up to date, skipping build ==="
fi fi
echo ""
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
sh "${BUILDER_DIR}/build-cublas.sh" \
"${CUBLAS_VERSION}" \
"${CUDA_USERSPACE_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}"
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
GPU_STRESS_NEED_BUILD=1 GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
GPU_STRESS_NEED_BUILD=0 GPU_STRESS_NEED_BUILD=0
@@ -167,6 +177,7 @@ fi
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
echo "=== building bee-gpu-stress ===" echo "=== building bee-gpu-stress ==="
gcc -O2 -s -Wall -Wextra \ gcc -O2 -s -Wall -Wextra \
-I"${CUBLAS_CACHE}/include" \
-o "$GPU_STRESS_BIN" \ -o "$GPU_STRESS_BIN" \
"${BUILDER_DIR}/bee-gpu-stress.c" \ "${BUILDER_DIR}/bee-gpu-stress.c" \
-ldl -ldl
@@ -283,6 +294,10 @@ NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ===" echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# --- embed build metadata --- # --- embed build metadata ---
mkdir -p "${OVERLAY_STAGE_DIR}/etc" mkdir -p "${OVERLAY_STAGE_DIR}/etc"
BUILD_DATE="$(date +%Y-%m-%d)" BUILD_DATE="$(date +%Y-%m-%d)"
@@ -297,6 +312,8 @@ DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION} NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
NCCL_VERSION=${NCCL_VERSION} NCCL_VERSION=${NCCL_VERSION}
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION} NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
CUBLAS_VERSION=${CUBLAS_VERSION}
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
EOF EOF
# Patch motd with build info # Patch motd with build info

View File

@@ -20,6 +20,7 @@ openssh-server
# Filesystem support for USB export targets # Filesystem support for USB export targets
exfatprogs exfatprogs
exfat-fuse
ntfs-3g ntfs-3g
# Utilities # Utilities

View File

@@ -17,4 +17,5 @@ if [ -z "${SSH_CONNECTION:-}" ] \
&& [ "$(tty 2>/dev/null)" = "/dev/tty1" ]; then && [ "$(tty 2>/dev/null)" = "/dev/tty1" ]; then
echo "Bee live environment ready." echo "Bee live environment ready."
echo "Run 'menu' to open the TUI." echo "Run 'menu' to open the TUI."
echo "Kernel logs: Alt+F2 | Extra shell: Alt+F3"
fi fi

View File

@@ -1,4 +1,4 @@
[Journal] [Journal]
ForwardToConsole=yes ForwardToConsole=yes
TTYPath=/dev/ttyS0 TTYPath=/dev/ttyS0
MaxLevelConsole=debug MaxLevelConsole=info