Compare commits
4 Commits
iso/v1.0.1
...
iso/v1.0.2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fc5c2019aa | ||
|
|
67a215c66f | ||
|
|
8b4bfdf5ad | ||
|
|
0a52a4f3ba |
@@ -231,8 +231,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
|
||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportLatestAudit(target)
|
||||
body := "Audit exported."
|
||||
if path != "" {
|
||||
body := "Audit export failed."
|
||||
if err == nil {
|
||||
body = "Audit exported."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Audit exported to " + path
|
||||
}
|
||||
return ActionResult{Title: "Export audit", Body: body}, err
|
||||
@@ -249,8 +252,11 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle exported. USB target unmounted and safe to remove."
|
||||
if path != "" {
|
||||
body := "Support bundle export failed."
|
||||
if err == nil {
|
||||
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
|
||||
@@ -470,6 +470,41 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestExportSupportBundleResultDoesNotPretendSuccessOnError verifies that a
// failing export keeps the failure body and never claims a destination path.
// NOTE(review): this test runs Parallel yet swaps the package-level
// DefaultExportDir — safe only if every test touching that global does the
// same dance; confirm against the rest of the suite.
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
	t.Parallel()

	tmp := t.TempDir()
	oldExportDir := DefaultExportDir
	DefaultExportDir = tmp
	t.Cleanup(func() { DefaultExportDir = oldExportDir })

	// Seed the audit artifacts the bundle builder expects to find.
	if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
		t.Fatalf("write bee-audit.json: %v", err)
	}
	if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
		t.Fatalf("write bee-audit.log: %v", err)
	}

	// Force the export step itself to fail with a realistic mount error.
	a := &App{
		exports: fakeExports{
			exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
				return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
			},
		},
	}

	result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
	if err == nil {
		t.Fatal("expected export error")
	}
	if contains(result.Body, "exported to") {
		t.Fatalf("body should not claim success:\n%s", result.Body)
	}
	if result.Body != "Support bundle export failed." {
		t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
	}
}
|
||||
|
||||
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -11,8 +11,48 @@ import (
|
||||
|
||||
var exportExecCommand = exec.Command
|
||||
|
||||
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
|
||||
msg := strings.TrimSpace(raw)
|
||||
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||
}
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
|
||||
// removableTargetReadOnly reports whether an lsblk record describes media
// that cannot be written to: either the RO flag is set, or the filesystem is
// an inherently read-only type (optical / squashfs boot media).
func removableTargetReadOnly(fields map[string]string) bool {
	if fields["RO"] == "1" {
		return true
	}
	fstype := strings.ToLower(strings.TrimSpace(fields["FSTYPE"]))
	return fstype == "iso9660" || fstype == "squashfs"
}
|
||||
|
||||
// ensureWritableMountpoint verifies the mounted filesystem accepts writes by
// creating and removing a small probe file inside mountpoint. It returns nil
// when the full create/close/remove round-trip succeeds.
//
// Fix: the close and remove failure paths previously returned the bare error
// with no context, unlike every other error in this file; wrap them with %w
// so the operator can tell which step of the probe failed.
func ensureWritableMountpoint(mountpoint string) error {
	probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
	if err != nil {
		// Covers read-only mounts, permission errors and missing mountpoints.
		return fmt.Errorf("target filesystem is not writable: %w", err)
	}
	name := probe.Name()
	if closeErr := probe.Close(); closeErr != nil {
		// Best-effort cleanup; the close failure is the primary error.
		_ = os.Remove(name)
		return fmt.Errorf("close write probe %s: %w", name, closeErr)
	}
	if err := os.Remove(name); err != nil {
		return fmt.Errorf("remove write probe %s: %w", name, err)
	}
	return nil
}
|
||||
|
||||
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -36,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if !removable || fields["FSTYPE"] == "" {
|
||||
if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -72,7 +112,7 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
|
||||
}
|
||||
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
_ = os.Remove(mountpoint)
|
||||
return string(raw), err
|
||||
return "", formatMountTargetError(target, string(raw), err)
|
||||
}
|
||||
mountedHere = true
|
||||
mounted = true
|
||||
@@ -95,6 +135,10 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
|
||||
}
|
||||
}()
|
||||
|
||||
if err := ensureWritableMountpoint(mountpoint); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
filename := filepath.Base(src)
|
||||
dst = filepath.Join(mountpoint, filename)
|
||||
data, err := os.ReadFile(src)
|
||||
|
||||
@@ -4,12 +4,11 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
@@ -54,3 +53,60 @@ func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
||||
}
|
||||
}
|
||||
|
||||
// TestExportFileToTargetRejectsNonWritableMountpoint ensures the write-probe
// check surfaces a "not writable" error instead of silently proceeding.
// Not parallel: it swaps the package-level exportExecCommand hook.
// NOTE(review): chmod 0555 does not restrict root, so the probe may succeed
// when the suite runs as root (e.g. in some CI containers) — confirm.
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
	tmp := t.TempDir()
	src := filepath.Join(tmp, "bundle.tar.gz")
	mountpoint := filepath.Join(tmp, "mnt")
	if err := os.MkdirAll(mountpoint, 0755); err != nil {
		t.Fatalf("mkdir mountpoint: %v", err)
	}
	if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
		t.Fatalf("write src: %v", err)
	}
	// Make the mountpoint read-only so the write probe fails.
	if err := os.Chmod(mountpoint, 0555); err != nil {
		t.Fatalf("chmod mountpoint: %v", err)
	}

	// Stub out mount/umount invocations; they are irrelevant here.
	oldExec := exportExecCommand
	exportExecCommand = func(name string, args ...string) *exec.Cmd {
		return exec.Command("sh", "-c", "exit 0")
	}
	t.Cleanup(func() { exportExecCommand = oldExec })

	s := &System{}
	_, err := s.ExportFileToTarget(src, RemovableTarget{
		Device:     "/dev/sdb1",
		Mountpoint: mountpoint,
	})
	if err == nil {
		t.Fatal("expected error for non-writable mountpoint")
	}
	if !strings.Contains(err.Error(), "target filesystem is not writable") {
		t.Fatalf("err=%q want writable message", err)
	}
}
|
||||
|
||||
// TestListRemovableTargetsSkipsReadOnlyMedia checks that lsblk records with
// RO=1 / read-only filesystem types (the live boot medium) are filtered out
// and only the writable USB partition survives.
// Not parallel: it swaps the package-level exportExecCommand hook.
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
	oldExec := exportExecCommand
	// Canned lsblk -P output: one read-only boot medium, one writable USB stick.
	lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
	exportExecCommand = func(name string, args ...string) *exec.Cmd {
		// Echo the canned output instead of running the real lsblk.
		cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
		cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
		return cmd
	}
	t.Cleanup(func() { exportExecCommand = oldExec })

	s := &System{}
	targets, err := s.ListRemovableTargets()
	if err != nil {
		t.Fatalf("ListRemovableTargets error: %v", err)
	}
	if len(targets) != 1 {
		t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
	}
	if got := targets[0].Device; got != "/dev/sdb1" {
		t.Fatalf("device=%q want /dev/sdb1", got)
	}
}
|
||||
|
||||
@@ -69,6 +69,11 @@ func parseGPUFloat(s string) float64 {
|
||||
return v
|
||||
}
|
||||
|
||||
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
return sampleGPUMetrics(gpuIndices)
|
||||
}
|
||||
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
@@ -370,6 +375,162 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
|
||||
// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU.
// Each series is normalised to its own min–max and drawn in a different colour.
// chartWidth controls the width of the plot area (Y-axis label uses 5 extra chars).
func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string {
	// Guard against degenerate widths from tiny terminals.
	if chartWidth < 20 {
		chartWidth = 70
	}
	const chartHeight = 14

	// Group rows per GPU, preserving first-seen order for stable output.
	seen := make(map[int]bool)
	var order []int
	gpuMap := make(map[int][]GPUMetricRow)
	for _, r := range rows {
		if !seen[r.GPUIndex] {
			seen[r.GPUIndex] = true
			order = append(order, r.GPUIndex)
		}
		gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
	}

	// The three plotted series: label, colour, unit, and field extractor.
	type seriesDef struct {
		label string
		color string
		unit  string
		fn    func(GPUMetricRow) float64
	}
	defs := []seriesDef{
		{"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }},
		{"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }},
		{"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }},
	}

	var b strings.Builder
	for _, gpuIdx := range order {
		gr := gpuMap[gpuIdx]
		if len(gr) == 0 {
			continue
		}
		elapsed := gr[len(gr)-1].ElapsedSec

		// Build value slices for each series.
		type seriesData struct {
			seriesDef
			vals []float64
			mn   float64
			mx   float64
		}
		var series []seriesData
		for _, d := range defs {
			vals := extractGPUField(gr, d.fn)
			mn, mx := gpuMinMax(vals)
			// Avoid division by zero below when the series is flat.
			if mn == mx {
				mx = mn + 1
			}
			series = append(series, seriesData{d, vals, mn, mx})
		}

		// Shared character grid: row 0 = top (max), row chartHeight = bottom (min).
		type cell struct {
			ch    rune
			color string
		}
		grid := make([][]cell, chartHeight+1)
		for r := range grid {
			grid[r] = make([]cell, chartWidth)
			for c := range grid[r] {
				grid[r][c] = cell{' ', ""}
			}
		}

		// Plot each series onto the shared grid. Later series overwrite
		// earlier ones where they collide.
		for _, s := range series {
			w := chartWidth
			if len(s.vals) < w {
				w = len(s.vals)
			}
			data := gpuDownsample(s.vals, w)
			prevRow := -1
			for x, v := range data {
				// Map the value into [0, chartHeight] within its own range.
				row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight)))
				if row < 0 {
					row = 0
				}
				if row > chartHeight {
					row = chartHeight
				}
				if prevRow < 0 || prevRow == row {
					grid[row][x] = cell{'─', s.color}
				} else {
					// Draw a vertical connector plus corner glyphs between
					// consecutive samples on different rows.
					lo, hi := prevRow, row
					if lo > hi {
						lo, hi = hi, lo
					}
					for y := lo + 1; y < hi; y++ {
						grid[y][x] = cell{'│', s.color}
					}
					if prevRow < row {
						grid[prevRow][x] = cell{'╮', s.color}
						grid[row][x] = cell{'╰', s.color}
					} else {
						grid[prevRow][x] = cell{'╯', s.color}
						grid[row][x] = cell{'╭', s.color}
					}
				}
				prevRow = row
			}
		}

		// Render: Y axis + data rows.
		fmt.Fprintf(&b, "GPU %d (%.0fs) each series normalised to its range\n", gpuIdx, elapsed)
		for r := 0; r <= chartHeight; r++ {
			// Y axis label: 100% at top, 50% in middle, 0% at bottom.
			switch r {
			case 0:
				fmt.Fprintf(&b, "%4s┤", "100%")
			case chartHeight / 2:
				fmt.Fprintf(&b, "%4s┤", "50%")
			case chartHeight:
				fmt.Fprintf(&b, "%4s┤", "0%")
			default:
				fmt.Fprintf(&b, "%4s│", "")
			}
			for c := 0; c < chartWidth; c++ {
				cl := grid[r][c]
				if cl.color != "" {
					b.WriteString(cl.color)
					b.WriteRune(cl.ch)
					b.WriteString(ansiReset)
				} else {
					b.WriteRune(' ')
				}
			}
			b.WriteRune('\n')
		}
		// Bottom axis.
		// NOTE(review): leading spaces below align with the 4-char Y label —
		// confirm exact widths against the original file.
		b.WriteString("    └")
		b.WriteString(strings.Repeat("─", chartWidth))
		b.WriteRune('\n')

		// Legend with current (last) values.
		b.WriteString("     ")
		for i, s := range series {
			last := s.vals[len(s.vals)-1]
			b.WriteString(s.color)
			fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit)
			b.WriteString(ansiReset)
			if i < len(series)-1 {
				b.WriteString("  ")
			}
		}
		b.WriteRune('\n')
	}

	return strings.TrimRight(b.String(), "\n")
}
|
||||
|
||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
||||
|
||||
@@ -151,8 +151,10 @@ func (m model) confirmCancelTarget() screen {
|
||||
switch m.pendingAction {
|
||||
case actionExportBundle:
|
||||
return screenExportTargets
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
||||
return screenHealthCheck
|
||||
case actionRunFanStress:
|
||||
return screenBurnInTests
|
||||
default:
|
||||
return screenMain
|
||||
}
|
||||
@@ -165,9 +167,9 @@ func hcFanStressOpts(hcMode int, application interface {
|
||||
// Phase durations per mode: [baseline, load1, pause, load2]
|
||||
type durations struct{ baseline, load1, pause, load2 int }
|
||||
modes := [3]durations{
|
||||
{30, 120, 30, 120}, // Quick: ~5 min total
|
||||
{60, 300, 60, 300}, // Standard: ~12 min total
|
||||
{60, 600, 120, 600}, // Express: ~24 min total
|
||||
{30, 120, 30, 120}, // Quick: ~5 min total
|
||||
{60, 300, 60, 300}, // Standard: ~12 min total
|
||||
{60, 600, 120, 600}, // Express: ~24 min total
|
||||
}
|
||||
if hcMode < 0 || hcMode >= len(modes) {
|
||||
hcMode = 0
|
||||
@@ -182,12 +184,13 @@ func hcFanStressOpts(hcMode int, application interface {
|
||||
}
|
||||
}
|
||||
|
||||
// Use minimum GPU memory size to fit all GPUs.
|
||||
// Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
|
||||
sizeMB := 64
|
||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||
for _, g := range gpus {
|
||||
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
|
||||
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
|
||||
free := g.MemoryMB - 512
|
||||
if free > 0 && (sizeMB == 64 || free < sizeMB) {
|
||||
sizeMB = free
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,3 +50,8 @@ type gpuStressDoneMsg struct {
|
||||
body string
|
||||
err error
|
||||
}
|
||||
|
||||
type gpuLiveTickMsg struct {
|
||||
rows []platform.GPUMetricRow
|
||||
indices []int
|
||||
}
|
||||
|
||||
117
audit/internal/tui/screen_burn_in.go
Normal file
117
audit/internal/tui/screen_burn_in.go
Normal file
@@ -0,0 +1,117 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
// Cursor positions in the burn-in tests screen, top to bottom.
const (
	burnCurGPUStress = 0 // GPU platform stress test entry
	burnCurModeQuick = 1 // mode radio: Quick
	burnCurModeStd   = 2 // mode radio: Standard
	burnCurModeExpr  = 3 // mode radio: Express
	burnCurRun       = 4 // "run selected" button
	burnCurTotal     = 5 // number of cursor positions (cursor bound)
)
|
||||
|
||||
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
|
||||
m.screen = screenBurnInTests
|
||||
m.cursor = 0
|
||||
if !m.burnInitialized {
|
||||
m.burnMode = 0
|
||||
m.burnCursor = 0
|
||||
m.burnInitialized = true
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "up", "k":
|
||||
if m.burnCursor > 0 {
|
||||
m.burnCursor--
|
||||
}
|
||||
case "down", "j":
|
||||
if m.burnCursor < burnCurTotal-1 {
|
||||
m.burnCursor++
|
||||
}
|
||||
case " ":
|
||||
switch m.burnCursor {
|
||||
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
||||
m.burnMode = m.burnCursor - burnCurModeQuick
|
||||
}
|
||||
case "enter":
|
||||
switch m.burnCursor {
|
||||
case burnCurGPUStress, burnCurRun:
|
||||
return m.burnRunSelected()
|
||||
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
||||
m.burnMode = m.burnCursor - burnCurModeQuick
|
||||
}
|
||||
case "f", "F", "r", "R":
|
||||
return m.burnRunSelected()
|
||||
case "1":
|
||||
m.burnMode = 0
|
||||
case "2":
|
||||
m.burnMode = 1
|
||||
case "3":
|
||||
m.burnMode = 2
|
||||
case "esc":
|
||||
m.screen = screenMain
|
||||
m.cursor = 1
|
||||
case "q", "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// burnRunSelected launches the currently selected burn-in test. Only the GPU
// platform stress test exists today, so it delegates to hcRunFanStress.
func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
	return m.hcRunFanStress()
}
|
||||
|
||||
// renderBurnInTests draws the burn-in tests screen: the GPU stress entry,
// the Quick/Standard/Express mode radio group, the run button, and key hints.
// The "> " prefix marks the row under the cursor; "(*)" marks the active mode.
func renderBurnInTests(m model) string {
	var b strings.Builder

	fmt.Fprintln(&b, "BURN-IN TESTS")
	fmt.Fprintln(&b)
	fmt.Fprintln(&b, " Stress tests:")
	fmt.Fprintln(&b)

	pfx := "  "
	if m.burnCursor == burnCurGPUStress {
		pfx = "> "
	}
	fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)

	fmt.Fprintln(&b)
	fmt.Fprintln(&b, " Mode:")
	modes := []struct{ label, key string }{
		{"Quick", "1"},
		{"Standard", "2"},
		{"Express", "3"},
	}
	for i, mode := range modes {
		// Shadows the outer pfx on purpose: per-row cursor marker.
		pfx := "  "
		if m.burnCursor == burnCurModeQuick+i {
			pfx = "> "
		}
		radio := "( )"
		if m.burnMode == i {
			radio = "(*)"
		}
		fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key)
	}

	fmt.Fprintln(&b)
	pfx = "  "
	if m.burnCursor == burnCurRun {
		pfx = "> "
	}
	fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)

	fmt.Fprintln(&b)
	fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
	fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
	return b.String()
}
|
||||
@@ -4,7 +4,12 @@ import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.targets) == 0 {
|
||||
return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain)
|
||||
return m, resultCmd(
|
||||
"Export support bundle",
|
||||
"No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list.",
|
||||
nil,
|
||||
screenMain,
|
||||
)
|
||||
}
|
||||
target := m.targets[m.cursor]
|
||||
m.selectedTarget = &target
|
||||
|
||||
@@ -3,8 +3,10 @@ package tui
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
@@ -19,17 +21,16 @@ const (
|
||||
|
||||
// Cursor positions in Health Check screen.
|
||||
const (
|
||||
hcCurGPU = 0
|
||||
hcCurMemory = 1
|
||||
hcCurStorage = 2
|
||||
hcCurCPU = 3
|
||||
hcCurSelectAll = 4
|
||||
hcCurModeQuick = 5
|
||||
hcCurModeStd = 6
|
||||
hcCurModeExpr = 7
|
||||
hcCurRunAll = 8
|
||||
hcCurFanStress = 9
|
||||
hcCurTotal = 10
|
||||
hcCurGPU = 0
|
||||
hcCurMemory = 1
|
||||
hcCurStorage = 2
|
||||
hcCurCPU = 3
|
||||
hcCurSelectAll = 4
|
||||
hcCurModeQuick = 5
|
||||
hcCurModeStd = 6
|
||||
hcCurModeExpr = 7
|
||||
hcCurRunAll = 8
|
||||
hcCurTotal = 9
|
||||
)
|
||||
|
||||
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
|
||||
@@ -84,8 +85,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
m.hcMode = m.hcCursor - hcCurModeQuick
|
||||
case hcCurRunAll:
|
||||
return m.hcRunAll()
|
||||
case hcCurFanStress:
|
||||
return m.hcRunFanStress()
|
||||
}
|
||||
case "g", "G":
|
||||
return m.hcRunSingle(hcGPU)
|
||||
@@ -97,8 +96,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.hcRunSingle(hcCPU)
|
||||
case "r", "R":
|
||||
return m.hcRunAll()
|
||||
case "f", "F":
|
||||
return m.hcRunFanStress()
|
||||
case "a", "A":
|
||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
||||
for i := range m.hcSel {
|
||||
@@ -156,14 +153,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
|
||||
// nvtop occupies the full terminal as a live chart; the stress test runs in background.
|
||||
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
|
||||
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
||||
opts := hcFanStressOpts(m.hcMode, m.app)
|
||||
opts := hcFanStressOpts(m.burnMode, m.app)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
m.gpuStressCancel = cancel
|
||||
m.gpuStressAborted = false
|
||||
m.gpuLiveRows = nil
|
||||
m.gpuLiveIndices = opts.GPUIndices
|
||||
m.gpuLiveStart = time.Now()
|
||||
m.screen = screenGPUStressRunning
|
||||
m.nvidiaSATCursor = 0
|
||||
|
||||
@@ -172,37 +171,29 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
||||
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||
}
|
||||
|
||||
nvtopPath, lookErr := exec.LookPath("nvtop")
|
||||
if lookErr != nil {
|
||||
return m, stressCmd
|
||||
}
|
||||
return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
|
||||
}
|
||||
|
||||
return m, tea.Batch(
|
||||
stressCmd,
|
||||
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
}),
|
||||
)
|
||||
// pollGPULive samples nvidia-smi once after one second and returns a gpuLiveTickMsg.
// The update handler reschedules it to achieve continuous 1s polling.
func pollGPULive(indices []int) tea.Cmd {
	return tea.Tick(time.Second, func(_ time.Time) tea.Msg {
		// Sampling errors are deliberately dropped: an empty row set simply
		// keeps the "Collecting metrics..." placeholder on screen.
		rows, _ := platform.SampleGPUMetrics(indices)
		return gpuLiveTickMsg{rows: rows, indices: indices}
	})
}
|
||||
|
||||
// updateGPUStressRunning handles keys on the GPU stress running screen.
|
||||
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "o", "O":
|
||||
nvtopPath, err := exec.LookPath("nvtop")
|
||||
if err != nil {
|
||||
return m, nil
|
||||
}
|
||||
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
})
|
||||
case "a", "A":
|
||||
if m.gpuStressCancel != nil {
|
||||
m.gpuStressCancel()
|
||||
m.gpuStressCancel = nil
|
||||
}
|
||||
m.gpuStressAborted = true
|
||||
m.screen = screenHealthCheck
|
||||
m.screen = screenBurnInTests
|
||||
m.burnCursor = burnCurGPUStress
|
||||
m.cursor = 0
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
@@ -210,8 +201,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func renderGPUStressRunning() string {
|
||||
return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
|
||||
// renderGPUStressRunning draws the live GPU stress screen: a placeholder until
// the first metrics tick arrives, then the combined per-GPU live chart, plus
// the abort/quit key hints.
func renderGPUStressRunning(m model) string {
	var b strings.Builder
	fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
	fmt.Fprintln(&b)
	if len(m.gpuLiveRows) == 0 {
		fmt.Fprintln(&b, "Collecting metrics...")
	} else {
		// Leave room for the Y-axis labels; fall back to a usable width on
		// very narrow terminals.
		chartWidth := m.width - 8
		if chartWidth < 40 {
			chartWidth = 70
		}
		b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
	}
	fmt.Fprintln(&b)
	b.WriteString("[a] Abort test [ctrl+c] quit")
	return b.String()
}
|
||||
|
||||
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
||||
@@ -371,16 +376,8 @@ func renderHealthCheck(m model) string {
|
||||
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
|
||||
}
|
||||
|
||||
{
|
||||
pfx := " "
|
||||
if m.hcCursor == hcCurFanStress {
|
||||
pfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
||||
}
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] gpu stress [Esc] back")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -8,7 +8,9 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
|
||||
switch m.cursor {
|
||||
case 0: // Health Check
|
||||
return m.enterHealthCheck()
|
||||
case 1: // Export support bundle
|
||||
case 1: // Burn-in tests
|
||||
return m.enterBurnInTests()
|
||||
case 2: // Export support bundle
|
||||
m.pendingAction = actionExportBundle
|
||||
m.busy = true
|
||||
m.busyTitle = "Export support bundle"
|
||||
@@ -16,11 +18,11 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
|
||||
targets, err := m.app.ListRemovableTargets()
|
||||
return exportTargetsMsg{targets: targets, err: err}
|
||||
}
|
||||
case 2: // Settings
|
||||
case 3: // Settings
|
||||
m.screen = screenSettings
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case 3: // Exit
|
||||
case 4: // Exit
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
|
||||
@@ -54,9 +54,10 @@ func TestUpdateMainMenuEnterActions(t *testing.T) {
|
||||
wantCmd bool
|
||||
}{
|
||||
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true},
|
||||
{name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
{name: "settings", cursor: 2, wantScreen: screenSettings, wantCmd: true},
|
||||
{name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true},
|
||||
{name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests, wantCmd: true},
|
||||
{name: "export", cursor: 2, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
{name: "settings", cursor: 3, wantScreen: screenSettings, wantCmd: true},
|
||||
{name: "exit", cursor: 4, wantScreen: screenMain, wantCmd: true},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
@@ -115,7 +116,8 @@ func TestMainMenuSimpleTransitions(t *testing.T) {
|
||||
wantScreen screen
|
||||
}{
|
||||
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
|
||||
{name: "settings", cursor: 2, wantScreen: screenSettings},
|
||||
{name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests},
|
||||
{name: "settings", cursor: 3, wantScreen: screenSettings},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
@@ -146,7 +148,7 @@ func TestMainMenuExportSetsBusy(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = 1 // Export support bundle
|
||||
m.cursor = 2 // Export support bundle
|
||||
|
||||
next, cmd := m.handleMainMenu()
|
||||
got := next.(model)
|
||||
@@ -163,12 +165,13 @@ func TestMainViewRendersTwoColumns(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = 1
|
||||
m.cursor = 2
|
||||
|
||||
view := m.View()
|
||||
for _, want := range []string{
|
||||
"bee",
|
||||
"Health Check",
|
||||
"Burn-in tests",
|
||||
"> Export support bundle",
|
||||
"Settings",
|
||||
"Exit",
|
||||
@@ -400,6 +403,11 @@ func TestConfirmCancelTarget(t *testing.T) {
|
||||
t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck)
|
||||
}
|
||||
|
||||
m.pendingAction = actionRunFanStress
|
||||
if got := m.confirmCancelTarget(); got != screenBurnInTests {
|
||||
t.Fatalf("fan stress cancel target=%q want %q", got, screenBurnInTests)
|
||||
}
|
||||
|
||||
m.pendingAction = actionNone
|
||||
if got := m.confirmCancelTarget(); got != screenMain {
|
||||
t.Fatalf("default cancel target=%q want %q", got, screenMain)
|
||||
@@ -439,6 +447,68 @@ func TestViewBusyStateUsesBusyTitle(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnInTestsEscReturnsToMain(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenBurnInTests
|
||||
m.burnCursor = 3
|
||||
|
||||
next, _ := m.updateBurnInTests(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenMain {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenMain)
|
||||
}
|
||||
if got.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1", got.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnInTestsRunOpensConfirm(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenBurnInTests
|
||||
m.burnInitialized = true
|
||||
m.burnMode = 2
|
||||
|
||||
next, _ := m.burnRunSelected()
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenConfirm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
|
||||
}
|
||||
if got.pendingAction != actionRunFanStress {
|
||||
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunFanStress)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewBurnInTestsRendersGPUStressEntry(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenBurnInTests
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"BURN-IN TESTS",
|
||||
"GPU PLATFORM STRESS TEST",
|
||||
"Quick",
|
||||
"Standard",
|
||||
"Express",
|
||||
"[ RUN SELECTED [R] ]",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -528,7 +598,7 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
|
||||
|
||||
for _, want := range []string{
|
||||
"Export support bundle",
|
||||
"Select removable filesystem",
|
||||
"Select writable removable filesystem (read-only/boot media hidden)",
|
||||
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
@@ -537,6 +607,32 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportTargetsMsgEmptyShowsHiddenBootMediaHint(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
m.busyTitle = "Export support bundle"
|
||||
|
||||
next, _ := m.Update(exportTargetsMsg{})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenOutput {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenOutput)
|
||||
}
|
||||
if got.title != "Export support bundle" {
|
||||
t.Fatalf("title=%q want %q", got.title, "Export support bundle")
|
||||
}
|
||||
for _, want := range []string{
|
||||
"No writable removable filesystems found.",
|
||||
"Read-only or boot media are hidden from this list.",
|
||||
} {
|
||||
if !strings.Contains(got.body, want) {
|
||||
t.Fatalf("body missing %q\nbody:\n%s", want, got.body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewStaticFormRendersFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ type screen string
|
||||
const (
|
||||
screenMain screen = "main"
|
||||
screenHealthCheck screen = "health_check"
|
||||
screenBurnInTests screen = "burn_in_tests"
|
||||
screenSettings screen = "settings"
|
||||
screenNetwork screen = "network"
|
||||
screenInterfacePick screen = "interface_pick"
|
||||
@@ -41,8 +42,8 @@ const (
|
||||
actionRunMemorySAT actionKind = "run_memory_sat"
|
||||
actionRunStorageSAT actionKind = "run_storage_sat"
|
||||
actionRunCPUSAT actionKind = "run_cpu_sat"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionRunFanStress actionKind = "run_fan_stress"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionRunFanStress actionKind = "run_fan_stress"
|
||||
)
|
||||
|
||||
type model struct {
|
||||
@@ -84,6 +85,11 @@ type model struct {
|
||||
hcCursor int
|
||||
hcInitialized bool
|
||||
|
||||
// Burn-in tests screen
|
||||
burnMode int
|
||||
burnCursor int
|
||||
burnInitialized bool
|
||||
|
||||
// NVIDIA SAT setup
|
||||
nvidiaGPUs []platform.NvidiaGPU
|
||||
nvidiaGPUSel []bool
|
||||
@@ -97,6 +103,9 @@ type model struct {
|
||||
// GPU Platform Stress Test running
|
||||
gpuStressCancel func()
|
||||
gpuStressAborted bool
|
||||
gpuLiveRows []platform.GPUMetricRow
|
||||
gpuLiveIndices []int
|
||||
gpuLiveStart time.Time
|
||||
|
||||
// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
|
||||
progressLines []string
|
||||
@@ -129,6 +138,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
|
||||
screen: screenMain,
|
||||
mainMenu: []string{
|
||||
"Health Check",
|
||||
"Burn-in tests",
|
||||
"Export support bundle",
|
||||
"Settings",
|
||||
"Exit",
|
||||
@@ -198,7 +208,7 @@ func (m model) confirmBody() (string, string) {
|
||||
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
||||
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
||||
"Monitors fans, temps, power — detects throttling.\n" +
|
||||
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
|
||||
"Mode: " + modes[m.burnMode] + "\n\nAll NVIDIA GPUs will be stressed."
|
||||
default:
|
||||
return "Confirm", "Proceed?"
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package tui
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
@@ -100,6 +101,13 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
m.screen = screenOutput
|
||||
return m, m.refreshSnapshotCmd()
|
||||
}
|
||||
if len(msg.targets) == 0 {
|
||||
m.title = "Export support bundle"
|
||||
m.body = "No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list."
|
||||
m.prevScreen = screenMain
|
||||
m.screen = screenOutput
|
||||
return m, m.refreshSnapshotCmd()
|
||||
}
|
||||
m.targets = msg.targets
|
||||
m.screen = screenExportTargets
|
||||
m.cursor = 0
|
||||
@@ -116,7 +124,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
m.gpuStressCancel()
|
||||
m.gpuStressCancel = nil
|
||||
}
|
||||
m.prevScreen = screenHealthCheck
|
||||
m.prevScreen = screenBurnInTests
|
||||
m.screen = screenOutput
|
||||
m.title = msg.title
|
||||
if msg.err != nil {
|
||||
@@ -130,6 +138,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
m.body = msg.body
|
||||
}
|
||||
return m, m.refreshSnapshotCmd()
|
||||
case gpuLiveTickMsg:
|
||||
if m.screen == screenGPUStressRunning {
|
||||
if len(msg.rows) > 0 {
|
||||
elapsed := time.Since(m.gpuLiveStart).Seconds()
|
||||
for i := range msg.rows {
|
||||
msg.rows[i].ElapsedSec = elapsed
|
||||
}
|
||||
m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
|
||||
n := max(1, len(msg.indices))
|
||||
if len(m.gpuLiveRows) > 60*n {
|
||||
m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
|
||||
}
|
||||
}
|
||||
return m, pollGPULive(msg.indices)
|
||||
}
|
||||
return m, nil
|
||||
case nvidiaSATDoneMsg:
|
||||
if m.nvidiaSATAborted {
|
||||
return m, nil
|
||||
@@ -162,6 +186,8 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.updateMain(msg)
|
||||
case screenHealthCheck:
|
||||
return m.updateHealthCheck(msg)
|
||||
case screenBurnInTests:
|
||||
return m.updateBurnInTests(msg)
|
||||
case screenSettings:
|
||||
return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu)
|
||||
case screenNetwork:
|
||||
|
||||
@@ -57,6 +57,8 @@ func (m model) View() string {
|
||||
body = renderTwoColumnMain(m)
|
||||
case screenHealthCheck:
|
||||
body = renderHealthCheck(m)
|
||||
case screenBurnInTests:
|
||||
body = renderBurnInTests(m)
|
||||
case screenSettings:
|
||||
body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
|
||||
case screenNetwork:
|
||||
@@ -66,7 +68,12 @@ func (m model) View() string {
|
||||
case screenServiceAction:
|
||||
body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
|
||||
case screenExportTargets:
|
||||
body = renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
|
||||
body = renderMenu(
|
||||
"Export support bundle",
|
||||
"Select writable removable filesystem (read-only/boot media hidden)",
|
||||
renderTargetItems(m.targets),
|
||||
m.cursor,
|
||||
)
|
||||
case screenInterfacePick:
|
||||
body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
|
||||
case screenStaticForm:
|
||||
@@ -79,7 +86,7 @@ func (m model) View() string {
|
||||
case screenNvidiaSATRunning:
|
||||
body = renderNvidiaSATRunning()
|
||||
case screenGPUStressRunning:
|
||||
body = renderGPUStressRunning()
|
||||
body = renderGPUStressRunning(m)
|
||||
case screenOutput:
|
||||
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
||||
default:
|
||||
|
||||
@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.
|
||||
|
||||
## Boot sequence (single ISO)
|
||||
|
||||
The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
|
||||
|
||||
`systemd` boot order:
|
||||
|
||||
```
|
||||
@@ -25,6 +27,7 @@ local-fs.target
|
||||
```
|
||||
|
||||
**Critical invariants:**
|
||||
- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
|
||||
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
|
||||
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
|
||||
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
|
||||
@@ -71,24 +74,39 @@ build-in-container.sh [--authorized-keys /path/to/keys]
|
||||
d. build kernel modules against Debian headers
|
||||
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
|
||||
f. cache in `dist/nvidia-<version>-<kver>/`
|
||||
7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||
8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||
9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/`
|
||||
10. write staged `/etc/bee-release` (versions + git commit)
|
||||
11. patch staged `motd` with build metadata
|
||||
12. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||
13. sync staged overlay into workdir `config/includes.chroot/`
|
||||
14. run `lb config && lb build` inside the privileged builder container
|
||||
7. `build-cublas.sh`:
|
||||
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
||||
b. verify packages against repo `Packages.gz`
|
||||
c. extract headers for `bee-gpu-stress` build
|
||||
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
||||
8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
|
||||
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
||||
12. write staged `/etc/bee-release` (versions + git commit)
|
||||
13. patch staged `motd` with build metadata
|
||||
14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||
15. sync staged overlay into workdir `config/includes.chroot/`
|
||||
16. run `lb config && lb build` inside the privileged builder container
|
||||
```
|
||||
|
||||
Build host notes:
|
||||
- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
|
||||
- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
|
||||
- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
|
||||
|
||||
**Critical invariants:**
|
||||
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
|
||||
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
||||
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
||||
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
||||
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
||||
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
||||
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
|
||||
- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
|
||||
- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
|
||||
|
||||
## Post-boot smoke test
|
||||
|
||||
@@ -131,10 +149,15 @@ Current validation state:
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
Acceptance flows:
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
|
||||
- `bee sat memory` → `memtester` archive
|
||||
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
||||
- Ada / Hopper: add `fp8`
|
||||
- Blackwell+: add `fp4`
|
||||
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
||||
- Runtime overrides:
|
||||
- `BEE_GPU_STRESS_SECONDS`
|
||||
- `BEE_GPU_STRESS_SIZE_MB`
|
||||
|
||||
@@ -21,7 +21,8 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
||||
- Machine-readable health summary derived from collector verdicts
|
||||
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
||||
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress`
|
||||
- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
|
||||
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||
- Automatic boot audit with operator-facing local console and SSH access
|
||||
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
||||
- SSH access (OpenSSH) always available for inspection and debugging
|
||||
@@ -69,6 +70,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| SSH | OpenSSH server |
|
||||
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
||||
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
||||
| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||
| Builder | Debian 12 host/VM or Debian 12 container image |
|
||||
|
||||
## Operator UX
|
||||
@@ -78,6 +80,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- The TUI itself executes privileged actions as `root` via `sudo -n`
|
||||
- SSH remains available independently of the local console path
|
||||
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
|
||||
- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
|
||||
|
||||
## Runtime split
|
||||
|
||||
@@ -85,6 +88,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- Live-ISO-only responsibilities stay in `iso/` integration code
|
||||
- Live ISO launches the Go CLI with `--runtime livecd`
|
||||
- Local/manual runs use `--runtime auto` or `--runtime local`
|
||||
- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup
|
||||
|
||||
## Key paths
|
||||
|
||||
|
||||
58
iso/README.md
Normal file
58
iso/README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# ISO Build
|
||||
|
||||
`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Docker Desktop or another Docker-compatible container runtime
|
||||
- Privileged containers enabled
|
||||
- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
|
||||
|
||||
## Build On macOS
|
||||
|
||||
From the repository root:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh
|
||||
```
|
||||
|
||||
The script defaults to `linux/amd64` builder containers, so it works on:
|
||||
|
||||
- Intel Mac
|
||||
- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
|
||||
|
||||
You do not need to pass `--platform` manually for normal ISO builds.
|
||||
|
||||
## Useful Options
|
||||
|
||||
Build with explicit SSH keys baked into the ISO:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
||||
```
|
||||
|
||||
Rebuild the builder image:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --rebuild-image
|
||||
```
|
||||
|
||||
Use a custom cache directory:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
||||
- Override the container platform only if you know why:
|
||||
|
||||
```sh
|
||||
BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
|
||||
```
|
||||
|
||||
- The shipped ISO is still `amd64`.
|
||||
- Output ISO artifacts are written under `dist/`.
|
||||
@@ -4,5 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=1.0.0
|
||||
|
||||
@@ -32,6 +32,6 @@ lb config noauto \
|
||||
--memtest none \
|
||||
--iso-volume "EASY-BEE" \
|
||||
--iso-application "EASY-BEE" \
|
||||
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live toram components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--apt-recommends false \
|
||||
"${@}"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
170
iso/builder/build-cublas.sh
Normal file
170
iso/builder/build-cublas.sh
Normal file
@@ -0,0 +1,170 @@
|
||||
#!/bin/sh
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
|
||||
#
|
||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||
# - headers for compiling bee-gpu-stress against cuBLASLt
|
||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||
|
||||
set -e
|
||||
|
||||
CUBLAS_VERSION="$1"
|
||||
CUDA_USERSPACE_VERSION="$2"
|
||||
CUDA_SERIES="$3"
|
||||
DIST_DIR="$4"
|
||||
|
||||
[ -n "$CUBLAS_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$CUDA_SERIES" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
|
||||
CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
|
||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
|
||||
CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
|
||||
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
|
||||
|
||||
echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="
|
||||
|
||||
if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
|
||||
&& [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== cuBLAS cached, skipping download ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"
|
||||
|
||||
echo "=== downloading Packages.gz ==="
|
||||
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
|
||||
|
||||
lookup_pkg() {
|
||||
pkg="$1"
|
||||
ver="$2"
|
||||
gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
|
||||
/^Package: / { cur_pkg=$2 }
|
||||
/^Version: / { cur_ver=$2 }
|
||||
/^Filename: / { cur_file=$2 }
|
||||
/^SHA256: / { cur_sha=$2 }
|
||||
/^$/ {
|
||||
if (cur_pkg == pkg && cur_ver == ver) {
|
||||
print cur_file " " cur_sha
|
||||
exit
|
||||
}
|
||||
cur_pkg=""; cur_ver=""; cur_file=""; cur_sha=""
|
||||
}
|
||||
END {
|
||||
if (cur_pkg == pkg && cur_ver == ver) {
|
||||
print cur_file " " cur_sha
|
||||
}
|
||||
}'
|
||||
}
|
||||
|
||||
download_verified_pkg() {
|
||||
pkg="$1"
|
||||
ver="$2"
|
||||
|
||||
meta="$(lookup_pkg "$pkg" "$ver")"
|
||||
[ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}"; exit 1; }
|
||||
|
||||
repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
|
||||
repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
|
||||
[ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}"; exit 1; }
|
||||
[ -n "$repo_sha" ] || { echo "ERROR: package sha missing for ${pkg}"; exit 1; }
|
||||
|
||||
out="${DOWNLOAD_CACHE_DIR}/$(basename "$repo_file")"
|
||||
if [ -f "$out" ]; then
|
||||
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
|
||||
if [ "$actual_sha" = "$repo_sha" ]; then
|
||||
echo "=== using cached $(basename "$repo_file") ==="
|
||||
printf '%s\n' "$out"
|
||||
return 0
|
||||
fi
|
||||
echo "=== removing stale $(basename "$repo_file") (sha256 mismatch) ==="
|
||||
rm -f "$out"
|
||||
fi
|
||||
|
||||
echo "=== downloading $(basename "$repo_file") ==="
|
||||
wget --show-progress -O "$out" "${REPO_BASE}/$(basename "$repo_file")"
|
||||
|
||||
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
|
||||
if [ "$actual_sha" != "$repo_sha" ]; then
|
||||
echo "ERROR: sha256 mismatch for $(basename "$repo_file")"
|
||||
echo " expected: $repo_sha"
|
||||
echo " actual: $actual_sha"
|
||||
rm -f "$out"
|
||||
exit 1
|
||||
fi
|
||||
echo "sha256 OK: $(basename "$repo_file")"
|
||||
printf '%s\n' "$out"
|
||||
}
|
||||
|
||||
extract_deb() {
|
||||
deb="$1"
|
||||
dst="$2"
|
||||
mkdir -p "$dst"
|
||||
(
|
||||
cd "$dst"
|
||||
ar x "$deb"
|
||||
data_tar=$(ls data.tar.* 2>/dev/null | head -1)
|
||||
[ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; }
|
||||
tar xf "$data_tar"
|
||||
)
|
||||
}
|
||||
|
||||
copy_headers() {
|
||||
from="$1"
|
||||
if [ -d "${from}/usr/include" ]; then
|
||||
cp -a "${from}/usr/include/." "${CACHE_DIR}/include/"
|
||||
fi
|
||||
}
|
||||
|
||||
copy_libs() {
|
||||
from="$1"
|
||||
find "$from" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
|
||||
\( -type f -o -type l \) -exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||
}
|
||||
|
||||
make_links() {
|
||||
base="$1"
|
||||
versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1)
|
||||
[ -n "$versioned" ] || return 0
|
||||
soname=$(printf '%s\n' "$versioned" | sed -E "s#.*/(${base}\.so\.[0-9]+).*#\\1#")
|
||||
target=$(basename "$versioned")
|
||||
ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
|
||||
ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true
|
||||
}
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$TMP_DIR"' EXIT INT TERM
|
||||
|
||||
CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||
CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||
CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||
CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||
|
||||
extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
|
||||
extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
|
||||
extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
|
||||
extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"
|
||||
|
||||
copy_headers "${TMP_DIR}/cublas-dev"
|
||||
copy_headers "${TMP_DIR}/cudart-dev"
|
||||
copy_libs "${TMP_DIR}/cublas-rt"
|
||||
copy_libs "${TMP_DIR}/cudart-rt"
|
||||
|
||||
make_links "libcublas"
|
||||
make_links "libcublasLt"
|
||||
make_links "libcudart"
|
||||
|
||||
[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted"; exit 1; }
|
||||
[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; }
|
||||
|
||||
echo "=== cuBLAS extraction complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
|
||||
echo "libs: $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"
|
||||
@@ -7,6 +7,7 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||
BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
||||
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
AUTH_KEYS=""
|
||||
REBUILD_IMAGE=0
|
||||
@@ -40,6 +41,13 @@ if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PLATFORM_OS="${BUILDER_PLATFORM%/*}"
|
||||
PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
|
||||
if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
|
||||
echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
[ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
|
||||
AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
|
||||
@@ -56,17 +64,35 @@ mkdir -p \
|
||||
|
||||
IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
|
||||
|
||||
if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
image_matches_platform() {
|
||||
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)"
|
||||
[ "$actual_platform" = "${BUILDER_PLATFORM}" ]
|
||||
}
|
||||
|
||||
NEED_BUILD_IMAGE=0
|
||||
if [ "$REBUILD_IMAGE" = "1" ]; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! image_matches_platform; then
|
||||
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
|
||||
echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
|
||||
NEED_BUILD_IMAGE=1
|
||||
fi
|
||||
|
||||
if [ "$NEED_BUILD_IMAGE" = "1" ]; then
|
||||
"$CONTAINER_TOOL" build \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
--build-arg GO_VERSION="${GO_VERSION}" \
|
||||
-t "${IMAGE_REF}" \
|
||||
"${BUILDER_DIR}"
|
||||
else
|
||||
echo "=== using existing builder image ${IMAGE_REF} ==="
|
||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||
fi
|
||||
|
||||
set -- \
|
||||
run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
@@ -80,6 +106,7 @@ set -- \
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
set -- run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
|
||||
@@ -159,6 +159,16 @@ else
|
||||
echo "=== bee binary up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
@@ -167,6 +177,7 @@ fi
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl
|
||||
@@ -283,6 +294,10 @@ NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- embed build metadata ---
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||
@@ -297,6 +312,8 @@ DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
EOF
|
||||
|
||||
# Patch motd with build info
|
||||
|
||||
@@ -20,6 +20,7 @@ openssh-server
|
||||
|
||||
# Filesystem support for USB export targets
|
||||
exfatprogs
|
||||
exfat-fuse
|
||||
ntfs-3g
|
||||
|
||||
# Utilities
|
||||
|
||||
@@ -17,4 +17,5 @@ if [ -z "${SSH_CONNECTION:-}" ] \
|
||||
&& [ "$(tty 2>/dev/null)" = "/dev/tty1" ]; then
|
||||
echo "Bee live environment ready."
|
||||
echo "Run 'menu' to open the TUI."
|
||||
echo "Kernel logs: Alt+F2 | Extra shell: Alt+F3"
|
||||
fi
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
[Journal]
|
||||
ForwardToConsole=yes
|
||||
TTYPath=/dev/ttyS0
|
||||
MaxLevelConsole=debug
|
||||
MaxLevelConsole=info
|
||||
|
||||
Reference in New Issue
Block a user