Compare commits

...

11 Commits
v3.10 ... v3.15

26 changed files with 831 additions and 186 deletions

View File

@@ -16,7 +16,7 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
return "", err return "", err
} }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", []satJob{ return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}}, {name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
job, job,
@@ -24,6 +24,17 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
}, logFunc) }, logFunc)
} }
// nvidiaStressArchivePrefix maps the requested stress loader to the archive
// name prefix used for the NVIDIA acceptance pack. Unknown or empty loaders
// fall back to the default bee-gpu-burn prefix.
func nvidiaStressArchivePrefix(loader string) string {
	normalized := strings.ToLower(strings.TrimSpace(loader))
	if normalized == NvidiaStressLoaderJohn {
		return "gpu-nvidia-john"
	}
	if normalized == NvidiaStressLoaderNCCL {
		return "gpu-nvidia-nccl"
	}
	return "gpu-nvidia-burn"
}
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) { func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices) selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
if err != nil { if err != nil {

View File

@@ -10,9 +10,11 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"runtime"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
"syscall"
"time" "time"
) )
@@ -374,10 +376,17 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
return nil, fmt.Errorf("stressapptest not found: %w", err) return nil, fmt.Errorf("stressapptest not found: %w", err)
} }
// Use a very long duration; the context timeout will kill it at the right time. // Use a very long duration; the context timeout will kill it at the right time.
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test") cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
if threads := platformStressCPUThreads(); threads > 0 {
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
}
if mb := platformStressMemoryMB(); mb > 0 {
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
}
cmd := exec.CommandContext(ctx, path, cmdArgs...)
cmd.Stdout = nil cmd.Stdout = nil
cmd.Stderr = nil cmd.Stderr = nil
if err := cmd.Start(); err != nil { if err := startLowPriorityCmd(cmd, 15); err != nil {
return nil, fmt.Errorf("stressapptest start: %w", err) return nil, fmt.Errorf("stressapptest start: %w", err)
} }
return cmd, nil return cmd, nil
@@ -418,7 +427,7 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile) cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
cmd.Stdout = nil cmd.Stdout = nil
cmd.Stderr = nil cmd.Stderr = nil
_ = cmd.Start() _ = startLowPriorityCmd(cmd, 10)
return cmd return cmd
} }
@@ -433,10 +442,50 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64") cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
cmd.Stdout = nil cmd.Stdout = nil
cmd.Stderr = nil cmd.Stderr = nil
_ = cmd.Start() _ = startLowPriorityCmd(cmd, 10)
return cmd return cmd
} }
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
if err := cmd.Start(); err != nil {
return err
}
if cmd.Process != nil {
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
}
return nil
}
// platformStressCPUThreads decides how many stressapptest worker threads to
// run. BEE_PLATFORM_STRESS_THREADS overrides the default when positive;
// otherwise one or two CPUs are left free so the host stays responsive
// during the burn.
func platformStressCPUThreads() int {
	if override := envInt("BEE_PLATFORM_STRESS_THREADS", 0); override > 0 {
		return override
	}
	cpus := runtime.NumCPU()
	if cpus <= 2 {
		return 1
	}
	if cpus <= 8 {
		return cpus - 1
	}
	return cpus - 2
}
// platformStressMemoryMB decides how much memory (in MiB) stressapptest may
// claim. BEE_PLATFORM_STRESS_MB overrides the default when positive;
// otherwise ~60% of the currently free memory is used, clamped to at least
// 1 GiB. Returns 0 when free memory cannot be determined, telling the
// caller to omit the -M flag entirely.
func platformStressMemoryMB() int {
	if override := envInt("BEE_PLATFORM_STRESS_MB", 0); override > 0 {
		return override
	}
	free := freeMemBytes()
	if free <= 0 {
		return 0
	}
	mb := int(free * 60 / 100 / (1024 * 1024))
	if mb < 1024 {
		// Floor at 1 GiB so the stress run is still meaningful on small hosts.
		return 1024
	}
	return mb
}
func packPlatformDir(dir, dest string) error { func packPlatformDir(dir, dest string) error {
f, err := os.Create(dest) f, err := os.Create(dest)
if err != nil { if err != nil {

View File

@@ -0,0 +1,34 @@
package platform
import (
"runtime"
"testing"
)
// TestPlatformStressCPUThreadsOverride verifies the env var override wins.
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
	t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
	got := platformStressCPUThreads()
	if got != 7 {
		t.Fatalf("platformStressCPUThreads=%d want 7", got)
	}
}
// TestPlatformStressCPUThreadsDefaultLeavesHeadroom checks that the default
// thread count stays within [1, NumCPU] and leaves headroom on machines
// with more than two CPUs.
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
	t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
	got := platformStressCPUThreads()
	cpus := runtime.NumCPU()
	switch {
	case got < 1:
		t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
	case got > cpus:
		t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, cpus)
	case cpus > 2 && got >= cpus:
		t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, cpus)
	}
}
// TestPlatformStressMemoryMBOverride verifies the env var override wins.
func TestPlatformStressMemoryMBOverride(t *testing.T) {
	t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
	got := platformStressMemoryMB()
	if got != 8192 {
		t.Fatalf("platformStressMemoryMB=%d want 8192", got)
	}
}

View File

@@ -684,7 +684,11 @@ func resolveSATCommand(cmd []string) ([]string, error) {
case "rvs": case "rvs":
return resolveRVSCommand(cmd[1:]...) return resolveRVSCommand(cmd[1:]...)
} }
return cmd, nil path, err := satLookPath(cmd[0])
if err != nil {
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
}
return append([]string{path}, cmd[1:]...), nil
} }
func resolveRVSCommand(args ...string) ([]string, error) { func resolveRVSCommand(args ...string) ([]string, error) {

View File

@@ -162,6 +162,25 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
} }
} }
// TestNvidiaStressArchivePrefixByLoader checks the loader→archive-prefix
// mapping, including the empty-loader fallback.
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
	t.Parallel()
	type testCase struct {
		loader string
		want   string
	}
	cases := []testCase{
		{NvidiaStressLoaderBuiltin, "gpu-nvidia-burn"},
		{NvidiaStressLoaderJohn, "gpu-nvidia-john"},
		{NvidiaStressLoaderNCCL, "gpu-nvidia-nccl"},
		{"", "gpu-nvidia-burn"},
	}
	for _, tc := range cases {
		got := nvidiaStressArchivePrefix(tc.loader)
		if got != tc.want {
			t.Fatalf("loader=%q prefix=%q want %q", tc.loader, got, tc.want)
		}
	}
}
func TestEnvIntFallback(t *testing.T) { func TestEnvIntFallback(t *testing.T) {
os.Unsetenv("BEE_MEMTESTER_SIZE_MB") os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 { if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
@@ -237,6 +256,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
} }
} }
// TestResolveSATCommandUsesLookPathForGenericTools verifies that generic
// tools are resolved to absolute paths via the satLookPath hook.
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
	original := satLookPath
	t.Cleanup(func() { satLookPath = original })
	satLookPath = func(file string) (string, error) {
		if file != "stress-ng" {
			return "", exec.ErrNotFound
		}
		return "/usr/bin/stress-ng", nil
	}
	cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
	if err != nil {
		t.Fatalf("resolveSATCommand error: %v", err)
	}
	if len(cmd) != 3 {
		t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
	}
	if cmd[0] != "/usr/bin/stress-ng" {
		t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
	}
}
// TestResolveSATCommandFailsForMissingGenericTool verifies that a tool
// absent from PATH produces a descriptive error instead of a raw command.
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
	original := satLookPath
	t.Cleanup(func() { satLookPath = original })
	satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
	_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
		t.Fatalf("error=%q", err)
	}
}
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) { func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
tmp := t.TempDir() tmp := t.TempDir()
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi") execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")

View File

@@ -4,9 +4,11 @@ import (
"bufio" "bufio"
"context" "context"
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"io" "io"
"net/http" "net/http"
"os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"regexp" "regexp"
@@ -179,19 +181,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
Profile string `json:"profile"` Profile string `json:"profile"`
DisplayName string `json:"display_name"` DisplayName string `json:"display_name"`
} }
if r.ContentLength > 0 { if r.Body != nil {
_ = json.NewDecoder(r.Body).Decode(&body) if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
} writeError(w, http.StatusBadRequest, "invalid request body")
return
name := taskNames[target]
if body.Profile != "" {
if n, ok := burnNames[target]; ok {
name = n
} }
} }
if name == "" {
name = target name := taskDisplayName(target, body.Profile, body.Loader)
}
t := &Task{ t := &Task{
ID: newJobID("sat-" + target), ID: newJobID("sat-" + target),
Name: name, Name: name,
@@ -667,6 +664,22 @@ func (h *handler) handleAPIInstallStream(w http.ResponseWriter, r *http.Request)
// ── Metrics SSE ─────────────────────────────────────────────────────────────── // ── Metrics SSE ───────────────────────────────────────────────────────────────
// handleAPIMetricsLatest serves the most recent metrics sample as JSON.
// When no sample has been collected yet it returns an empty JSON object so
// clients can treat the response uniformly.
func (h *handler) handleAPIMetricsLatest(w http.ResponseWriter, r *http.Request) {
	sample, ok := h.latestMetric()
	if !ok {
		w.Header().Set("Content-Type", "application/json")
		_, _ = io.WriteString(w, "{}")
		return
	}
	payload, err := json.Marshal(sample)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	_, _ = w.Write(payload)
}
func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) {
if !sseStart(w) { if !sseStart(w) {
return return
@@ -917,8 +930,31 @@ func parseXrandrOutput(out string) []displayInfo {
return infos return infos
} }
// xrandrCommand builds an xrandr invocation that also works outside an X
// session (e.g. under a systemd service) by injecting default DISPLAY and
// XAUTHORITY values when the current environment lacks non-empty ones.
func xrandrCommand(args ...string) *exec.Cmd {
	cmd := exec.Command("xrandr", args...)
	env := append([]string{}, os.Environ()...)
	// os.Getenv returns "" both when the variable is unset and when it is
	// set to the empty string — exactly the cases we want to patch.
	if os.Getenv("DISPLAY") == "" {
		env = append(env, "DISPLAY=:0")
	}
	if os.Getenv("XAUTHORITY") == "" {
		env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
	}
	cmd.Env = env
	return cmd
}
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) { func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
out, err := exec.Command("xrandr").Output() out, err := xrandrCommand().Output()
if err != nil { if err != nil {
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error()) writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
return return
@@ -945,7 +981,7 @@ func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
writeError(w, http.StatusBadRequest, "invalid output name") writeError(w, http.StatusBadRequest, "invalid output name")
return return
} }
if out, err := exec.Command("xrandr", "--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil { if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out))) writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
return return
} }

View File

@@ -0,0 +1,64 @@
package webui
import (
"net/http/httptest"
"strings"
"testing"
"bee/audit/internal/app"
)
// TestXrandrCommandAddsDefaultX11Env verifies that empty DISPLAY/XAUTHORITY
// values are replaced with usable defaults in the command environment.
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
	t.Setenv("DISPLAY", "")
	t.Setenv("XAUTHORITY", "")
	cmd := xrandrCommand("--query")
	seen := map[string]bool{}
	for _, kv := range cmd.Env {
		seen[kv] = true
	}
	if !seen["DISPLAY=:0"] {
		t.Fatalf("DISPLAY not injected: %v", cmd.Env)
	}
	if !seen["XAUTHORITY=/home/bee/.Xauthority"] {
		t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
	}
}
// TestHandleAPISATRunDecodesBodyWithoutContentLength verifies that the SAT
// run endpoint decodes a JSON body even when ContentLength is unknown
// (chunked transfer), instead of gating the decode on ContentLength > 0.
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
	globalQueue.mu.Lock()
	savedTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = savedTasks
		globalQueue.mu.Unlock()
	})

	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
	req.ContentLength = -1 // simulate an unknown-length (chunked) body
	rec := httptest.NewRecorder()
	h.handleAPISATRun("cpu").ServeHTTP(rec, req)

	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 1 {
		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
	}
	if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
		t.Fatalf("burn profile=%q want smoke", got)
	}
}

View File

@@ -289,7 +289,7 @@ func renderAudit() string {
func renderHardwareSummaryCard(opts HandlerOptions) string { func renderHardwareSummaryCard(opts HandlerOptions) string {
data, err := loadSnapshot(opts.AuditPath) data, err := loadSnapshot(opts.AuditPath)
if err != nil { if err != nil {
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-unknown">No audit data</span></div></div>` return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><button class="btn btn-primary" onclick="auditModalRun()">&#9654; Run Audit</button></div></div>`
} }
// Parse just enough fields for the summary banner // Parse just enough fields for the summary banner
var snap struct { var snap struct {
@@ -532,16 +532,10 @@ function refreshCharts() {
} }
setInterval(refreshCharts, 3000); setInterval(refreshCharts, 3000);
const es = new EventSource('/api/metrics/stream'); fetch('/api/metrics/latest').then(r => r.json()).then(d => {
es.addEventListener('metrics', e => {
const d = JSON.parse(e.data);
// Show/hide Fan RPM card based on data availability
const fanCard = document.getElementById('card-server-fans'); const fanCard = document.getElementById('card-server-fans');
if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none'; if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
}).catch(() => {});
});
es.onerror = () => {};
</script>` </script>`
} }

View File

@@ -270,6 +270,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export // Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream) mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG) mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG)
mux.HandleFunc("GET /api/metrics/export.csv", h.handleAPIMetricsExportCSV) mux.HandleFunc("GET /api/metrics/export.csv", h.handleAPIMetricsExportCSV)
@@ -1230,13 +1231,6 @@ probe();
func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) { func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
page := strings.TrimPrefix(r.URL.Path, "/") page := strings.TrimPrefix(r.URL.Path, "/")
if page == "" { if page == "" {
// Serve loading page until audit snapshot exists
if _, err := os.Stat(h.opts.AuditPath); err != nil {
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "text/html; charset=utf-8")
_, _ = w.Write([]byte(loadingPageHTML))
return
}
page = "dashboard" page = "dashboard"
} }
// Redirect old routes to new names // Redirect old routes to new names

View File

@@ -136,6 +136,33 @@ func TestRootRendersDashboard(t *testing.T) {
} }
} }
// TestRootShowsRunAuditButtonWhenSnapshotMissing verifies the dashboard
// offers a "Run Audit" action (instead of an empty badge) when the audit
// snapshot file does not exist yet.
func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
	dir := t.TempDir()
	exportDir := filepath.Join(dir, "export")
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		t.Fatal(err)
	}
	h := NewHandler(HandlerOptions{
		Title:     "Bee Hardware Audit",
		AuditPath: filepath.Join(dir, "missing-audit.json"),
		ExportDir: exportDir,
	})
	rec := httptest.NewRecorder()
	h.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d", rec.Code)
	}
	body := rec.Body.String()
	switch {
	case !strings.Contains(body, `Run Audit`):
		t.Fatalf("dashboard missing run audit button: %s", body)
	case strings.Contains(body, `No audit data`):
		t.Fatalf("dashboard still shows empty audit badge: %s", body)
	}
}
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) { func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
path := filepath.Join(dir, "audit.json") path := filepath.Join(dir, "audit.json")

View File

@@ -8,6 +8,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"sort" "sort"
"strings"
"sync" "sync"
"time" "time"
@@ -51,6 +52,33 @@ var burnNames = map[string]string{
"amd": "AMD GPU Burn-in", "amd": "AMD GPU Burn-in",
} }
// nvidiaStressTaskName returns the human-readable queue name for an NVIDIA
// stress task based on the requested loader; empty or unrecognized loaders
// map to the default bee-gpu-burn name.
func nvidiaStressTaskName(loader string) string {
	normalized := strings.ToLower(strings.TrimSpace(loader))
	if normalized == platform.NvidiaStressLoaderJohn {
		return "NVIDIA GPU Stress (John/OpenCL)"
	}
	if normalized == platform.NvidiaStressLoaderNCCL {
		return "NVIDIA GPU Stress (NCCL)"
	}
	return "NVIDIA GPU Stress (bee-gpu-burn)"
}
func taskDisplayName(target, profile, loader string) string {
name := taskNames[target]
if profile != "" {
if n, ok := burnNames[target]; ok {
name = n
}
}
if target == "nvidia-stress" {
name = nvidiaStressTaskName(loader)
}
if name == "" {
name = target
}
return name
}
// Task represents one unit of work in the queue. // Task represents one unit of work in the queue.
type Task struct { type Task struct {
ID string `json:"id"` ID string `json:"id"`
@@ -440,6 +468,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
if dur <= 0 { if dur <= 0 {
dur = 60 dur = 60
} }
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append) archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
case "amd": case "amd":
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append) archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)

View File

@@ -95,6 +95,23 @@ func TestResolveBurnPreset(t *testing.T) {
} }
} }
// TestTaskDisplayNameUsesNvidiaStressLoader checks loader-specific naming
// for nvidia-stress tasks, including the builtin/empty fallback.
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
	cases := []struct {
		loader string
		want   string
	}{
		{"", "NVIDIA GPU Stress (bee-gpu-burn)"},
		{"builtin", "NVIDIA GPU Stress (bee-gpu-burn)"},
		{"john", "NVIDIA GPU Stress (John/OpenCL)"},
		{"nccl", "NVIDIA GPU Stress (NCCL)"},
	}
	for _, c := range cases {
		got := taskDisplayName("nvidia-stress", "acceptance", c.loader)
		if got != c.want {
			t.Fatalf("taskDisplayName(loader=%q)=%q want %q", c.loader, got, c.want)
		}
	}
}
func TestRunTaskHonorsCancel(t *testing.T) { func TestRunTaskHonorsCancel(t *testing.T) {
t.Parallel() t.Parallel()
@@ -154,3 +171,34 @@ func TestRunTaskHonorsCancel(t *testing.T) {
t.Fatal("runTask did not return after cancel") t.Fatal("runTask did not return after cancel")
} }
} }
// TestRunTaskUsesBurnProfileDurationForCPU verifies that runTask derives the
// CPU burn duration from the task's burn profile ("smoke" => 5 minutes).
//
// This test must NOT call t.Parallel(): it swaps the package-level
// runCPUAcceptancePackCtx hook, and mutating that global concurrently with
// other parallel tests that exercise runTask would be a data race.
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
	var gotDuration int
	q := &taskQueue{
		opts: &HandlerOptions{App: &app.App{}},
	}
	tk := &Task{
		ID:        "cpu-burn-1",
		Name:      "CPU Burn-in",
		Target:    "cpu",
		Status:    TaskRunning,
		CreatedAt: time.Now(),
		params:    taskParams{BurnProfile: "smoke"},
	}
	j := &jobState{}
	orig := runCPUAcceptancePackCtx
	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
		gotDuration = durationSec
		return "/tmp/cpu-burn.tar.gz", nil
	}
	t.Cleanup(func() { runCPUAcceptancePackCtx = orig })
	q.runTask(tk, j, context.Background())
	if gotDuration != 5*60 {
		t.Fatalf("duration=%d want %d", gotDuration, 5*60)
	}
}

2
bible

Submodule bible updated: 688b87e98d...456c1f022c

View File

@@ -13,9 +13,10 @@ Use one of:
This applies to: This applies to:
- `iso/builder/config/package-lists/*.list.chroot` - `iso/builder/config/package-lists/*.list.chroot`
- Any package referenced in `grub.cfg`, hooks, or overlay scripts (e.g. file paths like `/boot/memtest86+x64.bin`) - Any package referenced in bootloader configs, hooks, or overlay scripts
## Example of what goes wrong without this ## Memtest rule
`memtest86+` in Debian bookworm installs `/boot/memtest86+x64.bin`, not `/boot/memtest86+.bin`. Prefer live-build's built-in memtest integration over custom hooks or hardcoded
Guessing the filename caused a broken GRUB entry that only surfaced at boot time, after a full rebuild. bootloader paths. If you ever need to reference memtest files manually, verify
the exact package file list first for the target Debian release.

View File

@@ -29,7 +29,7 @@ lb config noauto \
--security true \ --security true \
--linux-flavours "amd64" \ --linux-flavours "amd64" \
--linux-packages "${LB_LINUX_PACKAGES}" \ --linux-packages "${LB_LINUX_PACKAGES}" \
--memtest none \ --memtest memtest86+ \
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \

View File

@@ -36,6 +36,7 @@ typedef void *CUstream;
#define MAX_CUBLAS_PROFILES 5 #define MAX_CUBLAS_PROFILES 5
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u) #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u) #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
#define STRESS_LAUNCH_DEPTH 8
static const char *ptx_source = static const char *ptx_source =
".version 6.0\n" ".version 6.0\n"
@@ -422,24 +423,31 @@ static int run_ptx_fallback(struct cuda_api *api,
double deadline = start + (double)seconds; double deadline = start + (double)seconds;
while (now_seconds() < deadline) { while (now_seconds() < deadline) {
launches_per_wave = 0; launches_per_wave = 0;
for (int lane = 0; lane < stream_count; lane++) { for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads); int launched_this_batch = 0;
if (!check_rc(api, for (int lane = 0; lane < stream_count; lane++) {
"cuLaunchKernel", unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
api->cuLaunchKernel(kernel, if (!check_rc(api,
blocks, "cuLaunchKernel",
1, api->cuLaunchKernel(kernel,
1, blocks,
threads, 1,
1, 1,
1, threads,
0, 1,
streams[lane], 1,
params[lane], 0,
NULL))) { streams[lane],
goto fail; params[lane],
NULL))) {
goto fail;
}
launches_per_wave++;
launched_this_batch++;
}
if (launched_this_batch <= 0) {
break;
} }
launches_per_wave++;
} }
if (launches_per_wave <= 0) { if (launches_per_wave <= 0) {
goto fail; goto fail;
@@ -460,10 +468,11 @@ static int run_ptx_fallback(struct cuda_api *api,
report->iterations = iterations; report->iterations = iterations;
snprintf(report->details, snprintf(report->details,
sizeof(report->details), sizeof(report->details),
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n", "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
size_mb, size_mb,
report->buffer_mb, report->buffer_mb,
report->stream_count, report->stream_count,
STRESS_LAUNCH_DEPTH,
bytes_per_stream[0] / (1024u * 1024u), bytes_per_stream[0] / (1024u * 1024u),
iterations); iterations);
@@ -1184,10 +1193,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
report->buffer_mb = (int)(total_budget / (1024u * 1024u)); report->buffer_mb = (int)(total_budget / (1024u * 1024u));
append_detail(report->details, append_detail(report->details,
sizeof(report->details), sizeof(report->details),
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n", "requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
size_mb, size_mb,
report->buffer_mb, report->buffer_mb,
report->stream_count, report->stream_count,
STRESS_LAUNCH_DEPTH,
mp_count, mp_count,
per_profile_budget / (1024u * 1024u)); per_profile_budget / (1024u * 1024u));
@@ -1239,26 +1249,33 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
double deadline = now_seconds() + (double)seconds; double deadline = now_seconds() + (double)seconds;
while (now_seconds() < deadline) { while (now_seconds() < deadline) {
wave_launches = 0; wave_launches = 0;
for (int i = 0; i < prepared_count; i++) { for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
if (!prepared[i].ready) { int launched_this_batch = 0;
continue; for (int i = 0; i < prepared_count; i++) {
} if (!prepared[i].ready) {
if (!run_cublas_profile(handle, &cublas, &prepared[i])) { continue;
append_detail(report->details,
sizeof(report->details),
"%s=FAILED runtime\n",
prepared[i].desc.name);
for (int j = 0; j < prepared_count; j++) {
destroy_profile(&cublas, cuda, &prepared[j]);
} }
cublas.cublasLtDestroy(handle); if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
destroy_streams(cuda, streams, stream_count); append_detail(report->details,
cuda->cuCtxDestroy(ctx); sizeof(report->details),
return 0; "%s=FAILED runtime\n",
prepared[i].desc.name);
for (int j = 0; j < prepared_count; j++) {
destroy_profile(&cublas, cuda, &prepared[j]);
}
cublas.cublasLtDestroy(handle);
destroy_streams(cuda, streams, stream_count);
cuda->cuCtxDestroy(ctx);
return 0;
}
prepared[i].iterations++;
report->iterations++;
wave_launches++;
launched_this_batch++;
}
if (launched_this_batch <= 0) {
break;
} }
prepared[i].iterations++;
report->iterations++;
wave_launches++;
} }
if (wave_launches <= 0) { if (wave_launches <= 0) {
break; break;

View File

@@ -111,8 +111,231 @@ resolve_iso_version() {
resolve_audit_version resolve_audit_version
} }
# iso_list_files ISO
# Print the file listing of an ISO image, preferring bsdtar and falling
# back to xorriso. Returns 127 when neither tool is installed.
iso_list_files() {
    iso_path="$1"
    if command -v bsdtar >/dev/null 2>&1; then
        bsdtar -tf "$iso_path"
    elif command -v xorriso >/dev/null 2>&1; then
        # xorriso prints absolute paths; strip the leading slash so the
        # output matches bsdtar's relative listing.
        xorriso -indev "$iso_path" -find / -type f -print 2>/dev/null | sed 's#^/##'
    else
        return 127
    fi
}
# iso_extract_file ISO MEMBER
# Write a single file from the ISO to stdout, preferring bsdtar and falling
# back to xorriso. Returns 127 when neither tool is installed.
iso_extract_file() {
    iso_path="$1"
    iso_member="$2"
    if command -v bsdtar >/dev/null 2>&1; then
        bsdtar -xOf "$iso_path" "$iso_member"
    elif command -v xorriso >/dev/null 2>&1; then
        # xorriso addresses members by absolute path inside the image.
        xorriso -osirrox on -indev "$iso_path" -cat "/$iso_member" 2>/dev/null
    else
        return 127
    fi
}
# require_iso_reader [ISO_PATH]
# Abort the build (via memtest_fail) unless an ISO reading tool is present.
require_iso_reader() {
    if ! command -v bsdtar >/dev/null 2>&1 && ! command -v xorriso >/dev/null 2>&1; then
        memtest_fail "ISO reader is required for validation/debug (expected bsdtar or xorriso)" "${1:-}"
    fi
}
# dump_memtest_debug PHASE [LB_DIR] [ISO_PATH]
# Print a diagnostic report of everything memtest-related: the live-build
# auto/config, the source bootloader templates, the live-build workdir
# (package lists, chroot/binary boot dirs, package cache) and the final ISO
# contents. Output is mirrored to ${LOG_DIR}/memtest-<phase>.log when
# LOG_DIR exists, otherwise it just goes to stdout.
dump_memtest_debug() {
phase="$1"
lb_dir="${2:-}"
iso_path="${3:-}"
# Make the phase string filename-safe: spaces and slashes -> underscores.
phase_slug="$(printf '%s' "${phase}" | tr ' /' '__')"
memtest_log="${LOG_DIR:-}/memtest-${phase_slug}.log"
# The whole report runs in a subshell so it can be piped to tee/cat below.
(
echo "=== memtest debug: ${phase} ==="
echo "-- auto/config --"
if [ -f "${BUILDER_DIR}/auto/config" ]; then
grep -n -- '--memtest' "${BUILDER_DIR}/auto/config" || echo " (no --memtest line found)"
else
echo " (missing ${BUILDER_DIR}/auto/config)"
fi
echo "-- source bootloader templates --"
for cfg in \
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
if [ -f "$cfg" ]; then
echo " file: $cfg"
grep -n 'Memory Test\|memtest' "$cfg" || echo " (no memtest lines)"
fi
done
# Live-build workdir state: only inspected when the caller provides it.
if [ -n "$lb_dir" ] && [ -d "$lb_dir" ]; then
echo "-- live-build workdir package lists --"
for pkg in \
"$lb_dir/config/package-lists/bee.list.chroot" \
"$lb_dir/config/package-lists/bee-gpu.list.chroot" \
"$lb_dir/config/package-lists/bee-nvidia.list.chroot"; do
if [ -f "$pkg" ]; then
echo " file: $pkg"
grep -n 'memtest' "$pkg" || echo " (no memtest lines)"
fi
done
echo "-- live-build chroot/boot --"
if [ -d "$lb_dir/chroot/boot" ]; then
find "$lb_dir/chroot/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
else
echo " (missing $lb_dir/chroot/boot)"
fi
echo "-- live-build binary/boot --"
if [ -d "$lb_dir/binary/boot" ]; then
find "$lb_dir/binary/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
else
echo " (missing $lb_dir/binary/boot)"
fi
echo "-- live-build package cache --"
if [ -d "$lb_dir/cache/packages.chroot" ]; then
find "$lb_dir/cache/packages.chroot" -maxdepth 1 -name 'memtest86+*.deb' -print | sed 's/^/ /' || true
else
echo " (missing $lb_dir/cache/packages.chroot)"
fi
fi
# Final ISO contents: only inspected when an ISO path is provided and exists.
if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
echo "-- ISO memtest files --"
iso_list_files "$iso_path" | grep 'memtest' | sed 's/^/ /' || echo " (no memtest files in ISO)"
echo "-- ISO GRUB memtest lines --"
iso_extract_file "$iso_path" boot/grub/grub.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in boot/grub/grub.cfg)"
echo "-- ISO isolinux memtest lines --"
iso_extract_file "$iso_path" isolinux/live.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in isolinux/live.cfg)"
fi
echo "=== end memtest debug: ${phase} ==="
# Mirror to the log file only when LOG_DIR is a real directory.
) | {
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ]; then
tee "${memtest_log}"
else
cat
fi
}
}
# memtest_fail MESSAGE [ISO_PATH]
# Print an error to stderr, dump the full memtest debug report (also to
# stderr), and abort the script with exit status 1.
memtest_fail() {
msg="$1"
iso_path="${2:-}"
echo "ERROR: ${msg}" >&2
dump_memtest_debug "failure" "${LB_DIR:-}" "$iso_path" >&2
exit 1
}
# validate_iso_memtest ISO
# Fail the build unless the ISO ships both memtest86+ binaries (BIOS .bin
# and EFI .efi) and both bootloader configs expose a working memtest entry.
# All failures route through memtest_fail, which dumps debug state and exits.
validate_iso_memtest() {
    iso_path="$1"
    echo "=== validating memtest in ISO ==="
    [ -f "$iso_path" ] || memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
    require_iso_reader "$iso_path"
    iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.bin$' || \
        memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
    iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.efi$' || \
        memtest_fail "memtest EFI binary missing in ISO: boot/memtest86+x64.efi" "$iso_path"
    grub_cfg="$(mktemp)"
    isolinux_cfg="$(mktemp)"
    # Record the first failure instead of exiting immediately, so the mktemp
    # files are always removed (memtest_fail exits the whole script, which
    # previously leaked both temp files on every failing check).
    fail_msg=""
    if ! iso_extract_file "$iso_path" boot/grub/grub.cfg > "$grub_cfg"; then
        fail_msg="failed to extract boot/grub/grub.cfg from ISO"
    elif ! iso_extract_file "$iso_path" isolinux/live.cfg > "$isolinux_cfg"; then
        fail_msg="failed to extract isolinux/live.cfg from ISO"
    elif ! grep -q 'Memory Test (memtest86+)' "$grub_cfg"; then
        fail_msg="GRUB menu entry for memtest is missing"
    elif ! grep -q '/boot/memtest86+x64\.efi' "$grub_cfg"; then
        fail_msg="GRUB memtest EFI path is missing"
    elif ! grep -q '/boot/memtest86+x64\.bin' "$grub_cfg"; then
        fail_msg="GRUB memtest BIOS path is missing"
    elif ! grep -q 'Memory Test (memtest86+)' "$isolinux_cfg"; then
        fail_msg="isolinux menu entry for memtest is missing"
    elif ! grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg"; then
        fail_msg="isolinux memtest path is missing"
    fi
    rm -f "$grub_cfg" "$isolinux_cfg"
    [ -z "$fail_msg" ] || memtest_fail "$fail_msg" "$iso_path"
    echo "=== memtest validation OK ==="
}
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)" AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)" ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
LOG_DIR="${DIST_DIR}/${ISO_BASENAME}.logs"
LOG_ARCHIVE="${DIST_DIR}/${ISO_BASENAME}.logs.tar.gz"
ISO_OUT="${DIST_DIR}/${ISO_BASENAME}.iso"
LOG_OUT="${LOG_DIR}/build.log"
# cleanup_build_log [STATUS]
# EXIT/INT/TERM/HUP trap handler installed by start_build_log. Restores the
# original stdout/stderr, waits for the background tee to drain, removes the
# log FIFO, archives the log directory, and exits with the original status.
cleanup_build_log() {
status="${1:-$?}"
# Clear the traps first so this handler cannot re-enter itself.
trap - EXIT INT TERM HUP
if [ "${BUILD_LOG_ACTIVE:-0}" = "1" ]; then
BUILD_LOG_ACTIVE=0
# Restore stdout/stderr from the fds saved in start_build_log (3 and 4),
# then close the saved copies.
exec 1>&3 2>&4
exec 3>&- 4>&-
# Let tee finish flushing everything still buffered in the FIFO.
if [ -n "${BUILD_TEE_PID:-}" ]; then
wait "${BUILD_TEE_PID}" 2>/dev/null || true
fi
rm -f "${BUILD_LOG_PIPE}"
fi
# Best-effort: pack the log directory next to the ISO; never fail cleanup
# because archiving failed.
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ] && command -v tar >/dev/null 2>&1; then
rm -f "${LOG_ARCHIVE}"
tar -czf "${LOG_ARCHIVE}" -C "${DIST_DIR}" "$(basename "${LOG_DIR}")" 2>/dev/null || true
fi
# Preserve the original exit status of the build.
exit "${status}"
}
# start_build_log
# Redirect all build output through a FIFO into `tee`, so everything is both
# shown on the console and captured in ${LOG_OUT}. Saves the original
# stdout/stderr on fds 3/4 and installs cleanup_build_log as the trap that
# undoes the redirection and archives the logs on exit.
start_build_log() {
command -v tee >/dev/null 2>&1 || {
echo "ERROR: tee is required for build logging" >&2
exit 1
}
# Start from a clean log directory/archive for this build.
rm -rf "${LOG_DIR}"
rm -f "${LOG_ARCHIVE}"
mkdir -p "${LOG_DIR}"
# NOTE(review): mktemp -u only reserves a name, not the file, so there is a
# small window before mkfifo where the path could be taken — acceptable for
# a build script, but worth confirming if this ever runs multi-instance.
BUILD_LOG_PIPE="$(mktemp -u "${TMPDIR:-/tmp}/bee-build-log.XXXXXX")"
mkfifo "${BUILD_LOG_PIPE}"
# Save the real stdout/stderr so cleanup_build_log can restore them.
exec 3>&1 4>&2
# tee reads from the FIFO in the background and mirrors to the log file.
tee "${LOG_OUT}" < "${BUILD_LOG_PIPE}" &
BUILD_TEE_PID=$!
# From here on, everything the script prints flows through the FIFO.
exec > "${BUILD_LOG_PIPE}" 2>&1
BUILD_LOG_ACTIVE=1
trap 'cleanup_build_log "$?"' EXIT INT TERM HUP
echo "=== build log dir: ${LOG_DIR} ==="
echo "=== build log: ${LOG_OUT} ==="
echo "=== build log archive: ${LOG_ARCHIVE} ==="
}
start_build_log
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency. # Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
# If headers for the detected ABI are not yet installed (kernel updated since image build), # If headers for the detected ABI are not yet installed (kernel updated since image build),
@@ -245,13 +468,13 @@ rm -f \
"${OVERLAY_STAGE_DIR}/etc/bee-release" \ "${OVERLAY_STAGE_DIR}/etc/bee-release" \
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \ "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \ "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
rm -rf \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
# Remove NVIDIA-specific overlay files for non-nvidia variants # Remove NVIDIA-specific overlay files for non-nvidia variants
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
@@ -304,7 +527,6 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true
ln -sfn bee-gpu-burn "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
fi fi
# --- inject smoketest into overlay so it runs directly on the live CD --- # --- inject smoketest into overlay so it runs directly on the live CD ---
@@ -510,6 +732,7 @@ export BEE_GPU_VENDOR_UPPER
cd "${LB_DIR}" cd "${LB_DIR}"
lb clean 2>&1 | tail -3 lb clean 2>&1 | tail -3
lb config 2>&1 | tail -5 lb config 2>&1 | tail -5
dump_memtest_debug "pre-build" "${LB_DIR}"
lb build 2>&1 lb build 2>&1
# --- persist deb package cache back to shared location --- # --- persist deb package cache back to shared location ---
@@ -521,8 +744,9 @@ fi
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR # live-build outputs live-image-amd64.hybrid.iso in LB_DIR
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso" ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
if [ -f "$ISO_RAW" ]; then if [ -f "$ISO_RAW" ]; then
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
validate_iso_memtest "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT" cp "$ISO_RAW" "$ISO_OUT"
echo "" echo ""
echo "=== done (${BEE_GPU_VENDOR}) ===" echo "=== done (${BEE_GPU_VENDOR}) ==="

View File

@@ -22,3 +22,7 @@ label live-@FLAVOUR@-failsafe
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
label memtest
menu label ^Memory Test (memtest86+)
linux /boot/memtest86+x64.bin

View File

@@ -1,76 +0,0 @@
#!/bin/sh
# Copy memtest86+ binaries from chroot /boot into the ISO boot directory
# so GRUB can chainload them directly (they must be on the ISO filesystem,
# not inside the squashfs).
#
# Primary: copy from chroot/boot/ (populated by package postinst).
# Naming fallbacks:
# Debian Bookworm: /boot/memtest86+ — EFI PE64 (no extension)
# /boot/memtest86+.bin — legacy binary
# Upstream/Ubuntu: /boot/memtest86+x64.efi, /boot/memtest86+x64.bin, etc.
# Last resort: extract directly from the cached .deb if postinst didn't place
# the files (happens in chroot environments without grub triggers).
set -e
# Upstream/Ubuntu file names that should end up on the ISO.
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi"
# Ensure destination directory exists (absence caused silent copy failures).
mkdir -p binary/boot
echo "memtest: scanning chroot/boot/ for memtest files:"
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files in chroot/boot/"
# Primary path: copy upstream-named files from chroot/boot/
for f in ${MEMTEST_FILES}; do
src="chroot/boot/${f}"
if [ -f "${src}" ]; then
cp "${src}" "binary/boot/${f}"
echo "memtest: copied ${f} from chroot/boot/"
fi
done
# Debian Bookworm naming fallback: /boot/memtest86+ (no extension) is the EFI binary.
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "chroot/boot/memtest86+" ]; then
cp "chroot/boot/memtest86+" "binary/boot/memtest86+x64.efi"
echo "memtest: copied /boot/memtest86+ as memtest86+x64.efi (Debian naming)"
fi
if [ ! -f "binary/boot/memtest86+x64.bin" ] && [ -f "chroot/boot/memtest86+.bin" ]; then
cp "chroot/boot/memtest86+.bin" "binary/boot/memtest86+x64.bin"
echo "memtest: copied /boot/memtest86+.bin as memtest86+x64.bin (Debian naming)"
fi
# Last resort: if EFI binary still missing, extract from cached .deb
if [ ! -f "binary/boot/memtest86+x64.efi" ]; then
echo "memtest: EFI binary missing — attempting extraction from .deb cache"
# Search the chroot apt caches first, then live-build's own cache/ tree.
deb=$(find chroot/var/cache/apt/archives/ chroot/var/lib/apt/lists/ \
-name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null \
| head -1)
if [ -z "$deb" ]; then
deb=$(find cache/ -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null | head -1)
fi
if [ -n "$deb" ]; then
echo "memtest: extracting from ${deb}"
EXTRACT_DIR="$(mktemp -d)"
# dpkg-deb -x unpacks the payload only; no maintainer scripts run.
dpkg-deb -x "${deb}" "${EXTRACT_DIR}"
echo "memtest: files found in .deb:"
find "${EXTRACT_DIR}/boot" -type f 2>/dev/null || echo " (none in /boot)"
for f in ${MEMTEST_FILES}; do
src="${EXTRACT_DIR}/boot/${f}"
if [ -f "${src}" ]; then
cp "${src}" "binary/boot/${f}"
echo "memtest: extracted ${f} from .deb"
fi
done
# Debian naming fallback inside .deb as well
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "${EXTRACT_DIR}/boot/memtest86+" ]; then
cp "${EXTRACT_DIR}/boot/memtest86+" "binary/boot/memtest86+x64.efi"
echo "memtest: extracted /boot/memtest86+ as memtest86+x64.efi from .deb"
fi
rm -rf "${EXTRACT_DIR}"
else
echo "memtest: WARNING: no memtest86+ .deb found in cache — memtest will not be available"
fi
fi
echo "memtest: binary/boot/ contents:"
ls binary/boot/memtest* 2>/dev/null || echo " (none)"

View File

@@ -21,14 +21,15 @@ openssh-server
# Disk installer # Disk installer
squashfs-tools squashfs-tools
parted parted
# grub-pc / grub-efi-amd64 provide grub-install + grub2-common (required for chroot install). # Keep GRUB install tools without selecting a single active platform package.
# The -bin variants only carry binary modules and do NOT include grub-install itself. # grub-pc and grub-efi-amd64 conflict with each other, but grub2-common
grub-pc # provides grub-install/update-grub and the *-bin packages provide BIOS/UEFI modules.
grub2-common
grub-pc-bin grub-pc-bin
grub-efi-amd64
grub-efi-amd64-bin grub-efi-amd64-bin
grub-efi-amd64-signed grub-efi-amd64-signed
shim-signed shim-signed
efibootmgr
# Filesystem support for USB export targets # Filesystem support for USB export targets
exfatprogs exfatprogs
@@ -50,7 +51,6 @@ sudo
zstd zstd
mstflint mstflint
memtester memtester
memtest86+
stress-ng stress-ng
stressapptest stressapptest

View File

@@ -1,25 +1,9 @@
[Unit] [Unit]
Description=Bee: schedule startup hardware audit via task queue Description=Bee: on-demand hardware audit (not started automatically)
# Start AFTER bee-web, not before — bee-web must not wait for audit.
After=bee-web.service
Wants=bee-web.service
[Service] [Service]
Type=oneshot Type=oneshot
RemainAfterExit=yes RemainAfterExit=yes
# Wait up to 90s for bee-web to respond on /healthz, then sleep 60s for ExecStart=/bin/sh -c 'curl -sf -X POST http://localhost/api/audit/run >/dev/null'
# the system to settle (GPU drivers, sensors), then enqueue the audit as
# a background task so it appears in the task list and logs.
ExecStart=/bin/sh -c '\
i=0; \
while [ $i -lt 90 ]; do \
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi; \
sleep 1; i=$((i+1)); \
done; \
sleep 60; \
curl -sf -X POST http://localhost/api/audit/run >/dev/null'
StandardOutput=journal StandardOutput=journal
StandardError=journal StandardError=journal
[Install]
WantedBy=multi-user.target

View File

@@ -12,17 +12,55 @@
set -euo pipefail set -euo pipefail
# Print the CLI help text to stderr and exit non-zero. Invoked when the
# device argument is missing or when -h/--help is given.
usage() {
cat >&2 <<'EOF'
Usage: bee-install <device> [logfile]
Installs the live system to a local disk (WIPES the target).
device Target block device, e.g. /dev/sda or /dev/nvme0n1
Must be a hard disk or NVMe — NOT a CD-ROM (/dev/sr*)
logfile Optional path for progress log (default: /tmp/bee-install.log)
Examples:
bee-install /dev/sda
bee-install /dev/nvme0n1
bee-install /dev/sdb /tmp/my-install.log
WARNING: ALL DATA ON <device> WILL BE ERASED.
Layout (UEFI): GPT — partition 1: EFI 512MB vfat, partition 2: root ext4
Layout (BIOS): MBR — partition 1: root ext4
EOF
exit 1
}
DEVICE="${1:-}" DEVICE="${1:-}"
LOGFILE="${2:-/tmp/bee-install.log}" LOGFILE="${2:-/tmp/bee-install.log}"
if [ -z "$DEVICE" ]; then if [ -z "$DEVICE" ] || [ "$DEVICE" = "--help" ] || [ "$DEVICE" = "-h" ]; then
echo "Usage: bee-install <device> [logfile]" >&2 usage
exit 1
fi fi
if [ ! -b "$DEVICE" ]; then if [ ! -b "$DEVICE" ]; then
echo "ERROR: $DEVICE is not a block device" >&2 echo "ERROR: $DEVICE is not a block device" >&2
echo "Run 'lsblk' to list available disks." >&2
exit 1 exit 1
fi fi
# Block CD-ROM devices
case "$DEVICE" in
/dev/sr*|/dev/scd*)
echo "ERROR: $DEVICE is a CD-ROM/optical device — cannot install to it." >&2
echo "Run 'lsblk' to find the target disk (e.g. /dev/sda, /dev/nvme0n1)." >&2
exit 1
;;
esac
# Check required tools
for tool in parted mkfs.vfat mkfs.ext4 unsquashfs grub-install update-grub; do
if ! command -v "$tool" >/dev/null 2>&1; then
echo "ERROR: required tool not found: $tool" >&2
exit 1
fi
done
SQUASHFS="/run/live/medium/live/filesystem.squashfs" SQUASHFS="/run/live/medium/live/filesystem.squashfs"
if [ ! -f "$SQUASHFS" ]; then if [ ! -f "$SQUASHFS" ]; then

View File

@@ -7,6 +7,8 @@ EXCLUDE=""
FORMAT="" FORMAT=""
JOHN_DIR="/usr/local/lib/bee/john/run" JOHN_DIR="/usr/local/lib/bee/john/run"
JOHN_BIN="${JOHN_DIR}/john" JOHN_BIN="${JOHN_DIR}/john"
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
usage() { usage() {
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2 echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
@@ -23,6 +25,95 @@ contains_csv() {
echo ",${haystack}," | grep -q ",${needle}," echo ",${haystack}," | grep -q ",${needle},"
} }
# Dump OpenCL/NVIDIA runtime state to stderr so a failed device probe can
# be debugged: ICD registry contents, NVIDIA device nodes, loader cache
# entries, clinfo output (when installed) and John's own device listing.
# Every step is best-effort; the function itself always succeeds.
show_opencl_diagnostics() {
    echo "-- OpenCL ICD vendors --" >&2
    if [ ! -d /etc/OpenCL/vendors ]; then
        echo " /etc/OpenCL/vendors is missing" >&2
    else
        ls -l /etc/OpenCL/vendors >&2 || true
        # Show each registered ICD file along with its contents.
        for vendor_icd in /etc/OpenCL/vendors/*.icd; do
            if [ -f "${vendor_icd}" ]; then
                echo " file: ${vendor_icd}" >&2
                sed 's/^/ /' "${vendor_icd}" >&2 || true
            fi
        done
    fi
    echo "-- NVIDIA device nodes --" >&2
    ls -l /dev/nvidia* >&2 || true
    echo "-- ldconfig OpenCL/NVIDIA --" >&2
    ldconfig -p 2>/dev/null | grep 'libOpenCL\|libcuda\|libnvidia-opencl' >&2 || true
    if command -v clinfo >/dev/null 2>&1; then
        echo "-- clinfo -l --" >&2
        clinfo -l >&2 || true
    fi
    echo "-- john --list=opencl-devices --" >&2
    ./john --list=opencl-devices >&2 || true
}
# Best-effort refresh of the NVIDIA userspace runtime. Requires root
# (returns 1 otherwise); re-runs the bee-nvidia-load helper when present
# and rebuilds the dynamic linker cache, ignoring failures of either.
refresh_nvidia_runtime() {
    [ "$(id -u)" = "0" ] || return 1
    if command -v bee-nvidia-load >/dev/null 2>&1; then
        bee-nvidia-load >/dev/null 2>&1 || true
    fi
    ldconfig >/dev/null 2>&1 || true
    return 0
}
# Ensure the nvidia_uvm kernel module is loaded and its device nodes
# exist. No-op when the module is already present; otherwise requires
# root and the packaged .ko under /usr/local/lib/nvidia. Returns 1 when
# the module cannot be loaded.
ensure_nvidia_uvm() {
    if lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
        return 0
    fi
    [ "$(id -u)" = "0" ] || return 1
    uvm_ko="/usr/local/lib/nvidia/nvidia-uvm.ko"
    [ -f "${uvm_ko}" ] || return 1
    insmod "${uvm_ko}" >/dev/null 2>&1 || return 1
    # The major number is assigned dynamically; read it from /proc/devices
    # and create /dev/nvidia-uvm{,-tools} by hand (no udev on the live CD).
    uvm_major=$(awk '/ nvidia-uvm$/ {print $1; exit}' /proc/devices)
    if [ -n "${uvm_major}" ]; then
        mknod -m 666 /dev/nvidia-uvm c "${uvm_major}" 0 2>/dev/null || true
        mknod -m 666 /dev/nvidia-uvm-tools c "${uvm_major}" 1 2>/dev/null || true
    fi
    return 0
}
# Confirm John can enumerate at least one OpenCL device, retrying the
# probe after each recovery step (userspace runtime refresh, then
# nvidia_uvm load). On failure, prints a summary plus full diagnostics
# to stderr and returns 1.
ensure_opencl_ready() {
    probe=$(./john --list=opencl-devices 2>&1 || true)
    case "${probe}" in
    *"Device #"*) return 0 ;;
    esac
    # Recovery 1: reload NVIDIA userspace runtime, then re-probe.
    if refresh_nvidia_runtime; then
        probe=$(./john --list=opencl-devices 2>&1 || true)
        case "${probe}" in
        *"Device #"*) return 0 ;;
        esac
    fi
    # Recovery 2: load nvidia_uvm and create device nodes, then re-probe.
    if ensure_nvidia_uvm; then
        probe=$(./john --list=opencl-devices 2>&1 || true)
        case "${probe}" in
        *"Device #"*) return 0 ;;
        esac
    fi
    echo "OpenCL devices are not available for John." >&2
    if ! lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
        echo "nvidia_uvm is not loaded." >&2
    fi
    if [ ! -e /dev/nvidia-uvm ]; then
        echo "/dev/nvidia-uvm is missing." >&2
    fi
    show_opencl_diagnostics
    return 1
}
while [ "$#" -gt 0 ]; do while [ "$#" -gt 0 ]; do
case "$1" in case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;; --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
@@ -76,6 +167,8 @@ echo "john_devices=${JOHN_DEVICES}"
cd "${JOHN_DIR}" cd "${JOHN_DIR}"
ensure_opencl_ready || exit 1
choose_format() { choose_format() {
if [ -n "${FORMAT}" ]; then if [ -n "${FORMAT}" ]; then
echo "${FORMAT}" echo "${FORMAT}"

View File

@@ -17,7 +17,7 @@ mkdir -p "$(dirname "$log_file")"
serial_sink() { serial_sink() {
local tty="$1" local tty="$1"
if [ -w "$tty" ]; then if [ -w "$tty" ]; then
cat > "$tty" cat > "$tty" 2>/dev/null || true
else else
cat > /dev/null cat > /dev/null
fi fi

View File

@@ -59,11 +59,24 @@ load_module() {
return 1 return 1
} }
# Try to load a module shipped with the host kernel via modprobe (as
# opposed to the bee-packaged NVIDIA modules). Logs on success and
# returns 0; returns 1 when modprobe fails.
load_host_module() {
    mod="$1"
    modprobe "$mod" >/dev/null 2>&1 || return 1
    log "host module loaded: $mod"
    return 0
}
case "$nvidia_mode" in case "$nvidia_mode" in
normal|full) normal|full)
if ! load_module nvidia; then if ! load_module nvidia; then
exit 1 exit 1
fi fi
# nvidia-modeset on some server kernels needs ACPI video helper symbols
# exported by the generic "video" module. Best-effort only; compute paths
# remain functional even if display-related modules stay absent.
load_host_module video || true
load_module nvidia-modeset || true load_module nvidia-modeset || true
load_module nvidia-uvm || true load_module nvidia-uvm || true
;; ;;