Compare commits

...

11 Commits
v3.10 ... v3.15

26 changed files with 831 additions and 186 deletions

View File

@@ -16,7 +16,7 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", []satJob{
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
job,
@@ -24,6 +24,17 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
}, logFunc)
}
func nvidiaStressArchivePrefix(loader string) string {
switch strings.TrimSpace(strings.ToLower(loader)) {
case NvidiaStressLoaderJohn:
return "gpu-nvidia-john"
case NvidiaStressLoaderNCCL:
return "gpu-nvidia-nccl"
default:
return "gpu-nvidia-burn"
}
}
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
if err != nil {

View File

@@ -10,9 +10,11 @@ import (
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"time"
)
@@ -374,10 +376,17 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
return nil, fmt.Errorf("stressapptest not found: %w", err)
}
// Use a very long duration; the context timeout will kill it at the right time.
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
if threads := platformStressCPUThreads(); threads > 0 {
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
}
if mb := platformStressMemoryMB(); mb > 0 {
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
}
cmd := exec.CommandContext(ctx, path, cmdArgs...)
cmd.Stdout = nil
cmd.Stderr = nil
if err := cmd.Start(); err != nil {
if err := startLowPriorityCmd(cmd, 15); err != nil {
return nil, fmt.Errorf("stressapptest start: %w", err)
}
return cmd, nil
@@ -418,7 +427,7 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
cmd.Stdout = nil
cmd.Stderr = nil
_ = cmd.Start()
_ = startLowPriorityCmd(cmd, 10)
return cmd
}
@@ -433,10 +442,50 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
cmd.Stdout = nil
cmd.Stderr = nil
_ = cmd.Start()
_ = startLowPriorityCmd(cmd, 10)
return cmd
}
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
if err := cmd.Start(); err != nil {
return err
}
if cmd.Process != nil {
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
}
return nil
}
func platformStressCPUThreads() int {
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
return n
}
cpus := runtime.NumCPU()
switch {
case cpus <= 2:
return 1
case cpus <= 8:
return cpus - 1
default:
return cpus - 2
}
}
func platformStressMemoryMB() int {
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
return mb
}
free := freeMemBytes()
if free <= 0 {
return 0
}
mb := int((free * 60) / 100 / (1024 * 1024))
if mb < 1024 {
return 1024
}
return mb
}
func packPlatformDir(dir, dest string) error {
f, err := os.Create(dest)
if err != nil {

View File

@@ -0,0 +1,34 @@
package platform
import (
"runtime"
"testing"
)
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
if got := platformStressCPUThreads(); got != 7 {
t.Fatalf("platformStressCPUThreads=%d want 7", got)
}
}
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
got := platformStressCPUThreads()
if got < 1 {
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
}
if got > runtime.NumCPU() {
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
}
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
}
}
func TestPlatformStressMemoryMBOverride(t *testing.T) {
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
if got := platformStressMemoryMB(); got != 8192 {
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
}
}

View File

@@ -684,7 +684,11 @@ func resolveSATCommand(cmd []string) ([]string, error) {
case "rvs":
return resolveRVSCommand(cmd[1:]...)
}
return cmd, nil
path, err := satLookPath(cmd[0])
if err != nil {
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
}
return append([]string{path}, cmd[1:]...), nil
}
func resolveRVSCommand(args ...string) ([]string, error) {

View File

@@ -162,6 +162,25 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
}
}
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
t.Parallel()
tests := []struct {
loader string
want string
}{
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
{loader: "", want: "gpu-nvidia-burn"},
}
for _, tt := range tests {
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
}
}
}
func TestEnvIntFallback(t *testing.T) {
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
@@ -237,6 +256,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
}
}
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
oldLookPath := satLookPath
satLookPath = func(file string) (string, error) {
if file == "stress-ng" {
return "/usr/bin/stress-ng", nil
}
return "", exec.ErrNotFound
}
t.Cleanup(func() { satLookPath = oldLookPath })
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
if err != nil {
t.Fatalf("resolveSATCommand error: %v", err)
}
if len(cmd) != 3 {
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
}
if cmd[0] != "/usr/bin/stress-ng" {
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
}
}
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
oldLookPath := satLookPath
satLookPath = func(file string) (string, error) {
return "", exec.ErrNotFound
}
t.Cleanup(func() { satLookPath = oldLookPath })
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
if err == nil {
t.Fatal("expected error")
}
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
t.Fatalf("error=%q", err)
}
}
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
tmp := t.TempDir()
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")

View File

@@ -4,9 +4,11 @@ import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"regexp"
@@ -179,19 +181,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
Profile string `json:"profile"`
DisplayName string `json:"display_name"`
}
if r.ContentLength > 0 {
_ = json.NewDecoder(r.Body).Decode(&body)
}
name := taskNames[target]
if body.Profile != "" {
if n, ok := burnNames[target]; ok {
name = n
if r.Body != nil {
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
writeError(w, http.StatusBadRequest, "invalid request body")
return
}
}
if name == "" {
name = target
}
name := taskDisplayName(target, body.Profile, body.Loader)
t := &Task{
ID: newJobID("sat-" + target),
Name: name,
@@ -667,6 +664,22 @@ func (h *handler) handleAPIInstallStream(w http.ResponseWriter, r *http.Request)
// ── Metrics SSE ───────────────────────────────────────────────────────────────
func (h *handler) handleAPIMetricsLatest(w http.ResponseWriter, r *http.Request) {
sample, ok := h.latestMetric()
if !ok {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte("{}"))
return
}
b, err := json.Marshal(sample)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write(b)
}
func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) {
if !sseStart(w) {
return
@@ -917,8 +930,31 @@ func parseXrandrOutput(out string) []displayInfo {
return infos
}
func xrandrCommand(args ...string) *exec.Cmd {
cmd := exec.Command("xrandr", args...)
env := append([]string{}, os.Environ()...)
hasDisplay := false
hasXAuthority := false
for _, kv := range env {
if strings.HasPrefix(kv, "DISPLAY=") && strings.TrimPrefix(kv, "DISPLAY=") != "" {
hasDisplay = true
}
if strings.HasPrefix(kv, "XAUTHORITY=") && strings.TrimPrefix(kv, "XAUTHORITY=") != "" {
hasXAuthority = true
}
}
if !hasDisplay {
env = append(env, "DISPLAY=:0")
}
if !hasXAuthority {
env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
}
cmd.Env = env
return cmd
}
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
out, err := exec.Command("xrandr").Output()
out, err := xrandrCommand().Output()
if err != nil {
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
return
@@ -945,7 +981,7 @@ func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
writeError(w, http.StatusBadRequest, "invalid output name")
return
}
if out, err := exec.Command("xrandr", "--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
return
}

View File

@@ -0,0 +1,64 @@
package webui
import (
"net/http/httptest"
"strings"
"testing"
"bee/audit/internal/app"
)
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
t.Setenv("DISPLAY", "")
t.Setenv("XAUTHORITY", "")
cmd := xrandrCommand("--query")
var hasDisplay bool
var hasXAuthority bool
for _, kv := range cmd.Env {
if kv == "DISPLAY=:0" {
hasDisplay = true
}
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
hasXAuthority = true
}
}
if !hasDisplay {
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
}
if !hasXAuthority {
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
}
}
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks
globalQueue.tasks = nil
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = originalTasks
globalQueue.mu.Unlock()
})
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
req.ContentLength = -1
rec := httptest.NewRecorder()
h.handleAPISATRun("cpu").ServeHTTP(rec, req)
if rec.Code != 200 {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
if len(globalQueue.tasks) != 1 {
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
}
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
t.Fatalf("burn profile=%q want smoke", got)
}
}

View File

@@ -289,7 +289,7 @@ func renderAudit() string {
func renderHardwareSummaryCard(opts HandlerOptions) string {
data, err := loadSnapshot(opts.AuditPath)
if err != nil {
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-unknown">No audit data</span></div></div>`
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><button class="btn btn-primary" onclick="auditModalRun()">&#9654; Run Audit</button></div></div>`
}
// Parse just enough fields for the summary banner
var snap struct {
@@ -532,16 +532,10 @@ function refreshCharts() {
}
setInterval(refreshCharts, 3000);
const es = new EventSource('/api/metrics/stream');
es.addEventListener('metrics', e => {
const d = JSON.parse(e.data);
// Show/hide Fan RPM card based on data availability
fetch('/api/metrics/latest').then(r => r.json()).then(d => {
const fanCard = document.getElementById('card-server-fans');
if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
});
es.onerror = () => {};
}).catch(() => {});
</script>`
}

View File

@@ -270,6 +270,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG)
mux.HandleFunc("GET /api/metrics/export.csv", h.handleAPIMetricsExportCSV)
@@ -1230,13 +1231,6 @@ probe();
func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
page := strings.TrimPrefix(r.URL.Path, "/")
if page == "" {
// Serve loading page until audit snapshot exists
if _, err := os.Stat(h.opts.AuditPath); err != nil {
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "text/html; charset=utf-8")
_, _ = w.Write([]byte(loadingPageHTML))
return
}
page = "dashboard"
}
// Redirect old routes to new names

View File

@@ -136,6 +136,33 @@ func TestRootRendersDashboard(t *testing.T) {
}
}
func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
dir := t.TempDir()
exportDir := filepath.Join(dir, "export")
if err := os.MkdirAll(exportDir, 0755); err != nil {
t.Fatal(err)
}
handler := NewHandler(HandlerOptions{
Title: "Bee Hardware Audit",
AuditPath: filepath.Join(dir, "missing-audit.json"),
ExportDir: exportDir,
})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
if !strings.Contains(body, `Run Audit`) {
t.Fatalf("dashboard missing run audit button: %s", body)
}
if strings.Contains(body, `No audit data`) {
t.Fatalf("dashboard still shows empty audit badge: %s", body)
}
}
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")

View File

@@ -8,6 +8,7 @@ import (
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
@@ -51,6 +52,33 @@ var burnNames = map[string]string{
"amd": "AMD GPU Burn-in",
}
func nvidiaStressTaskName(loader string) string {
switch strings.TrimSpace(strings.ToLower(loader)) {
case platform.NvidiaStressLoaderJohn:
return "NVIDIA GPU Stress (John/OpenCL)"
case platform.NvidiaStressLoaderNCCL:
return "NVIDIA GPU Stress (NCCL)"
default:
return "NVIDIA GPU Stress (bee-gpu-burn)"
}
}
func taskDisplayName(target, profile, loader string) string {
name := taskNames[target]
if profile != "" {
if n, ok := burnNames[target]; ok {
name = n
}
}
if target == "nvidia-stress" {
name = nvidiaStressTaskName(loader)
}
if name == "" {
name = target
}
return name
}
// Task represents one unit of work in the queue.
type Task struct {
ID string `json:"id"`
@@ -440,6 +468,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
if dur <= 0 {
dur = 60
}
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
case "amd":
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)

View File

@@ -95,6 +95,23 @@ func TestResolveBurnPreset(t *testing.T) {
}
}
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
tests := []struct {
loader string
want string
}{
{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
}
for _, tc := range tests {
if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
}
}
}
func TestRunTaskHonorsCancel(t *testing.T) {
t.Parallel()
@@ -154,3 +171,34 @@ func TestRunTaskHonorsCancel(t *testing.T) {
t.Fatal("runTask did not return after cancel")
}
}
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
t.Parallel()
var gotDuration int
q := &taskQueue{
opts: &HandlerOptions{App: &app.App{}},
}
tk := &Task{
ID: "cpu-burn-1",
Name: "CPU Burn-in",
Target: "cpu",
Status: TaskRunning,
CreatedAt: time.Now(),
params: taskParams{BurnProfile: "smoke"},
}
j := &jobState{}
orig := runCPUAcceptancePackCtx
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
gotDuration = durationSec
return "/tmp/cpu-burn.tar.gz", nil
}
defer func() { runCPUAcceptancePackCtx = orig }()
q.runTask(tk, j, context.Background())
if gotDuration != 5*60 {
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
}
}

2
bible

Submodule bible updated: 688b87e98d...456c1f022c

View File

@@ -13,9 +13,10 @@ Use one of:
This applies to:
- `iso/builder/config/package-lists/*.list.chroot`
- Any package referenced in `grub.cfg`, hooks, or overlay scripts (e.g. file paths like `/boot/memtest86+x64.bin`)
- Any package referenced in bootloader configs, hooks, or overlay scripts
## Example of what goes wrong without this
## Memtest rule
`memtest86+` in Debian bookworm installs `/boot/memtest86+x64.bin`, not `/boot/memtest86+.bin`.
Guessing the filename caused a broken GRUB entry that only surfaced at boot time, after a full rebuild.
Prefer live-build's built-in memtest integration over custom hooks or hardcoded
bootloader paths. If you ever need to reference memtest files manually, verify
the exact package file list first for the target Debian release.

View File

@@ -29,7 +29,7 @@ lb config noauto \
--security true \
--linux-flavours "amd64" \
--linux-packages "${LB_LINUX_PACKAGES}" \
--memtest none \
--memtest memtest86+ \
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \

View File

@@ -36,6 +36,7 @@ typedef void *CUstream;
#define MAX_CUBLAS_PROFILES 5
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
#define STRESS_LAUNCH_DEPTH 8
static const char *ptx_source =
".version 6.0\n"
@@ -422,24 +423,31 @@ static int run_ptx_fallback(struct cuda_api *api,
double deadline = start + (double)seconds;
while (now_seconds() < deadline) {
launches_per_wave = 0;
for (int lane = 0; lane < stream_count; lane++) {
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
if (!check_rc(api,
"cuLaunchKernel",
api->cuLaunchKernel(kernel,
blocks,
1,
1,
threads,
1,
1,
0,
streams[lane],
params[lane],
NULL))) {
goto fail;
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
int launched_this_batch = 0;
for (int lane = 0; lane < stream_count; lane++) {
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
if (!check_rc(api,
"cuLaunchKernel",
api->cuLaunchKernel(kernel,
blocks,
1,
1,
threads,
1,
1,
0,
streams[lane],
params[lane],
NULL))) {
goto fail;
}
launches_per_wave++;
launched_this_batch++;
}
if (launched_this_batch <= 0) {
break;
}
launches_per_wave++;
}
if (launches_per_wave <= 0) {
goto fail;
@@ -460,10 +468,11 @@ static int run_ptx_fallback(struct cuda_api *api,
report->iterations = iterations;
snprintf(report->details,
sizeof(report->details),
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
size_mb,
report->buffer_mb,
report->stream_count,
STRESS_LAUNCH_DEPTH,
bytes_per_stream[0] / (1024u * 1024u),
iterations);
@@ -1184,10 +1193,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
append_detail(report->details,
sizeof(report->details),
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
size_mb,
report->buffer_mb,
report->stream_count,
STRESS_LAUNCH_DEPTH,
mp_count,
per_profile_budget / (1024u * 1024u));
@@ -1239,26 +1249,33 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
double deadline = now_seconds() + (double)seconds;
while (now_seconds() < deadline) {
wave_launches = 0;
for (int i = 0; i < prepared_count; i++) {
if (!prepared[i].ready) {
continue;
}
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
append_detail(report->details,
sizeof(report->details),
"%s=FAILED runtime\n",
prepared[i].desc.name);
for (int j = 0; j < prepared_count; j++) {
destroy_profile(&cublas, cuda, &prepared[j]);
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
int launched_this_batch = 0;
for (int i = 0; i < prepared_count; i++) {
if (!prepared[i].ready) {
continue;
}
cublas.cublasLtDestroy(handle);
destroy_streams(cuda, streams, stream_count);
cuda->cuCtxDestroy(ctx);
return 0;
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
append_detail(report->details,
sizeof(report->details),
"%s=FAILED runtime\n",
prepared[i].desc.name);
for (int j = 0; j < prepared_count; j++) {
destroy_profile(&cublas, cuda, &prepared[j]);
}
cublas.cublasLtDestroy(handle);
destroy_streams(cuda, streams, stream_count);
cuda->cuCtxDestroy(ctx);
return 0;
}
prepared[i].iterations++;
report->iterations++;
wave_launches++;
launched_this_batch++;
}
if (launched_this_batch <= 0) {
break;
}
prepared[i].iterations++;
report->iterations++;
wave_launches++;
}
if (wave_launches <= 0) {
break;

View File

@@ -111,8 +111,231 @@ resolve_iso_version() {
resolve_audit_version
}
iso_list_files() {
iso_path="$1"
if command -v bsdtar >/dev/null 2>&1; then
bsdtar -tf "$iso_path"
return $?
fi
if command -v xorriso >/dev/null 2>&1; then
xorriso -indev "$iso_path" -find / -type f -print 2>/dev/null | sed 's#^/##'
return $?
fi
return 127
}
iso_extract_file() {
iso_path="$1"
iso_member="$2"
if command -v bsdtar >/dev/null 2>&1; then
bsdtar -xOf "$iso_path" "$iso_member"
return $?
fi
if command -v xorriso >/dev/null 2>&1; then
xorriso -osirrox on -indev "$iso_path" -cat "/$iso_member" 2>/dev/null
return $?
fi
return 127
}
require_iso_reader() {
command -v bsdtar >/dev/null 2>&1 && return 0
command -v xorriso >/dev/null 2>&1 && return 0
memtest_fail "ISO reader is required for validation/debug (expected bsdtar or xorriso)" "${1:-}"
}
dump_memtest_debug() {
phase="$1"
lb_dir="${2:-}"
iso_path="${3:-}"
phase_slug="$(printf '%s' "${phase}" | tr ' /' '__')"
memtest_log="${LOG_DIR:-}/memtest-${phase_slug}.log"
(
echo "=== memtest debug: ${phase} ==="
echo "-- auto/config --"
if [ -f "${BUILDER_DIR}/auto/config" ]; then
grep -n -- '--memtest' "${BUILDER_DIR}/auto/config" || echo " (no --memtest line found)"
else
echo " (missing ${BUILDER_DIR}/auto/config)"
fi
echo "-- source bootloader templates --"
for cfg in \
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
if [ -f "$cfg" ]; then
echo " file: $cfg"
grep -n 'Memory Test\|memtest' "$cfg" || echo " (no memtest lines)"
fi
done
if [ -n "$lb_dir" ] && [ -d "$lb_dir" ]; then
echo "-- live-build workdir package lists --"
for pkg in \
"$lb_dir/config/package-lists/bee.list.chroot" \
"$lb_dir/config/package-lists/bee-gpu.list.chroot" \
"$lb_dir/config/package-lists/bee-nvidia.list.chroot"; do
if [ -f "$pkg" ]; then
echo " file: $pkg"
grep -n 'memtest' "$pkg" || echo " (no memtest lines)"
fi
done
echo "-- live-build chroot/boot --"
if [ -d "$lb_dir/chroot/boot" ]; then
find "$lb_dir/chroot/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
else
echo " (missing $lb_dir/chroot/boot)"
fi
echo "-- live-build binary/boot --"
if [ -d "$lb_dir/binary/boot" ]; then
find "$lb_dir/binary/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
else
echo " (missing $lb_dir/binary/boot)"
fi
echo "-- live-build package cache --"
if [ -d "$lb_dir/cache/packages.chroot" ]; then
find "$lb_dir/cache/packages.chroot" -maxdepth 1 -name 'memtest86+*.deb' -print | sed 's/^/ /' || true
else
echo " (missing $lb_dir/cache/packages.chroot)"
fi
fi
if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
echo "-- ISO memtest files --"
iso_list_files "$iso_path" | grep 'memtest' | sed 's/^/ /' || echo " (no memtest files in ISO)"
echo "-- ISO GRUB memtest lines --"
iso_extract_file "$iso_path" boot/grub/grub.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in boot/grub/grub.cfg)"
echo "-- ISO isolinux memtest lines --"
iso_extract_file "$iso_path" isolinux/live.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in isolinux/live.cfg)"
fi
echo "=== end memtest debug: ${phase} ==="
) | {
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ]; then
tee "${memtest_log}"
else
cat
fi
}
}
memtest_fail() {
msg="$1"
iso_path="${2:-}"
echo "ERROR: ${msg}" >&2
dump_memtest_debug "failure" "${LB_DIR:-}" "$iso_path" >&2
exit 1
}
validate_iso_memtest() {
iso_path="$1"
echo "=== validating memtest in ISO ==="
[ -f "$iso_path" ] || memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
require_iso_reader "$iso_path"
iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.bin$' || {
memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
}
iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.efi$' || {
memtest_fail "memtest EFI binary missing in ISO: boot/memtest86+x64.efi" "$iso_path"
}
grub_cfg="$(mktemp)"
isolinux_cfg="$(mktemp)"
iso_extract_file "$iso_path" boot/grub/grub.cfg > "$grub_cfg" || memtest_fail "failed to extract boot/grub/grub.cfg from ISO" "$iso_path"
iso_extract_file "$iso_path" isolinux/live.cfg > "$isolinux_cfg" || memtest_fail "failed to extract isolinux/live.cfg from ISO" "$iso_path"
grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
memtest_fail "GRUB menu entry for memtest is missing" "$iso_path"
}
grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
memtest_fail "GRUB memtest EFI path is missing" "$iso_path"
}
grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
memtest_fail "GRUB memtest BIOS path is missing" "$iso_path"
}
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
memtest_fail "isolinux menu entry for memtest is missing" "$iso_path"
}
grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
memtest_fail "isolinux memtest path is missing" "$iso_path"
}
rm -f "$grub_cfg" "$isolinux_cfg"
echo "=== memtest validation OK ==="
}
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
LOG_DIR="${DIST_DIR}/${ISO_BASENAME}.logs"
LOG_ARCHIVE="${DIST_DIR}/${ISO_BASENAME}.logs.tar.gz"
ISO_OUT="${DIST_DIR}/${ISO_BASENAME}.iso"
LOG_OUT="${LOG_DIR}/build.log"
cleanup_build_log() {
status="${1:-$?}"
trap - EXIT INT TERM HUP
if [ "${BUILD_LOG_ACTIVE:-0}" = "1" ]; then
BUILD_LOG_ACTIVE=0
exec 1>&3 2>&4
exec 3>&- 4>&-
if [ -n "${BUILD_TEE_PID:-}" ]; then
wait "${BUILD_TEE_PID}" 2>/dev/null || true
fi
rm -f "${BUILD_LOG_PIPE}"
fi
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ] && command -v tar >/dev/null 2>&1; then
rm -f "${LOG_ARCHIVE}"
tar -czf "${LOG_ARCHIVE}" -C "${DIST_DIR}" "$(basename "${LOG_DIR}")" 2>/dev/null || true
fi
exit "${status}"
}
start_build_log() {
command -v tee >/dev/null 2>&1 || {
echo "ERROR: tee is required for build logging" >&2
exit 1
}
rm -rf "${LOG_DIR}"
rm -f "${LOG_ARCHIVE}"
mkdir -p "${LOG_DIR}"
BUILD_LOG_PIPE="$(mktemp -u "${TMPDIR:-/tmp}/bee-build-log.XXXXXX")"
mkfifo "${BUILD_LOG_PIPE}"
exec 3>&1 4>&2
tee "${LOG_OUT}" < "${BUILD_LOG_PIPE}" &
BUILD_TEE_PID=$!
exec > "${BUILD_LOG_PIPE}" 2>&1
BUILD_LOG_ACTIVE=1
trap 'cleanup_build_log "$?"' EXIT INT TERM HUP
echo "=== build log dir: ${LOG_DIR} ==="
echo "=== build log: ${LOG_OUT} ==="
echo "=== build log archive: ${LOG_ARCHIVE} ==="
}
start_build_log
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
# If headers for the detected ABI are not yet installed (kernel updated since image build),
@@ -245,13 +468,13 @@ rm -f \
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
rm -rf \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
# Remove NVIDIA-specific overlay files for non-nvidia variants
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
@@ -304,7 +527,6 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true
ln -sfn bee-gpu-burn "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
fi
# --- inject smoketest into overlay so it runs directly on the live CD ---
@@ -510,6 +732,7 @@ export BEE_GPU_VENDOR_UPPER
cd "${LB_DIR}"
lb clean 2>&1 | tail -3
lb config 2>&1 | tail -5
dump_memtest_debug "pre-build" "${LB_DIR}"
lb build 2>&1
# --- persist deb package cache back to shared location ---
@@ -521,8 +744,9 @@ fi
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
if [ -f "$ISO_RAW" ]; then
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
validate_iso_memtest "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT"
echo ""
echo "=== done (${BEE_GPU_VENDOR}) ==="

View File

@@ -22,3 +22,7 @@ label live-@FLAVOUR@-failsafe
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
label memtest
menu label ^Memory Test (memtest86+)
linux /boot/memtest86+x64.bin

View File

@@ -1,76 +0,0 @@
#!/bin/sh
# Copy memtest86+ binaries from chroot /boot into the ISO boot directory
# so GRUB can chainload them directly (they must be on the ISO filesystem,
# not inside the squashfs).
#
# Primary: copy from chroot/boot/ (populated by package postinst).
# Naming fallbacks:
# Debian Bookworm: /boot/memtest86+ — EFI PE64 (no extension)
# /boot/memtest86+.bin — legacy binary
# Upstream/Ubuntu: /boot/memtest86+x64.efi, /boot/memtest86+x64.bin, etc.
# Last resort: extract directly from the cached .deb if postinst didn't place
# the files (happens in chroot environments without grub triggers).
set -e
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi"
# Ensure destination directory exists (absence caused silent copy failures).
mkdir -p binary/boot
echo "memtest: scanning chroot/boot/ for memtest files:"
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files in chroot/boot/"
# Primary path: copy upstream-named files from chroot/boot/
for f in ${MEMTEST_FILES}; do
src="chroot/boot/${f}"
if [ -f "${src}" ]; then
cp "${src}" "binary/boot/${f}"
echo "memtest: copied ${f} from chroot/boot/"
fi
done
# Debian Bookworm naming fallback: /boot/memtest86+ (no extension) is the EFI binary.
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "chroot/boot/memtest86+" ]; then
cp "chroot/boot/memtest86+" "binary/boot/memtest86+x64.efi"
echo "memtest: copied /boot/memtest86+ as memtest86+x64.efi (Debian naming)"
fi
if [ ! -f "binary/boot/memtest86+x64.bin" ] && [ -f "chroot/boot/memtest86+.bin" ]; then
cp "chroot/boot/memtest86+.bin" "binary/boot/memtest86+x64.bin"
echo "memtest: copied /boot/memtest86+.bin as memtest86+x64.bin (Debian naming)"
fi
# Last resort: if EFI binary still missing, extract from cached .deb
if [ ! -f "binary/boot/memtest86+x64.efi" ]; then
echo "memtest: EFI binary missing — attempting extraction from .deb cache"
deb=$(find chroot/var/cache/apt/archives/ chroot/var/lib/apt/lists/ \
-name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null \
| head -1)
if [ -z "$deb" ]; then
deb=$(find cache/ -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null | head -1)
fi
if [ -n "$deb" ]; then
echo "memtest: extracting from ${deb}"
EXTRACT_DIR="$(mktemp -d)"
dpkg-deb -x "${deb}" "${EXTRACT_DIR}"
echo "memtest: files found in .deb:"
find "${EXTRACT_DIR}/boot" -type f 2>/dev/null || echo " (none in /boot)"
for f in ${MEMTEST_FILES}; do
src="${EXTRACT_DIR}/boot/${f}"
if [ -f "${src}" ]; then
cp "${src}" "binary/boot/${f}"
echo "memtest: extracted ${f} from .deb"
fi
done
# Debian naming fallback inside .deb as well
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "${EXTRACT_DIR}/boot/memtest86+" ]; then
cp "${EXTRACT_DIR}/boot/memtest86+" "binary/boot/memtest86+x64.efi"
echo "memtest: extracted /boot/memtest86+ as memtest86+x64.efi from .deb"
fi
rm -rf "${EXTRACT_DIR}"
else
echo "memtest: WARNING: no memtest86+ .deb found in cache — memtest will not be available"
fi
fi
echo "memtest: binary/boot/ contents:"
ls binary/boot/memtest* 2>/dev/null || echo " (none)"

View File

@@ -21,14 +21,15 @@ openssh-server
# Disk installer
squashfs-tools
parted
# grub-pc / grub-efi-amd64 provide grub-install + grub2-common (required for chroot install).
# The -bin variants only carry binary modules and do NOT include grub-install itself.
grub-pc
# Keep GRUB install tools without selecting a single active platform package.
# grub-pc and grub-efi-amd64 conflict with each other, but grub2-common
# provides grub-install/update-grub and the *-bin packages provide BIOS/UEFI modules.
grub2-common
grub-pc-bin
grub-efi-amd64
grub-efi-amd64-bin
grub-efi-amd64-signed
shim-signed
efibootmgr
# Filesystem support for USB export targets
exfatprogs
@@ -50,7 +51,6 @@ sudo
zstd
mstflint
memtester
memtest86+
stress-ng
stressapptest

View File

@@ -1,25 +1,9 @@
[Unit]
Description=Bee: schedule startup hardware audit via task queue
# Start AFTER bee-web, not before — bee-web must not wait for audit.
After=bee-web.service
Wants=bee-web.service
Description=Bee: on-demand hardware audit (not started automatically)
[Service]
Type=oneshot
RemainAfterExit=yes
# Wait up to 90s for bee-web to respond on /healthz, then sleep 60s for
# the system to settle (GPU drivers, sensors), then enqueue the audit as
# a background task so it appears in the task list and logs.
ExecStart=/bin/sh -c '\
i=0; \
while [ $i -lt 90 ]; do \
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi; \
sleep 1; i=$((i+1)); \
done; \
sleep 60; \
curl -sf -X POST http://localhost/api/audit/run >/dev/null'
ExecStart=/bin/sh -c 'curl -sf -X POST http://localhost/api/audit/run >/dev/null'
StandardOutput=journal
StandardError=journal
[Install]
WantedBy=multi-user.target

View File

@@ -12,17 +12,55 @@
set -euo pipefail
usage() {
cat >&2 <<'EOF'
Usage: bee-install <device> [logfile]
Installs the live system to a local disk (WIPES the target).
device Target block device, e.g. /dev/sda or /dev/nvme0n1
Must be a hard disk or NVMe — NOT a CD-ROM (/dev/sr*)
logfile Optional path for progress log (default: /tmp/bee-install.log)
Examples:
bee-install /dev/sda
bee-install /dev/nvme0n1
bee-install /dev/sdb /tmp/my-install.log
WARNING: ALL DATA ON <device> WILL BE ERASED.
Layout (UEFI): GPT — partition 1: EFI 512MB vfat, partition 2: root ext4
Layout (BIOS): MBR — partition 1: root ext4
EOF
exit 1
}
DEVICE="${1:-}"
LOGFILE="${2:-/tmp/bee-install.log}"
if [ -z "$DEVICE" ]; then
echo "Usage: bee-install <device> [logfile]" >&2
exit 1
if [ -z "$DEVICE" ] || [ "$DEVICE" = "--help" ] || [ "$DEVICE" = "-h" ]; then
usage
fi
if [ ! -b "$DEVICE" ]; then
echo "ERROR: $DEVICE is not a block device" >&2
echo "Run 'lsblk' to list available disks." >&2
exit 1
fi
# Block CD-ROM devices
case "$DEVICE" in
/dev/sr*|/dev/scd*)
echo "ERROR: $DEVICE is a CD-ROM/optical device — cannot install to it." >&2
echo "Run 'lsblk' to find the target disk (e.g. /dev/sda, /dev/nvme0n1)." >&2
exit 1
;;
esac
# Check required tools
for tool in parted mkfs.vfat mkfs.ext4 unsquashfs grub-install update-grub; do
if ! command -v "$tool" >/dev/null 2>&1; then
echo "ERROR: required tool not found: $tool" >&2
exit 1
fi
done
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
if [ ! -f "$SQUASHFS" ]; then

View File

@@ -7,6 +7,8 @@ EXCLUDE=""
FORMAT=""
JOHN_DIR="/usr/local/lib/bee/john/run"
JOHN_BIN="${JOHN_DIR}/john"
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
usage() {
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
@@ -23,6 +25,95 @@ contains_csv() {
echo ",${haystack}," | grep -q ",${needle},"
}
show_opencl_diagnostics() {
echo "-- OpenCL ICD vendors --" >&2
if [ -d /etc/OpenCL/vendors ]; then
ls -l /etc/OpenCL/vendors >&2 || true
for icd in /etc/OpenCL/vendors/*.icd; do
[ -f "${icd}" ] || continue
echo " file: ${icd}" >&2
sed 's/^/ /' "${icd}" >&2 || true
done
else
echo " /etc/OpenCL/vendors is missing" >&2
fi
echo "-- NVIDIA device nodes --" >&2
ls -l /dev/nvidia* >&2 || true
echo "-- ldconfig OpenCL/NVIDIA --" >&2
ldconfig -p 2>/dev/null | grep 'libOpenCL\|libcuda\|libnvidia-opencl' >&2 || true
if command -v clinfo >/dev/null 2>&1; then
echo "-- clinfo -l --" >&2
clinfo -l >&2 || true
fi
echo "-- john --list=opencl-devices --" >&2
./john --list=opencl-devices >&2 || true
}
refresh_nvidia_runtime() {
if [ "$(id -u)" != "0" ]; then
return 1
fi
if command -v bee-nvidia-load >/dev/null 2>&1; then
bee-nvidia-load >/dev/null 2>&1 || true
fi
ldconfig >/dev/null 2>&1 || true
return 0
}
ensure_nvidia_uvm() {
if lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
return 0
fi
if [ "$(id -u)" != "0" ]; then
return 1
fi
ko="/usr/local/lib/nvidia/nvidia-uvm.ko"
[ -f "${ko}" ] || return 1
if ! insmod "${ko}" >/dev/null 2>&1; then
return 1
fi
uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices | awk '{print $1}')
if [ -n "${uvm_major}" ]; then
mknod -m 666 /dev/nvidia-uvm c "${uvm_major}" 0 2>/dev/null || true
mknod -m 666 /dev/nvidia-uvm-tools c "${uvm_major}" 1 2>/dev/null || true
fi
return 0
}
ensure_opencl_ready() {
out=$(./john --list=opencl-devices 2>&1 || true)
if echo "${out}" | grep -q "Device #"; then
return 0
fi
if refresh_nvidia_runtime; then
out=$(./john --list=opencl-devices 2>&1 || true)
if echo "${out}" | grep -q "Device #"; then
return 0
fi
fi
if ensure_nvidia_uvm; then
out=$(./john --list=opencl-devices 2>&1 || true)
if echo "${out}" | grep -q "Device #"; then
return 0
fi
fi
echo "OpenCL devices are not available for John." >&2
if ! lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
echo "nvidia_uvm is not loaded." >&2
fi
if [ ! -e /dev/nvidia-uvm ]; then
echo "/dev/nvidia-uvm is missing." >&2
fi
show_opencl_diagnostics
return 1
}
while [ "$#" -gt 0 ]; do
case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
@@ -76,6 +167,8 @@ echo "john_devices=${JOHN_DEVICES}"
cd "${JOHN_DIR}"
ensure_opencl_ready || exit 1
choose_format() {
if [ -n "${FORMAT}" ]; then
echo "${FORMAT}"

View File

@@ -17,7 +17,7 @@ mkdir -p "$(dirname "$log_file")"
serial_sink() {
local tty="$1"
if [ -w "$tty" ]; then
cat > "$tty"
cat > "$tty" 2>/dev/null || true
else
cat > /dev/null
fi

View File

@@ -59,11 +59,24 @@ load_module() {
return 1
}
load_host_module() {
mod="$1"
if modprobe "$mod" >/dev/null 2>&1; then
log "host module loaded: $mod"
return 0
fi
return 1
}
case "$nvidia_mode" in
normal|full)
if ! load_module nvidia; then
exit 1
fi
# nvidia-modeset on some server kernels needs ACPI video helper symbols
# exported by the generic "video" module. Best-effort only; compute paths
# remain functional even if display-related modules stay absent.
load_host_module video || true
load_module nvidia-modeset || true
load_module nvidia-uvm || true
;;