Compare commits

...

8 Commits
v3.10 ... v3.13

19 changed files with 481 additions and 170 deletions

View File

@@ -10,9 +10,11 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"runtime"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
"syscall"
"time" "time"
) )
@@ -374,10 +376,17 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
return nil, fmt.Errorf("stressapptest not found: %w", err) return nil, fmt.Errorf("stressapptest not found: %w", err)
} }
// Use a very long duration; the context timeout will kill it at the right time. // Use a very long duration; the context timeout will kill it at the right time.
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test") cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
if threads := platformStressCPUThreads(); threads > 0 {
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
}
if mb := platformStressMemoryMB(); mb > 0 {
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
}
cmd := exec.CommandContext(ctx, path, cmdArgs...)
cmd.Stdout = nil cmd.Stdout = nil
cmd.Stderr = nil cmd.Stderr = nil
if err := cmd.Start(); err != nil { if err := startLowPriorityCmd(cmd, 15); err != nil {
return nil, fmt.Errorf("stressapptest start: %w", err) return nil, fmt.Errorf("stressapptest start: %w", err)
} }
return cmd, nil return cmd, nil
@@ -418,7 +427,7 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile) cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
cmd.Stdout = nil cmd.Stdout = nil
cmd.Stderr = nil cmd.Stderr = nil
_ = cmd.Start() _ = startLowPriorityCmd(cmd, 10)
return cmd return cmd
} }
@@ -433,10 +442,50 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64") cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
cmd.Stdout = nil cmd.Stdout = nil
cmd.Stderr = nil cmd.Stderr = nil
_ = cmd.Start() _ = startLowPriorityCmd(cmd, 10)
return cmd return cmd
} }
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
if err := cmd.Start(); err != nil {
return err
}
if cmd.Process != nil {
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
}
return nil
}
func platformStressCPUThreads() int {
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
return n
}
cpus := runtime.NumCPU()
switch {
case cpus <= 2:
return 1
case cpus <= 8:
return cpus - 1
default:
return cpus - 2
}
}
func platformStressMemoryMB() int {
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
return mb
}
free := freeMemBytes()
if free <= 0 {
return 0
}
mb := int((free * 60) / 100 / (1024 * 1024))
if mb < 1024 {
return 1024
}
return mb
}
func packPlatformDir(dir, dest string) error { func packPlatformDir(dir, dest string) error {
f, err := os.Create(dest) f, err := os.Create(dest)
if err != nil { if err != nil {

View File

@@ -0,0 +1,34 @@
package platform
import (
"runtime"
"testing"
)
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
if got := platformStressCPUThreads(); got != 7 {
t.Fatalf("platformStressCPUThreads=%d want 7", got)
}
}
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
got := platformStressCPUThreads()
if got < 1 {
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
}
if got > runtime.NumCPU() {
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
}
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
}
}
func TestPlatformStressMemoryMBOverride(t *testing.T) {
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
if got := platformStressMemoryMB(); got != 8192 {
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
}
}

View File

@@ -667,6 +667,22 @@ func (h *handler) handleAPIInstallStream(w http.ResponseWriter, r *http.Request)
// ── Metrics SSE ─────────────────────────────────────────────────────────────── // ── Metrics SSE ───────────────────────────────────────────────────────────────
func (h *handler) handleAPIMetricsLatest(w http.ResponseWriter, r *http.Request) {
sample, ok := h.latestMetric()
if !ok {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte("{}"))
return
}
b, err := json.Marshal(sample)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write(b)
}
func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) {
if !sseStart(w) { if !sseStart(w) {
return return

View File

@@ -532,16 +532,10 @@ function refreshCharts() {
} }
setInterval(refreshCharts, 3000); setInterval(refreshCharts, 3000);
const es = new EventSource('/api/metrics/stream'); fetch('/api/metrics/latest').then(r => r.json()).then(d => {
es.addEventListener('metrics', e => {
const d = JSON.parse(e.data);
// Show/hide Fan RPM card based on data availability
const fanCard = document.getElementById('card-server-fans'); const fanCard = document.getElementById('card-server-fans');
if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none'; if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
}).catch(() => {});
});
es.onerror = () => {};
</script>` </script>`
} }

View File

@@ -270,6 +270,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export // Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream) mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG) mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG)
mux.HandleFunc("GET /api/metrics/export.csv", h.handleAPIMetricsExportCSV) mux.HandleFunc("GET /api/metrics/export.csv", h.handleAPIMetricsExportCSV)
@@ -1230,13 +1231,6 @@ probe();
func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) { func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
page := strings.TrimPrefix(r.URL.Path, "/") page := strings.TrimPrefix(r.URL.Path, "/")
if page == "" { if page == "" {
// Serve loading page until audit snapshot exists
if _, err := os.Stat(h.opts.AuditPath); err != nil {
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "text/html; charset=utf-8")
_, _ = w.Write([]byte(loadingPageHTML))
return
}
page = "dashboard" page = "dashboard"
} }
// Redirect old routes to new names // Redirect old routes to new names

2
bible

Submodule bible updated: 688b87e98d...456c1f022c

View File

@@ -13,9 +13,10 @@ Use one of:
This applies to: This applies to:
- `iso/builder/config/package-lists/*.list.chroot` - `iso/builder/config/package-lists/*.list.chroot`
- Any package referenced in `grub.cfg`, hooks, or overlay scripts (e.g. file paths like `/boot/memtest86+x64.bin`) - Any package referenced in bootloader configs, hooks, or overlay scripts
## Example of what goes wrong without this ## Memtest rule
`memtest86+` in Debian bookworm installs `/boot/memtest86+x64.bin`, not `/boot/memtest86+.bin`. Prefer live-build's built-in memtest integration over custom hooks or hardcoded
Guessing the filename caused a broken GRUB entry that only surfaced at boot time, after a full rebuild. bootloader paths. If you ever need to reference memtest files manually, verify
the exact package file list first for the target Debian release.

View File

@@ -29,7 +29,7 @@ lb config noauto \
--security true \ --security true \
--linux-flavours "amd64" \ --linux-flavours "amd64" \
--linux-packages "${LB_LINUX_PACKAGES}" \ --linux-packages "${LB_LINUX_PACKAGES}" \
--memtest none \ --memtest memtest86+ \
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \

View File

@@ -36,6 +36,7 @@ typedef void *CUstream;
#define MAX_CUBLAS_PROFILES 5 #define MAX_CUBLAS_PROFILES 5
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u) #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u) #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
#define STRESS_LAUNCH_DEPTH 8
static const char *ptx_source = static const char *ptx_source =
".version 6.0\n" ".version 6.0\n"
@@ -422,24 +423,31 @@ static int run_ptx_fallback(struct cuda_api *api,
double deadline = start + (double)seconds; double deadline = start + (double)seconds;
while (now_seconds() < deadline) { while (now_seconds() < deadline) {
launches_per_wave = 0; launches_per_wave = 0;
for (int lane = 0; lane < stream_count; lane++) { for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads); int launched_this_batch = 0;
if (!check_rc(api, for (int lane = 0; lane < stream_count; lane++) {
"cuLaunchKernel", unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
api->cuLaunchKernel(kernel, if (!check_rc(api,
blocks, "cuLaunchKernel",
1, api->cuLaunchKernel(kernel,
1, blocks,
threads, 1,
1, 1,
1, threads,
0, 1,
streams[lane], 1,
params[lane], 0,
NULL))) { streams[lane],
goto fail; params[lane],
NULL))) {
goto fail;
}
launches_per_wave++;
launched_this_batch++;
}
if (launched_this_batch <= 0) {
break;
} }
launches_per_wave++;
} }
if (launches_per_wave <= 0) { if (launches_per_wave <= 0) {
goto fail; goto fail;
@@ -460,10 +468,11 @@ static int run_ptx_fallback(struct cuda_api *api,
report->iterations = iterations; report->iterations = iterations;
snprintf(report->details, snprintf(report->details,
sizeof(report->details), sizeof(report->details),
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n", "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
size_mb, size_mb,
report->buffer_mb, report->buffer_mb,
report->stream_count, report->stream_count,
STRESS_LAUNCH_DEPTH,
bytes_per_stream[0] / (1024u * 1024u), bytes_per_stream[0] / (1024u * 1024u),
iterations); iterations);
@@ -1184,10 +1193,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
report->buffer_mb = (int)(total_budget / (1024u * 1024u)); report->buffer_mb = (int)(total_budget / (1024u * 1024u));
append_detail(report->details, append_detail(report->details,
sizeof(report->details), sizeof(report->details),
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n", "requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
size_mb, size_mb,
report->buffer_mb, report->buffer_mb,
report->stream_count, report->stream_count,
STRESS_LAUNCH_DEPTH,
mp_count, mp_count,
per_profile_budget / (1024u * 1024u)); per_profile_budget / (1024u * 1024u));
@@ -1239,26 +1249,33 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
double deadline = now_seconds() + (double)seconds; double deadline = now_seconds() + (double)seconds;
while (now_seconds() < deadline) { while (now_seconds() < deadline) {
wave_launches = 0; wave_launches = 0;
for (int i = 0; i < prepared_count; i++) { for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
if (!prepared[i].ready) { int launched_this_batch = 0;
continue; for (int i = 0; i < prepared_count; i++) {
} if (!prepared[i].ready) {
if (!run_cublas_profile(handle, &cublas, &prepared[i])) { continue;
append_detail(report->details,
sizeof(report->details),
"%s=FAILED runtime\n",
prepared[i].desc.name);
for (int j = 0; j < prepared_count; j++) {
destroy_profile(&cublas, cuda, &prepared[j]);
} }
cublas.cublasLtDestroy(handle); if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
destroy_streams(cuda, streams, stream_count); append_detail(report->details,
cuda->cuCtxDestroy(ctx); sizeof(report->details),
return 0; "%s=FAILED runtime\n",
prepared[i].desc.name);
for (int j = 0; j < prepared_count; j++) {
destroy_profile(&cublas, cuda, &prepared[j]);
}
cublas.cublasLtDestroy(handle);
destroy_streams(cuda, streams, stream_count);
cuda->cuCtxDestroy(ctx);
return 0;
}
prepared[i].iterations++;
report->iterations++;
wave_launches++;
launched_this_batch++;
}
if (launched_this_batch <= 0) {
break;
} }
prepared[i].iterations++;
report->iterations++;
wave_launches++;
} }
if (wave_launches <= 0) { if (wave_launches <= 0) {
break; break;

View File

@@ -111,8 +111,192 @@ resolve_iso_version() {
resolve_audit_version resolve_audit_version
} }
dump_memtest_debug() {
phase="$1"
lb_dir="${2:-}"
iso_path="${3:-}"
phase_slug="$(printf '%s' "${phase}" | tr ' /' '__')"
memtest_log="${LOG_DIR:-}/memtest-${phase_slug}.log"
(
echo "=== memtest debug: ${phase} ==="
echo "-- auto/config --"
if [ -f "${BUILDER_DIR}/auto/config" ]; then
grep -n -- '--memtest' "${BUILDER_DIR}/auto/config" || echo " (no --memtest line found)"
else
echo " (missing ${BUILDER_DIR}/auto/config)"
fi
echo "-- source bootloader templates --"
for cfg in \
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
if [ -f "$cfg" ]; then
echo " file: $cfg"
grep -n 'Memory Test\|memtest' "$cfg" || echo " (no memtest lines)"
fi
done
if [ -n "$lb_dir" ] && [ -d "$lb_dir" ]; then
echo "-- live-build workdir package lists --"
for pkg in \
"$lb_dir/config/package-lists/bee.list.chroot" \
"$lb_dir/config/package-lists/bee-gpu.list.chroot" \
"$lb_dir/config/package-lists/bee-nvidia.list.chroot"; do
if [ -f "$pkg" ]; then
echo " file: $pkg"
grep -n 'memtest' "$pkg" || echo " (no memtest lines)"
fi
done
echo "-- live-build chroot/boot --"
if [ -d "$lb_dir/chroot/boot" ]; then
find "$lb_dir/chroot/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
else
echo " (missing $lb_dir/chroot/boot)"
fi
echo "-- live-build binary/boot --"
if [ -d "$lb_dir/binary/boot" ]; then
find "$lb_dir/binary/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
else
echo " (missing $lb_dir/binary/boot)"
fi
echo "-- live-build package cache --"
if [ -d "$lb_dir/cache/packages.chroot" ]; then
find "$lb_dir/cache/packages.chroot" -maxdepth 1 -name 'memtest86+*.deb' -print | sed 's/^/ /' || true
else
echo " (missing $lb_dir/cache/packages.chroot)"
fi
fi
if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
echo "-- ISO memtest files --"
bsdtar -tf "$iso_path" | grep 'memtest' | sed 's/^/ /' || echo " (no memtest files in ISO)"
echo "-- ISO GRUB memtest lines --"
bsdtar -xOf "$iso_path" boot/grub/grub.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in boot/grub/grub.cfg)"
echo "-- ISO isolinux memtest lines --"
bsdtar -xOf "$iso_path" isolinux/live.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in isolinux/live.cfg)"
fi
echo "=== end memtest debug: ${phase} ==="
) | {
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ]; then
tee "${memtest_log}"
else
cat
fi
}
}
memtest_fail() {
msg="$1"
iso_path="${2:-}"
echo "ERROR: ${msg}" >&2
dump_memtest_debug "failure" "${LB_DIR:-}" "$iso_path" >&2
exit 1
}
validate_iso_memtest() {
iso_path="$1"
echo "=== validating memtest in ISO ==="
[ -f "$iso_path" ] || memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
command -v bsdtar >/dev/null 2>&1 || memtest_fail "bsdtar is required for ISO validation" "$iso_path"
bsdtar -tf "$iso_path" | grep -q '^boot/memtest86+x64\.bin$' || {
memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
}
bsdtar -tf "$iso_path" | grep -q '^boot/memtest86+x64\.efi$' || {
memtest_fail "memtest EFI binary missing in ISO: boot/memtest86+x64.efi" "$iso_path"
}
grub_cfg="$(mktemp)"
isolinux_cfg="$(mktemp)"
bsdtar -xOf "$iso_path" boot/grub/grub.cfg > "$grub_cfg" || memtest_fail "failed to extract boot/grub/grub.cfg from ISO" "$iso_path"
bsdtar -xOf "$iso_path" isolinux/live.cfg > "$isolinux_cfg" || memtest_fail "failed to extract isolinux/live.cfg from ISO" "$iso_path"
grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
memtest_fail "GRUB menu entry for memtest is missing" "$iso_path"
}
grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
memtest_fail "GRUB memtest EFI path is missing" "$iso_path"
}
grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
memtest_fail "GRUB memtest BIOS path is missing" "$iso_path"
}
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
memtest_fail "isolinux menu entry for memtest is missing" "$iso_path"
}
grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
memtest_fail "isolinux memtest path is missing" "$iso_path"
}
rm -f "$grub_cfg" "$isolinux_cfg"
echo "=== memtest validation OK ==="
}
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)" AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)" ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
LOG_DIR="${DIST_DIR}/${ISO_BASENAME}.logs"
LOG_ARCHIVE="${DIST_DIR}/${ISO_BASENAME}.logs.tar.gz"
ISO_OUT="${DIST_DIR}/${ISO_BASENAME}.iso"
LOG_OUT="${LOG_DIR}/build.log"
cleanup_build_log() {
status="${1:-$?}"
trap - EXIT INT TERM HUP
if [ "${BUILD_LOG_ACTIVE:-0}" = "1" ]; then
BUILD_LOG_ACTIVE=0
exec 1>&3 2>&4
exec 3>&- 4>&-
if [ -n "${BUILD_TEE_PID:-}" ]; then
wait "${BUILD_TEE_PID}" 2>/dev/null || true
fi
rm -f "${BUILD_LOG_PIPE}"
fi
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ] && command -v tar >/dev/null 2>&1; then
rm -f "${LOG_ARCHIVE}"
tar -czf "${LOG_ARCHIVE}" -C "${DIST_DIR}" "$(basename "${LOG_DIR}")" 2>/dev/null || true
fi
exit "${status}"
}
start_build_log() {
command -v tee >/dev/null 2>&1 || {
echo "ERROR: tee is required for build logging" >&2
exit 1
}
rm -rf "${LOG_DIR}"
rm -f "${LOG_ARCHIVE}"
mkdir -p "${LOG_DIR}"
BUILD_LOG_PIPE="$(mktemp -u "${TMPDIR:-/tmp}/bee-build-log.XXXXXX")"
mkfifo "${BUILD_LOG_PIPE}"
exec 3>&1 4>&2
tee "${LOG_OUT}" < "${BUILD_LOG_PIPE}" &
BUILD_TEE_PID=$!
exec > "${BUILD_LOG_PIPE}" 2>&1
BUILD_LOG_ACTIVE=1
trap 'cleanup_build_log "$?"' EXIT INT TERM HUP
echo "=== build log dir: ${LOG_DIR} ==="
echo "=== build log: ${LOG_OUT} ==="
echo "=== build log archive: ${LOG_ARCHIVE} ==="
}
start_build_log
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency. # Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
# If headers for the detected ABI are not yet installed (kernel updated since image build), # If headers for the detected ABI are not yet installed (kernel updated since image build),
@@ -245,13 +429,13 @@ rm -f \
"${OVERLAY_STAGE_DIR}/etc/bee-release" \ "${OVERLAY_STAGE_DIR}/etc/bee-release" \
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \ "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \ "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
rm -rf \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
# Remove NVIDIA-specific overlay files for non-nvidia variants # Remove NVIDIA-specific overlay files for non-nvidia variants
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
@@ -304,7 +488,6 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true
ln -sfn bee-gpu-burn "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
fi fi
# --- inject smoketest into overlay so it runs directly on the live CD --- # --- inject smoketest into overlay so it runs directly on the live CD ---
@@ -510,6 +693,7 @@ export BEE_GPU_VENDOR_UPPER
cd "${LB_DIR}" cd "${LB_DIR}"
lb clean 2>&1 | tail -3 lb clean 2>&1 | tail -3
lb config 2>&1 | tail -5 lb config 2>&1 | tail -5
dump_memtest_debug "pre-build" "${LB_DIR}"
lb build 2>&1 lb build 2>&1
# --- persist deb package cache back to shared location --- # --- persist deb package cache back to shared location ---
@@ -521,8 +705,9 @@ fi
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR # live-build outputs live-image-amd64.hybrid.iso in LB_DIR
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso" ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
if [ -f "$ISO_RAW" ]; then if [ -f "$ISO_RAW" ]; then
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
validate_iso_memtest "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT" cp "$ISO_RAW" "$ISO_OUT"
echo "" echo ""
echo "=== done (${BEE_GPU_VENDOR}) ===" echo "=== done (${BEE_GPU_VENDOR}) ==="

View File

@@ -22,3 +22,7 @@ label live-@FLAVOUR@-failsafe
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
label memtest
menu label ^Memory Test (memtest86+)
linux /boot/memtest86+x64.bin

View File

@@ -1,76 +0,0 @@
#!/bin/sh
# Copy memtest86+ binaries from chroot /boot into the ISO boot directory
# so GRUB can chainload them directly (they must be on the ISO filesystem,
# not inside the squashfs).
#
# Primary: copy from chroot/boot/ (populated by package postinst).
# Naming fallbacks:
# Debian Bookworm: /boot/memtest86+ — EFI PE64 (no extension)
# /boot/memtest86+.bin — legacy binary
# Upstream/Ubuntu: /boot/memtest86+x64.efi, /boot/memtest86+x64.bin, etc.
# Last resort: extract directly from the cached .deb if postinst didn't place
# the files (happens in chroot environments without grub triggers).
set -e
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi"
# Ensure destination directory exists (absence caused silent copy failures).
mkdir -p binary/boot
echo "memtest: scanning chroot/boot/ for memtest files:"
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files in chroot/boot/"
# Primary path: copy upstream-named files from chroot/boot/
for f in ${MEMTEST_FILES}; do
src="chroot/boot/${f}"
if [ -f "${src}" ]; then
cp "${src}" "binary/boot/${f}"
echo "memtest: copied ${f} from chroot/boot/"
fi
done
# Debian Bookworm naming fallback: /boot/memtest86+ (no extension) is the EFI binary.
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "chroot/boot/memtest86+" ]; then
cp "chroot/boot/memtest86+" "binary/boot/memtest86+x64.efi"
echo "memtest: copied /boot/memtest86+ as memtest86+x64.efi (Debian naming)"
fi
if [ ! -f "binary/boot/memtest86+x64.bin" ] && [ -f "chroot/boot/memtest86+.bin" ]; then
cp "chroot/boot/memtest86+.bin" "binary/boot/memtest86+x64.bin"
echo "memtest: copied /boot/memtest86+.bin as memtest86+x64.bin (Debian naming)"
fi
# Last resort: if EFI binary still missing, extract from cached .deb
if [ ! -f "binary/boot/memtest86+x64.efi" ]; then
echo "memtest: EFI binary missing — attempting extraction from .deb cache"
deb=$(find chroot/var/cache/apt/archives/ chroot/var/lib/apt/lists/ \
-name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null \
| head -1)
if [ -z "$deb" ]; then
deb=$(find cache/ -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null | head -1)
fi
if [ -n "$deb" ]; then
echo "memtest: extracting from ${deb}"
EXTRACT_DIR="$(mktemp -d)"
dpkg-deb -x "${deb}" "${EXTRACT_DIR}"
echo "memtest: files found in .deb:"
find "${EXTRACT_DIR}/boot" -type f 2>/dev/null || echo " (none in /boot)"
for f in ${MEMTEST_FILES}; do
src="${EXTRACT_DIR}/boot/${f}"
if [ -f "${src}" ]; then
cp "${src}" "binary/boot/${f}"
echo "memtest: extracted ${f} from .deb"
fi
done
# Debian naming fallback inside .deb as well
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "${EXTRACT_DIR}/boot/memtest86+" ]; then
cp "${EXTRACT_DIR}/boot/memtest86+" "binary/boot/memtest86+x64.efi"
echo "memtest: extracted /boot/memtest86+ as memtest86+x64.efi from .deb"
fi
rm -rf "${EXTRACT_DIR}"
else
echo "memtest: WARNING: no memtest86+ .deb found in cache — memtest will not be available"
fi
fi
echo "memtest: binary/boot/ contents:"
ls binary/boot/memtest* 2>/dev/null || echo " (none)"

View File

@@ -21,14 +21,15 @@ openssh-server
# Disk installer # Disk installer
squashfs-tools squashfs-tools
parted parted
# grub-pc / grub-efi-amd64 provide grub-install + grub2-common (required for chroot install). # Keep GRUB install tools without selecting a single active platform package.
# The -bin variants only carry binary modules and do NOT include grub-install itself. # grub-pc and grub-efi-amd64 conflict with each other, but grub2-common
grub-pc # provides grub-install/update-grub and the *-bin packages provide BIOS/UEFI modules.
grub2-common
grub-pc-bin grub-pc-bin
grub-efi-amd64
grub-efi-amd64-bin grub-efi-amd64-bin
grub-efi-amd64-signed grub-efi-amd64-signed
shim-signed shim-signed
efibootmgr
# Filesystem support for USB export targets # Filesystem support for USB export targets
exfatprogs exfatprogs
@@ -50,7 +51,6 @@ sudo
zstd zstd
mstflint mstflint
memtester memtester
memtest86+
stress-ng stress-ng
stressapptest stressapptest

View File

@@ -1,25 +1,9 @@
[Unit] [Unit]
Description=Bee: schedule startup hardware audit via task queue Description=Bee: on-demand hardware audit (not started automatically)
# Start AFTER bee-web, not before — bee-web must not wait for audit.
After=bee-web.service
Wants=bee-web.service
[Service] [Service]
Type=oneshot Type=oneshot
RemainAfterExit=yes RemainAfterExit=yes
# Wait up to 90s for bee-web to respond on /healthz, then sleep 60s for ExecStart=/bin/sh -c 'curl -sf -X POST http://localhost/api/audit/run >/dev/null'
# the system to settle (GPU drivers, sensors), then enqueue the audit as
# a background task so it appears in the task list and logs.
ExecStart=/bin/sh -c '\
i=0; \
while [ $i -lt 90 ]; do \
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi; \
sleep 1; i=$((i+1)); \
done; \
sleep 60; \
curl -sf -X POST http://localhost/api/audit/run >/dev/null'
StandardOutput=journal StandardOutput=journal
StandardError=journal StandardError=journal
[Install]
WantedBy=multi-user.target

View File

@@ -12,17 +12,55 @@
set -euo pipefail set -euo pipefail
usage() {
cat >&2 <<'EOF'
Usage: bee-install <device> [logfile]
Installs the live system to a local disk (WIPES the target).
device Target block device, e.g. /dev/sda or /dev/nvme0n1
Must be a hard disk or NVMe — NOT a CD-ROM (/dev/sr*)
logfile Optional path for progress log (default: /tmp/bee-install.log)
Examples:
bee-install /dev/sda
bee-install /dev/nvme0n1
bee-install /dev/sdb /tmp/my-install.log
WARNING: ALL DATA ON <device> WILL BE ERASED.
Layout (UEFI): GPT — partition 1: EFI 512MB vfat, partition 2: root ext4
Layout (BIOS): MBR — partition 1: root ext4
EOF
exit 1
}
DEVICE="${1:-}" DEVICE="${1:-}"
LOGFILE="${2:-/tmp/bee-install.log}" LOGFILE="${2:-/tmp/bee-install.log}"
if [ -z "$DEVICE" ]; then if [ -z "$DEVICE" ] || [ "$DEVICE" = "--help" ] || [ "$DEVICE" = "-h" ]; then
echo "Usage: bee-install <device> [logfile]" >&2 usage
exit 1
fi fi
if [ ! -b "$DEVICE" ]; then if [ ! -b "$DEVICE" ]; then
echo "ERROR: $DEVICE is not a block device" >&2 echo "ERROR: $DEVICE is not a block device" >&2
echo "Run 'lsblk' to list available disks." >&2
exit 1 exit 1
fi fi
# Block CD-ROM devices
case "$DEVICE" in
/dev/sr*|/dev/scd*)
echo "ERROR: $DEVICE is a CD-ROM/optical device — cannot install to it." >&2
echo "Run 'lsblk' to find the target disk (e.g. /dev/sda, /dev/nvme0n1)." >&2
exit 1
;;
esac
# Check required tools
for tool in parted mkfs.vfat mkfs.ext4 unsquashfs grub-install update-grub; do
if ! command -v "$tool" >/dev/null 2>&1; then
echo "ERROR: required tool not found: $tool" >&2
exit 1
fi
done
SQUASHFS="/run/live/medium/live/filesystem.squashfs" SQUASHFS="/run/live/medium/live/filesystem.squashfs"
if [ ! -f "$SQUASHFS" ]; then if [ ! -f "$SQUASHFS" ]; then

View File

@@ -23,6 +23,62 @@ contains_csv() {
echo ",${haystack}," | grep -q ",${needle}," echo ",${haystack}," | grep -q ",${needle},"
} }
show_opencl_diagnostics() {
if command -v clinfo >/dev/null 2>&1; then
echo "-- clinfo -l --" >&2
clinfo -l >&2 || true
fi
echo "-- john --list=opencl-devices --" >&2
./john --list=opencl-devices >&2 || true
}
ensure_nvidia_uvm() {
if lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
return 0
fi
if [ "$(id -u)" != "0" ]; then
return 1
fi
ko="/usr/local/lib/nvidia/nvidia-uvm.ko"
[ -f "${ko}" ] || return 1
if ! insmod "${ko}" >/dev/null 2>&1; then
return 1
fi
uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices | awk '{print $1}')
if [ -n "${uvm_major}" ]; then
mknod -m 666 /dev/nvidia-uvm c "${uvm_major}" 0 2>/dev/null || true
mknod -m 666 /dev/nvidia-uvm-tools c "${uvm_major}" 1 2>/dev/null || true
fi
return 0
}
ensure_opencl_ready() {
out=$(./john --list=opencl-devices 2>&1 || true)
if echo "${out}" | grep -q "Device #"; then
return 0
fi
if ensure_nvidia_uvm; then
out=$(./john --list=opencl-devices 2>&1 || true)
if echo "${out}" | grep -q "Device #"; then
return 0
fi
fi
echo "OpenCL devices are not available for John." >&2
if ! lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
echo "nvidia_uvm is not loaded." >&2
fi
if [ ! -e /dev/nvidia-uvm ]; then
echo "/dev/nvidia-uvm is missing." >&2
fi
show_opencl_diagnostics
return 1
}
while [ "$#" -gt 0 ]; do while [ "$#" -gt 0 ]; do
case "$1" in case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;; --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
@@ -76,6 +132,8 @@ echo "john_devices=${JOHN_DEVICES}"
cd "${JOHN_DIR}" cd "${JOHN_DIR}"
ensure_opencl_ready || exit 1
choose_format() { choose_format() {
if [ -n "${FORMAT}" ]; then if [ -n "${FORMAT}" ]; then
echo "${FORMAT}" echo "${FORMAT}"

View File

@@ -17,7 +17,7 @@ mkdir -p "$(dirname "$log_file")"
serial_sink() { serial_sink() {
local tty="$1" local tty="$1"
if [ -w "$tty" ]; then if [ -w "$tty" ]; then
cat > "$tty" cat > "$tty" 2>/dev/null || true
else else
cat > /dev/null cat > /dev/null
fi fi

View File

@@ -59,11 +59,24 @@ load_module() {
return 1 return 1
} }
load_host_module() {
mod="$1"
if modprobe "$mod" >/dev/null 2>&1; then
log "host module loaded: $mod"
return 0
fi
return 1
}
case "$nvidia_mode" in case "$nvidia_mode" in
normal|full) normal|full)
if ! load_module nvidia; then if ! load_module nvidia; then
exit 1 exit 1
fi fi
# nvidia-modeset on some server kernels needs ACPI video helper symbols
# exported by the generic "video" module. Best-effort only; compute paths
# remain functional even if display-related modules stay absent.
load_host_module video || true
load_module nvidia-modeset || true load_module nvidia-modeset || true
load_module nvidia-uvm || true load_module nvidia-uvm || true
;; ;;