Merge debug/prod into single ISO build, fix NVIDIA module loading
## ISO build consolidation
- Remove separate debug/prod split: overlay-debug/, build-debug.sh, mkimg.bee_debug.sh, genapkovl-bee_debug.sh all deleted
- Single overlay: iso/overlay/ (was overlay-debug content)
- Single build script: build.sh (SSH, TUI, NVIDIA, vendor tools, bee-release)
- Single mkimage profile: bee (with dropbear, dialog, strace, gcompat, etc.)

## NVIDIA fixes
- Modules now stored at /usr/local/lib/nvidia/ instead of /lib/modules/<kver>/extra/nvidia/ — modloop squashfs mounts over that path at boot making overlay content there inaccessible
- bee-nvidia init: load via insmod (absolute path), not modprobe
- bee-nvidia init: create libnvidia-ml.so.1/libcuda.so.1 symlinks in /usr/lib/
- build-nvidia-module.sh: always install linux-lts-dev (not conditional) — stale 6.6.x headers caused wrong-kernel modules that never loaded at runtime
- build-nvidia-module.sh: create soname symlinks in cache
- KERNEL_VERSION in VERSIONS updated 6.6 → 6.12
- gcompat added to ISO packages (nvidia-smi is a glibc binary on musl Alpine)

## Service ordering
- bee-audit: add `after bee-nvidia` so NVIDIA enrichment always succeeds

## New tooling
- iso/builder/smoketest.sh: SSH smoke test for post-boot ISO validation
- iso/builder/build-gpu-burn.sh: builds gpu_burn vendor binary (CUDA 12.8+)
- vendor/gpu_burn included automatically if placed in iso/vendor/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
176
iso/builder/smoketest.sh
Normal file
176
iso/builder/smoketest.sh
Normal file
@@ -0,0 +1,176 @@
|
||||
#!/bin/sh
|
||||
# smoketest.sh — run on a live ISO via SSH to verify all critical components.
#
# Usage:
#   ssh root@<ip> 'sh -s' < smoketest.sh
#   or: scp smoketest.sh root@<ip>:/var/tmp/ && ssh root@<ip> sh /var/tmp/smoketest.sh
#
# Exit code: 0 = all required checks passed, 1 = at least one required check failed.
|
||||
|
||||
# Result tallies. ok()/fail()/warn() increment these; the final summary prints
# them, and FAIL alone decides the script's exit status.
PASS=0
FAIL=0
WARN=0
|
||||
|
||||
# Report a passed check on stdout and count it in PASS.
# Arguments: the message words (joined with spaces).
ok() {
  printf '%s\n' "[ OK ] $*"
  PASS=$((PASS + 1))
}
|
||||
# Report a failed required check on stdout and count it in FAIL.
# Arguments: the message words (joined with spaces).
fail() {
  printf '%s\n' "[ FAIL ] $*"
  FAIL=$((FAIL + 1))
}
|
||||
# Report a non-fatal finding on stdout and count it in WARN.
# Arguments: the message words (joined with spaces).
warn() {
  printf '%s\n' "[ WARN ] $*"
  WARN=$((WARN + 1))
}
|
||||
# Print an informational line on stdout; it does not touch any counter.
# Arguments: the message words (joined with spaces).
info() {
  printf '%s\n' "[ INFO ] $*"
}
|
||||
|
||||
# Report header: identifies the host, running kernel and UTC time of this run.
echo "========================================"
echo " bee live ISO smoketest"
echo " host: $(uname -n)"
echo " kernel: $(uname -r)"
echo " date: $(date -u)"
echo "========================================"
echo ""
|
||||
|
||||
# --- kernel version ---
# Capture the running kernel release and log it; this is informational only
# and does not affect the pass/fail counters.
KVER=$(uname -r)
info "kernel: $KVER"
|
||||
|
||||
# --- PATH ---
# Required tools: each one missing counts as a failure.
# The lookup prepends /usr/local/bin to PATH for that one command only.
# NOTE(review): presumably because a non-login `ssh ... 'sh -s'` session does
# not source /etc/profile.d/bee.sh — confirm.
echo "-- PATH & binaries --"
for tool in dmidecode smartctl nvme ipmitool lspci audit; do
  if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
    ok "$tool found: $p"
  else
    fail "$tool: NOT FOUND"
  fi
done
|
||||
|
||||
# Optional tools: absence is reported as a warning, not a failure.
# Same lookup as for the required tools, with /usr/local/bin prepended to PATH.
for tool in nvidia-smi gpu_burn; do
  if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
    ok "$tool found: $p"
  else
    warn "$tool: NOT FOUND (optional but expected)"
  fi
done
|
||||
|
||||
echo ""
echo "-- NVIDIA modules --"
# Directory expected to hold the NVIDIA kernel module (.ko) files.
KO_DIR="/usr/local/lib/nvidia"
if [ -d "$KO_DIR" ]; then
  # Count modules with a glob instead of parsing `ls` output.
  ko_count=0
  for ko in "$KO_DIR"/*.ko; do
    # An unmatched glob is left as the literal pattern; -e filters that out.
    if [ -e "$ko" ]; then
      ko_count=$((ko_count + 1))
    fi
  done
  # An existing but empty directory means there is nothing to load, so it is
  # a failure rather than an OK with "0 .ko files".
  if [ "$ko_count" -gt 0 ]; then
    ok "NVIDIA ko dir exists: $KO_DIR ($ko_count .ko files)"
  else
    fail "NVIDIA ko dir exists but contains no .ko files: $KO_DIR"
  fi
else
  fail "NVIDIA ko dir missing: $KO_DIR"
fi
|
||||
|
||||
# Verify that each NVIDIA kernel module is currently loaded.
# lsmod is queried once and the snapshot reused for every module name.
loaded_mods=$(/sbin/lsmod 2>/dev/null)
for mod in nvidia nvidia_modeset nvidia_uvm; do
  # The trailing space in the pattern keeps "nvidia" from matching
  # "nvidia_modeset" / "nvidia_uvm".
  if printf '%s\n' "$loaded_mods" | grep -q "^$mod "; then
    ok "module loaded: $mod"
  else
    fail "module NOT loaded: $mod"
  fi
done
|
||||
|
||||
echo ""
echo "-- nvidia-smi --"
# Functional check: nvidia-smi must be runnable and list at least one GPU.
if PATH="/usr/local/bin:$PATH" command -v nvidia-smi >/dev/null 2>&1; then
  # Run the tool once, keeping both its output and its exit status, so a
  # tool that cannot run is distinguishable from one that finds no GPU.
  smi_out=$(PATH="/usr/local/bin:$PATH" nvidia-smi -L 2>/dev/null)
  smi_rc=$?
  # Count only lines that start with "GPU ", so other lines that merely
  # contain the string "GPU" (e.g. inside a UUID) are not counted as devices.
  gpu_count=$(printf '%s\n' "$smi_out" | grep -c '^GPU ')
  if [ "$gpu_count" -gt 0 ]; then
    ok "nvidia-smi: $gpu_count GPU(s) found"
  elif [ "$smi_rc" -ne 0 ]; then
    fail "nvidia-smi: failed to run (exit status $smi_rc)"
  else
    fail "nvidia-smi: runs but no GPUs detected"
  fi
else
  fail "nvidia-smi: not found in PATH"
fi
|
||||
|
||||
echo ""
echo "-- lib symlinks --"
# Check the .so.1 names of the NVIDIA libraries under /usr/lib.
for lib in libnvidia-ml libcuda; do
  lib_path="/usr/lib/${lib}.so.1"
  if [ -e "$lib_path" ]; then
    # -e follows symlinks, so this is true only when the target exists.
    ok "/usr/lib/${lib}.so.1 exists"
  elif [ -L "$lib_path" ]; then
    # A link whose target is missing is as unusable as no link at all.
    fail "/usr/lib/${lib}.so.1 is a dangling symlink (nvidia-smi will fail)"
  else
    fail "/usr/lib/${lib}.so.1 MISSING (nvidia-smi will fail)"
  fi
done
|
||||
|
||||
echo ""
echo "-- gcompat (glibc compat for nvidia-smi) --"
# Check for the glibc dynamic loader path that a glibc-linked nvidia-smi
# needs in order to exec.
glibc_ld=/lib64/ld-linux-x86-64.so.2
if [ -e "$glibc_ld" ]; then
  # -e follows symlinks, so a link to a missing target does not pass.
  ok "gcompat: /lib64/ld-linux-x86-64.so.2 present"
elif [ -L "$glibc_ld" ]; then
  fail "gcompat: /lib64/ld-linux-x86-64.so.2 is a dangling symlink — nvidia-smi will fail to exec"
else
  fail "gcompat: /lib64/ld-linux-x86-64.so.2 MISSING — nvidia-smi will fail to exec"
fi
|
||||
|
||||
echo ""
echo "-- openrc services --"
# Required services: a non-zero `rc-service <name> status` is a failure.
for svc in bee-nvidia bee-network; do
  if rc-service "$svc" status >/dev/null 2>&1; then
    ok "service running: $svc"
  else
    fail "service NOT running: $svc"
  fi
done
|
||||
|
||||
# Optional services: checked only if an init script is installed, and a
# non-running state is only a warning.
# NOTE(review): bee-audit is listed next to bee-audit-debug because the
# commit message for this change refers to the service as "bee-audit";
# whichever script is absent is skipped silently — confirm the final name.
for svc in bee-audit bee-audit-debug dropbear bee-sshsetup; do
  if [ -f "/etc/init.d/$svc" ]; then
    if rc-service "$svc" status >/dev/null 2>&1; then
      ok "service running: $svc"
    else
      warn "service not running: $svc (may be one-shot)"
    fi
  fi
done
|
||||
|
||||
echo ""
echo "-- audit binary --"
# The audit binary must be present and executable at a fixed path.
AUDIT=/usr/local/bin/audit
if [ -x "$AUDIT" ]; then
  ok "audit binary: present"
  # Try `--version` first, then the `version` subcommand; fall back to
  # "unknown" if neither succeeds.
  ver=$("$AUDIT" --version 2>/dev/null || "$AUDIT" version 2>/dev/null || echo "unknown")
  info "audit version: $ver"
else
  fail "audit binary: NOT FOUND at $AUDIT"
fi
|
||||
|
||||
echo ""
echo "-- audit last run --"
# Inspect the audit log for start/completion markers and for a skipped
# NVIDIA enrichment step.
audit_log=/var/log/bee-audit.log
if [ -f "$audit_log" ]; then
  last_line=$(tail -n 1 "$audit_log")
  info "last log line: $last_line"

  audit_completed=0
  if grep -q "audit completed" "$audit_log" 2>/dev/null; then
    audit_completed=1
    ok "audit: completed successfully"
  elif grep -q "audit started" "$audit_log" 2>/dev/null; then
    warn "audit: started but may not have completed"
  else
    # A log with neither marker (e.g. an empty file) must not pass silently.
    warn "audit: log has neither 'audit started' nor 'audit completed'"
  fi

  # check for nvidia enrichment
  # The most recent "skipped" line, if any, doubles as the failure reason.
  reason=$(grep "nvidia: enrichment skipped" "$audit_log" 2>/dev/null | tail -n 1)
  if [ -n "$reason" ]; then
    fail "audit: nvidia enrichment skipped — $reason"
  elif [ "$audit_completed" -eq 1 ]; then
    ok "audit: nvidia enrichment OK"
  else
    # No "skipped" line in an unfinished run proves nothing about enrichment.
    warn "audit: nvidia enrichment not verified (audit did not complete)"
  fi
else
  warn "audit: no log found at /var/log/bee-audit.log"
fi
|
||||
|
||||
echo ""
echo "-- network --"
# A default route must exist.
if ip route show default 2>/dev/null | grep -q "default"; then
  # Third field of the first default-route line (the gateway for a
  # "default via <gw> ..." entry).
  gw=$(ip route show default | awk '{print $3; exit}')
  ok "default route: $gw"
else
  fail "no default route"
fi

# Outbound connectivity: one ICMP echo to 1.1.1.1 with a 3-second timeout.
if ping -c1 -W3 1.1.1.1 >/dev/null 2>&1; then
  ok "internet: reachable (1.1.1.1)"
else
  fail "internet: unreachable"
fi
|
||||
|
||||
echo ""
echo "-- /etc/profile.d/bee.sh PATH --"
# The profile snippet must mention /usr/local/bin; a missing file is treated
# the same as a file without the entry.
if grep -q "/usr/local/bin" /etc/profile.d/bee.sh 2>/dev/null; then
  ok "/etc/profile.d/bee.sh exports /usr/local/bin"
else
  fail "/etc/profile.d/bee.sh does not add /usr/local/bin to PATH"
fi
|
||||
|
||||
echo ""
echo "========================================"
echo " Results: OK=$PASS FAIL=$FAIL WARN=$WARN"
echo "========================================"
# Only FAIL decides the exit status; warnings never make the run fail.
if [ "$FAIL" -eq 0 ]; then
  exit 0
fi
exit 1
|
||||
Reference in New Issue
Block a user