Merge debug/prod into single ISO build, fix NVIDIA module loading

## ISO build consolidation
- Remove separate debug/prod split: overlay-debug/, build-debug.sh,
  mkimg.bee_debug.sh, genapkovl-bee_debug.sh all deleted
- Single overlay: iso/overlay/ (was overlay-debug content)
- Single build script: build.sh (SSH, TUI, NVIDIA, vendor tools, bee-release)
- Single mkimage profile: bee (with dropbear, dialog, strace, gcompat, etc.)

## NVIDIA fixes
- Modules now stored at /usr/local/lib/nvidia/ instead of
  /lib/modules/<kver>/extra/nvidia/ — the modloop squashfs is mounted over that
  path at boot, which makes overlay content placed there inaccessible
- bee-nvidia init: load via insmod (absolute path), not modprobe
- bee-nvidia init: create libnvidia-ml.so.1/libcuda.so.1 symlinks in /usr/lib/
- build-nvidia-module.sh: always install linux-lts-dev (not conditional) —
  stale 6.6.x headers caused wrong-kernel modules that never loaded at runtime
- build-nvidia-module.sh: create soname symlinks in cache
- KERNEL_VERSION in VERSIONS updated 6.6 → 6.12
- gcompat added to ISO packages (nvidia-smi is a glibc binary on musl Alpine)

## Service ordering
- bee-audit: add `after bee-nvidia` so NVIDIA enrichment always succeeds

## New tooling
- iso/builder/smoketest.sh: SSH smoke test for post-boot ISO validation
- iso/builder/build-gpu-burn.sh: builds gpu_burn vendor binary (CUDA 12.8+)
- vendor/gpu_burn included automatically if placed in iso/vendor/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-06 20:14:18 +03:00
parent 0907ba07c3
commit 1768bb58dd
24 changed files with 1296 additions and 261 deletions

View File

@@ -1,4 +1,4 @@
ALPINE_VERSION=3.21
KERNEL_VERSION=6.6
KERNEL_VERSION=6.12
NVIDIA_DRIVER_VERSION=590.48.01
GO_VERSION=1.23.6

View File

@@ -0,0 +1,82 @@
#!/bin/sh
# build-gpu-burn.sh — build the gpu_burn stress tool and place the binary in DIST_DIR
#
# gpu_burn requires nvcc (CUDA toolkit). This script downloads the CUDA toolkit
# runfile, installs the toolkit into a scratch directory and builds gpu_burn with it.
# The runfile, the extracted toolkit and the gpu-burn checkout are left in /var/tmp
# so that later runs can reuse them (nothing is cleaned up here).
#
# Output: $DIST_DIR/gpu_burn    (ready to copy into ISO vendor/)
#         $DIST_DIR/compare.ptx (when the build produces it; needed at runtime)
#
# Usage: sh build-gpu-burn.sh <dist-dir>
set -e

DIST_DIR="$1"
[ -n "$DIST_DIR" ] || { echo "usage: $0 <dist-dir>"; exit 1; }
mkdir -p "$DIST_DIR"
OUTPUT="$DIST_DIR/gpu_burn"

# A non-empty binary from an earlier run is treated as a valid cache hit.
if [ -f "$OUTPUT" ] && [ -s "$OUTPUT" ]; then
    echo "=== gpu_burn cached: $OUTPUT ==="
    exit 0
fi

# CUDA toolkit version for building — only nvcc + headers needed, not the full runtime.
# Must be <= max CUDA version supported by the NVIDIA driver in VERSIONS.
# Driver 590.48.01 supports up to CUDA 13.1; 12.8.1 is used here.
# CUDA_BUILD is the second version component of the runfile name for that release.
CUDA_VERSION="12.8.1"
CUDA_BUILD="570.124.06"
CUDA_RUN="/var/tmp/cuda-${CUDA_VERSION}.run"
CUDA_DIR="/var/tmp/cuda-toolkit-${CUDA_VERSION}"
CUDA_LOG="/var/tmp/cuda-toolkit-${CUDA_VERSION}.install.log"

echo "=== building gpu_burn (CUDA ${CUDA_VERSION}) ==="

# Install build dependencies
apk add --quiet gcc g++ make git wget libxml2

# Download CUDA toolkit runfile if not cached
if [ ! -s "$CUDA_RUN" ]; then
    echo "=== downloading CUDA ${CUDA_VERSION} toolkit ==="
    # Download under a temporary name and rename only on success. With -O, wget
    # leaves whatever it received so far on disk when it fails; a non-empty
    # partial file at $CUDA_RUN would pass the cache check above on the next run.
    rm -f "${CUDA_RUN}.part"
    wget -q --show-progress -O "${CUDA_RUN}.part" \
        "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUILD}_linux.run" \
        || { rm -f "${CUDA_RUN}.part"; echo "ERROR: CUDA toolkit download failed"; exit 1; }
    mv "${CUDA_RUN}.part" "$CUDA_RUN"
fi

# Extract toolkit (nvcc + headers only — skip driver, samples, docs to save time/space)
if [ ! -f "$CUDA_DIR/bin/nvcc" ]; then
    echo "=== extracting CUDA toolkit ==="
    rm -rf "$CUDA_DIR"
    # Keep the installer's exit status instead of losing it behind a pipe to tail.
    # A non-zero status is only reported, not fatal: the nvcc check below decides
    # whether the extraction is usable.
    install_rc=0
    sh "$CUDA_RUN" \
        --silent \
        --toolkit \
        --toolkitpath="$CUDA_DIR" \
        --no-opengl-libs \
        --no-drm \
        --override >"$CUDA_LOG" 2>&1 || install_rc=$?
    tail -5 "$CUDA_LOG"
    if [ "$install_rc" -ne 0 ]; then
        echo "WARNING: CUDA installer exited with status $install_rc (full log: $CUDA_LOG)"
    fi
fi

NVCC="$CUDA_DIR/bin/nvcc"
[ -f "$NVCC" ] || { echo "ERROR: nvcc not found after extraction: $NVCC"; exit 1; }
echo "nvcc: $("$NVCC" --version | head -1)"

# Clone gpu_burn source
GPU_BURN_DIR="/var/tmp/gpu-burn-src"
if [ ! -d "$GPU_BURN_DIR/.git" ]; then
    echo "=== cloning gpu-burn ==="
    git clone --depth=1 https://github.com/wilicc/gpu-burn.git "$GPU_BURN_DIR"
else
    echo "=== gpu-burn source already cloned ==="
fi

# Build
echo "=== building gpu_burn ==="
cd "$GPU_BURN_DIR"
make clean 2>/dev/null || true
# NOTE(review): the upstream gpu-burn Makefile documents CUDAPATH as the variable
# that selects the toolkit location — confirm against the cloned Makefile.
# CUDA_PATH is still exported as before in case a fork reads that name instead.
CUDAPATH="$CUDA_DIR" CUDA_PATH="$CUDA_DIR" make 2>&1
[ -f "$GPU_BURN_DIR/gpu_burn" ] || { echo "ERROR: gpu_burn binary not produced"; exit 1; }

cp "$GPU_BURN_DIR/gpu_burn" "$OUTPUT"
# compare.ptx is loaded by gpu_burn at runtime; say so loudly when it is missing
# instead of silently skipping the copy.
if [ -f "$GPU_BURN_DIR/compare.ptx" ]; then
    cp "$GPU_BURN_DIR/compare.ptx" "$(dirname "$OUTPUT")/compare.ptx"
else
    echo "WARNING: compare.ptx was not produced by the build"
fi

echo "=== gpu_burn build complete ==="
ls -lh "$OUTPUT"
echo "NOTE: compare.ptx must be present in same dir as gpu_burn at runtime"

View File

@@ -20,6 +20,11 @@ DIST_DIR="$2"
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir>"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir>"; exit 1; }
# Always install linux-lts-dev to ensure headers match the ISO's kernel (Alpine 3.21 = 6.12.x).
# Without this, a builder with stale 6.6.x headers produces modules for the wrong kernel version.
echo "=== installing linux-lts-dev ==="
apk add --quiet linux-lts-dev
# Detect kernel version from installed headers (pick highest version if multiple).
detect_kver() {
ls /usr/src/ 2>/dev/null \
@@ -30,11 +35,6 @@ detect_kver() {
}
KVER="$(detect_kver)"
if [ -z "$KVER" ]; then
echo "=== installing linux-lts-dev ==="
apk add --quiet linux-lts-dev
KVER="$(detect_kver)"
fi
KDIR="/usr/src/linux-headers-${KVER}"
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
@@ -107,6 +107,15 @@ cp "$EXTRACT_DIR/libnvidia-ml.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
# libcuda stub needed by nvidia-smi at runtime
cp "$EXTRACT_DIR/libcuda.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat).
# For each library: <lib>.so.1 -> <lib>.so.<driver version>, and <lib>.so -> <lib>.so.1.
for lib in libnvidia-ml libcuda; do
    # Pick the first real (non-symlink) versioned file. Symlinks are skipped because
    # a <lib>.so.1 link left in the cache by an earlier run also matches the glob and
    # sorts first; using it as the target would make <lib>.so.1 point at itself.
    versioned=""
    for candidate in "$CACHE_DIR/lib/${lib}.so."[0-9]*; do
        if [ -L "$candidate" ] || [ ! -f "$candidate" ]; then
            continue
        fi
        versioned="$candidate"
        break
    done
    [ -n "$versioned" ] || continue
    base=$(basename "$versioned")
    # If the real file is already named <lib>.so.1 there is nothing to link, and
    # ln -sf would replace it with a self-referencing symlink.
    if [ "$base" != "${lib}.so.1" ]; then
        ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1" 2>/dev/null || true
    fi
    ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
done
echo "=== NVIDIA build complete ==="
echo "cache: $CACHE_DIR"
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko | wc -l) .ko files"

View File

@@ -1,5 +1,11 @@
#!/bin/sh
# build.sh — production ISO build (unattended mode)
# build.sh — build bee ISO
#
# Single build script. Produces a bootable live ISO with SSH access, TUI, NVIDIA drivers.
#
# Run on Alpine builder VM as root after setup-builder.sh.
# Usage:
# sh iso/builder/build.sh [--authorized-keys /path/to/authorized_keys]
set -e
@@ -8,29 +14,92 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
DIST_DIR="${REPO_ROOT}/dist"
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
AUTH_KEYS=""
# parse args
#   --authorized-keys <path>   authorized_keys file to install for root in the ISO
while [ $# -gt 0 ]; do
    case "$1" in
        --authorized-keys)
            # Without a value, `shift 2` fails and set -e ends the script with no
            # usable message — report the problem explicitly instead.
            [ $# -ge 2 ] || { echo "--authorized-keys requires a path argument"; exit 1; }
            AUTH_KEYS="$2"
            if [ -n "$AUTH_KEYS" ]; then
                # The script changes directory before the file is copied, so a
                # relative path is anchored to the invocation directory here.
                case "$AUTH_KEYS" in
                    /*) ;;
                    *) AUTH_KEYS="$(pwd)/$AUTH_KEYS" ;;
                esac
                [ -f "$AUTH_KEYS" ] || { echo "authorized_keys file not found: $AUTH_KEYS"; exit 1; }
            fi
            shift 2
            ;;
        *) echo "unknown arg: $1"; exit 1 ;;
    esac
done
. "${BUILDER_DIR}/VERSIONS"
export PATH="$PATH:/usr/local/go/bin"
echo "=== bee production ISO build ==="
echo "Alpine: ${ALPINE_VERSION}, Go: ${GO_VERSION}, NVIDIA: ${NVIDIA_DRIVER_VERSION}"
# NOTE: lz4 compression for modloop is disabled — Alpine initramfs may not support lz4 squashfs.
# Default xz compression is used until lz4 support is confirmed.
echo "=== bee ISO build ==="
echo "Alpine: ${ALPINE_VERSION}, Go: ${GO_VERSION}"
echo ""
# --- compile audit binary (static, Linux amd64) ---
# Skip rebuild if binary is newer than all Go source files.
AUDIT_BIN="${DIST_DIR}/bee-audit-linux-amd64"
mkdir -p "$DIST_DIR"
NEED_BUILD=1
if [ -f "$AUDIT_BIN" ]; then
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$AUDIT_BIN" | head -1)
[ -z "$NEWEST_SRC" ] && NEED_BUILD=0
fi
cd "${REPO_ROOT}/audit"
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
go build \
-ldflags "-s -w -X main.Version=${AUDIT_VERSION}" \
-o "$AUDIT_BIN" \
./cmd/audit
if [ "$NEED_BUILD" = "1" ]; then
echo "=== building audit binary ==="
cd "${REPO_ROOT}/audit"
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
go build \
-ldflags "-s -w -X main.Version=${AUDIT_VERSION:-$(date +%Y%m%d)}" \
-o "$AUDIT_BIN" \
./cmd/audit
echo "binary: $AUDIT_BIN"
echo "size: $(du -sh "$AUDIT_BIN" | cut -f1)"
else
echo "=== audit binary up to date, skipping build ==="
fi
# --- inject authorized_keys for SSH access ---
# Uses the same Ed25519 keys as release signing (from git.mchus.pro/mchus/keys).
# SSH public keys are stored alongside signing keys as ~/.keys/<name>.key.pub
#
# Key source, in order of preference:
#   1. the file given with --authorized-keys (copied as-is)
#   2. every ~/.keys/*.key.pub file, concatenated
#   3. nothing found: request the password fallback (marker file below)
AUTHORIZED_KEYS_FILE="${OVERLAY_DIR}/root/.ssh/authorized_keys"
PASSWORD_FALLBACK_MARKER="${OVERLAY_DIR}/etc/bee-ssh-password-fallback"
mkdir -p "${OVERLAY_DIR}/root/.ssh"

if [ -n "$AUTH_KEYS" ]; then
    cp "$AUTH_KEYS" "$AUTHORIZED_KEYS_FILE"
    chmod 600 "$AUTHORIZED_KEYS_FILE"
    echo "SSH authorized_keys: installed from $AUTH_KEYS"
else
    # auto-collect all developer SSH public keys from ~/.keys/*.key.pub
    : > "$AUTHORIZED_KEYS_FILE"
    FOUND=0
    for ssh_pub in "$HOME"/.keys/*.key.pub; do
        # an unmatched glob stays literal; skip it
        [ -f "$ssh_pub" ] || continue
        cat "$ssh_pub" >> "$AUTHORIZED_KEYS_FILE"
        echo "SSH: added $(basename "$ssh_pub" .key.pub)"
        FOUND=$((FOUND + 1))
    done

    if [ "$FOUND" -gt 0 ]; then
        chmod 600 "$AUTHORIZED_KEYS_FILE"
        echo "SSH authorized_keys: $FOUND key(s) from ~/.keys/*.key.pub"
    else
        echo "WARNING: no SSH public keys found — falling back to password auth"
        echo " root password will be set to: bee / eeb"
        echo " (generate a key with: sh keys/scripts/keygen.sh <your-name>)"
        USE_PASSWORD_FALLBACK=1
    fi
fi

# --- password fallback: write marker file read by init script ---
# The overlay directory is not recreated by this script, so a marker written by an
# earlier key-less build would otherwise stay in place and keep the well-known
# fallback password enabled in ISOs that do ship keys. Remove it when not requested.
if [ "${USE_PASSWORD_FALLBACK:-0}" = "1" ]; then
    mkdir -p "${OVERLAY_DIR}/etc"
    touch "$PASSWORD_FALLBACK_MARKER"
else
    rm -f "$PASSWORD_FALLBACK_MARKER"
fi
# --- copy audit binary into overlay ---
mkdir -p "${OVERLAY_DIR}/usr/local/bin"
cp "$AUDIT_BIN" "${OVERLAY_DIR}/usr/local/bin/audit"
cp "${DIST_DIR}/bee-audit-linux-amd64" "${OVERLAY_DIR}/usr/local/bin/audit"
chmod +x "${OVERLAY_DIR}/usr/local/bin/audit"
# Copy optional vendor utilities if already fetched.
for tool in storcli64 sas2ircu sas3ircu mstflint; do
# --- vendor utilities (optional pre-fetched binaries) ---
for tool in storcli64 sas2ircu sas3ircu mstflint gpu_burn; do
if [ -f "${VENDOR_DIR}/${tool}" ]; then
cp "${VENDOR_DIR}/${tool}" "${OVERLAY_DIR}/usr/local/bin/${tool}"
chmod +x "${OVERLAY_DIR}/usr/local/bin/${tool}" || true
@@ -40,21 +109,29 @@ for tool in storcli64 sas2ircu sas3ircu mstflint; do
fi
done
# Build and inject NVIDIA proprietary modules + userspace tools.
echo "=== building NVIDIA modules ==="
# --- build NVIDIA kernel modules and inject into overlay ---
echo ""
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}"
KVER="$(ls /usr/src/ 2>/dev/null | grep '^linux-headers-' | sed 's/linux-headers-//' | sort -V | tail -1)"
# Determine kernel version (same as what goes into the ISO — both use linux-lts from same Alpine)
KVER=$(ls /usr/src/ 2>/dev/null | grep '^linux-headers-' | sed 's/linux-headers-//' | sort -V | tail -1)
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
mkdir -p "${OVERLAY_DIR}/lib/modules/${KVER}/extra/nvidia"
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_DIR}/lib/modules/${KVER}/extra/nvidia/"
# Inject .ko files into overlay at /usr/local/lib/nvidia/ (not /lib/modules/ — modloop squashfs
# mounts over that path at boot and makes it read-only, so overlay content there is inaccessible)
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
mkdir -p "${OVERLAY_KMOD_DIR}"
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
# Inject nvidia-smi and libnvidia-ml
mkdir -p "${OVERLAY_DIR}/usr/local/bin" "${OVERLAY_DIR}/usr/lib"
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_DIR}/usr/local/bin/"
chmod +x "${OVERLAY_DIR}/usr/local/bin/nvidia-smi"
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_DIR}/usr/lib/" 2>/dev/null || true
# Embed build metadata used at runtime.
# --- embed build metadata ---
mkdir -p "${OVERLAY_DIR}/etc"
BUILD_DATE="$(date +%Y-%m-%d)"
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
@@ -67,12 +144,27 @@ ALPINE_VERSION=${ALPINE_VERSION}
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
EOF
# --- export build info for genapkovl to inject into motd ---
BUILD_DATE=$(date +%Y-%m-%d)
GIT_COMMIT=$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo "unknown")
export BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} alpine:${ALPINE_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
# --- build ISO using mkimage ---
mkdir -p "${DIST_DIR}"
echo ""
echo "=== building ISO ==="
# Install our mkimage profile where mkimage.sh can find it.
# ~/.mkimage is the user plugin directory loaded by mkimage.sh.
mkdir -p "${HOME}/.mkimage"
cp "${BUILDER_DIR}/mkimg.bee.sh" "${HOME}/.mkimage/"
cp "${BUILDER_DIR}/genapkovl-bee.sh" "${HOME}/.mkimage/"
# Export overlay dir so the profile script can find it regardless of SRCDIR.
export BEE_OVERLAY_DIR="${OVERLAY_DIR}"
# Clean workdir selectively: remove everything except apks cache so packages aren't re-downloaded.
# mkimage stores each section in a hash-named subdir; apks_* dirs contain downloaded packages.
if [ -d /var/tmp/bee-iso-work ]; then
find /var/tmp/bee-iso-work -maxdepth 1 -mindepth 1 \
-not -name 'apks_*' -not -name 'kernel_*' \
@@ -80,6 +172,9 @@ if [ -d /var/tmp/bee-iso-work ]; then
-exec rm -rf {} + 2>/dev/null || true
fi
# Run from /var/tmp to avoid git repo context conflicts and to ensure enough scratch space.
# mkinitfs/update-kernel use TMPDIR for initramfs build; tmpfs /tmp is only ~1GB.
# mkimage.sh sources genapkovl-*.sh from CWD (not from ~/.mkimage), so copy it here too.
export TMPDIR=/var/tmp
cp "${BUILDER_DIR}/genapkovl-bee.sh" /var/tmp/
cd /var/tmp
@@ -93,5 +188,9 @@ sh /usr/share/aports/scripts/mkimage.sh \
--profile bee
ISO="${DIST_DIR}/alpine-bee-${ALPINE_VERSION}-x86_64.iso"
echo ""
echo "=== done ==="
echo "ISO: $ISO"
echo "Size: $(du -sh "$ISO" 2>/dev/null | cut -f1 || echo 'not found')"
echo ""
echo "Boot via BMC virtual media and SSH to the server IP on port 22 as root."

View File

@@ -12,18 +12,19 @@ makefile() { OWNER="$1" PERMS="$2" FILENAME="$3"; cat > "$FILENAME"; chown "$OWN
# rc_add <service> <runlevel>
# Enable a service in the overlay tree: create the runlevel directory under $tmp
# and symlink the service's init script (/etc/init.d/<service>) into it.
rc_add() {
    mkdir -p "$tmp/etc/runlevels/$2"
    ln -sf "/etc/init.d/$1" "$tmp/etc/runlevels/$2/$1"
}
mkdir -p "$tmp/etc"
makefile root:root 0644 "$tmp/etc/hostname" <<EOT
makefile root:root 0644 "$tmp/etc/hostname" <<EOF
$HOSTNAME
EOT
EOF
# Empty interfaces file — prevents ifupdown from erroring, bee-network handles DHCP
mkdir -p "$tmp/etc/network"
makefile root:root 0644 "$tmp/etc/network/interfaces" <<EOT
makefile root:root 0644 "$tmp/etc/network/interfaces" <<EOF
auto lo
iface lo inet loopback
EOT
EOF
mkdir -p "$tmp/etc/apk"
makefile root:root 0644 "$tmp/etc/apk/world" <<EOT
makefile root:root 0644 "$tmp/etc/apk/world" <<EOF
alpine-base
dmidecode
smartmontools
@@ -34,12 +35,18 @@ util-linux
lsblk
e2fsprogs
lshw
openrc
ca-certificates
dropbear
libqrencode-tools
tzdata
jq
wget
EOT
ca-certificates
strace
procps
lsof
file
less
vim
dialog
EOF
rc_add devfs sysinit
rc_add dmesg sysinit
@@ -58,14 +65,16 @@ rc_add mount-ro shutdown
rc_add killprocs shutdown
rc_add savecache shutdown
rc_add bee-sshsetup default
rc_add bee-network default
rc_add bee-update default
rc_add dropbear default
rc_add bee-nvidia default
rc_add bee-audit default
rc_add bee-audit-debug default
if [ -d "$OVERLAY/etc" ]; then
cp -r "$OVERLAY/etc/." "$tmp/etc/"
chmod +x "$tmp/etc/init.d/"* 2>/dev/null || true
[ -n "$BEE_BUILD_INFO" ] && sed -i "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "$tmp/etc/motd" 2>/dev/null || true
fi
mkdir -p "$tmp/usr"
@@ -74,9 +83,24 @@ if [ -d "$OVERLAY/usr" ]; then
chmod +x "$tmp/usr/local/bin/"* 2>/dev/null || true
fi
if [ -d "$OVERLAY/root" ]; then
mkdir -p "$tmp/root"
cp -r "$OVERLAY/root/." "$tmp/root/"
chmod 700 "$tmp/root/.ssh" 2>/dev/null || true
chmod 600 "$tmp/root/.ssh/authorized_keys" 2>/dev/null || true
fi
if [ -d "$OVERLAY/lib" ]; then
mkdir -p "$tmp/lib"
cp -r "$OVERLAY/lib/." "$tmp/lib/"
fi
tar -c -C "$tmp" etc usr lib 2>/dev/null | gzip -9n > "$HOSTNAME.apkovl.tar.gz"
mkdir -p "$tmp/etc/dropbear" "$tmp/etc/conf.d"
# -R: auto-generate host keys if missing
# no dependency on networking service — bee-network handles DHCP independently
makefile root:root 0644 "$tmp/etc/conf.d/dropbear" <<EOF
DROPBEAR_OPTS="-R -B"
EOF
tar -c -C "$tmp" etc usr root lib 2>/dev/null | gzip -9n > "$HOSTNAME.apkovl.tar.gz"

View File

@@ -1,9 +1,9 @@
#!/bin/sh
# Alpine mkimage profile: bee (production)
# Alpine mkimage profile: bee
profile_bee() {
title="Bee Hardware Audit"
desc="Hardware audit LiveCD (production unattended mode)"
desc="Hardware audit LiveCD"
arch="x86_64"
hostname="alpine-bee"
apkovl="genapkovl-bee.sh"
@@ -13,7 +13,8 @@ profile_bee() {
kernel_addons=""
initfs_cmdline="modules=loop,squashfs,sd-mod,usb-storage modloop=/boot/modloop-lts quiet"
initfs_features="ata base cdrom ext4 mmc nvme raid scsi squashfs usb virtio nfit"
grub_mod="all_video disk part_gpt part_msdos linux normal configfile search search_label efi_gop fat iso9660 cat echo ls test true help gzio multiboot2 efi_uga"
syslinux_serial="0 115200"
apks="
alpine-base
linux-lts
@@ -38,10 +39,20 @@ profile_bee() {
e2fsprogs
lshw
dropbear
openrc
ca-certificates
libqrencode-tools
tzdata
jq
wget
ca-certificates
strace
procps
lsof
file
less
vim
dialog
gcompat
"
}

176
iso/builder/smoketest.sh Normal file
View File

@@ -0,0 +1,176 @@
#!/bin/sh
# smoketest.sh — run on a live ISO via SSH to verify all critical components.
#
# Usage:
# ssh root@<ip> 'sh -s' < smoketest.sh
# or: scp smoketest.sh root@<ip>:/var/tmp/ && ssh root@<ip> sh /var/tmp/smoketest.sh
#
# Exit code: 0 = all required checks passed, 1 = at least one required check failed.
# Result counters, bumped by the reporting helpers below.
PASS=0
FAIL=0
WARN=0

# ok <message...> — report a passed check and count it.
ok() {
    echo "[ OK ] $*"
    PASS=$((PASS+1))
}

# fail <message...> — report a failed required check and count it.
fail() {
    echo "[ FAIL ] $*"
    FAIL=$((FAIL+1))
}

# warn <message...> — report a non-fatal problem and count it.
warn() {
    echo "[ WARN ] $*"
    WARN=$((WARN+1))
}

# info <message...> — print an informational line; no counter is touched.
info() {
    echo "[ INFO ] $*"
}
# --- banner and kernel version ---
# Print the report header (host name, running kernel, UTC date), then log the
# kernel release once more as an info line.
KVER=$(uname -r)
cat <<EOF
========================================
 bee live ISO smoketest
 host: $(uname -n)
 kernel: ${KVER}
 date: $(date -u)
========================================

EOF
info "kernel: $KVER"
# --- PATH ---
# Look each tool up with /usr/local/bin prepended to PATH, so the result does not
# depend on the profile of the shell running this script.
echo "-- PATH & binaries --"

# Required tools: a missing one is a failure.
for tool in dmidecode smartctl nvme ipmitool lspci audit; do
    p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null) \
        || { fail "$tool: NOT FOUND"; continue; }
    ok "$tool found: $p"
done

# GPU tools: a missing one is only a warning.
for tool in nvidia-smi gpu_burn; do
    p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null) \
        || { warn "$tool: NOT FOUND (optional but expected)"; continue; }
    ok "$tool found: $p"
done

echo ""
# --- NVIDIA kernel modules ---
# Checks that the module directory shipped in the overlay holds .ko files and that
# the three NVIDIA modules are currently loaded.
echo "-- NVIDIA modules --"
KO_DIR="/usr/local/lib/nvidia"
if [ -d "$KO_DIR" ]; then
    # Count with a glob instead of parsing ls; an unmatched glob stays literal
    # and is filtered out by the -e test.
    ko_count=0
    for ko in "$KO_DIR"/*.ko; do
        [ -e "$ko" ] || continue
        ko_count=$((ko_count + 1))
    done
    # An existing but empty directory means the modules never made it into the
    # ISO, so it must not be reported as a pass.
    if [ "$ko_count" -gt 0 ]; then
        ok "NVIDIA ko dir exists: $KO_DIR ($ko_count .ko files)"
    else
        fail "NVIDIA ko dir exists but contains no .ko files: $KO_DIR"
    fi
else
    fail "NVIDIA ko dir missing: $KO_DIR"
fi

for mod in nvidia nvidia_modeset nvidia_uvm; do
    # Trailing space in the pattern keeps "nvidia" from matching "nvidia_uvm" etc.
    if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
        ok "module loaded: $mod"
    else
        fail "module NOT loaded: $mod"
    fi
done
echo ""
# --- nvidia-smi ---
# nvidia-smi must be found (with /usr/local/bin on PATH) and list at least one GPU.
echo "-- nvidia-smi --"
if ! PATH="/usr/local/bin:$PATH" command -v nvidia-smi >/dev/null 2>&1; then
    fail "nvidia-smi: not found in PATH"
elif PATH="/usr/local/bin:$PATH" nvidia-smi -L 2>/dev/null | grep -q "GPU"; then
    gpu_count=$(PATH="/usr/local/bin:$PATH" nvidia-smi -L 2>/dev/null | grep -c "GPU")
    ok "nvidia-smi: $gpu_count GPU(s) found"
else
    fail "nvidia-smi: runs but no GPUs detected"
fi
echo ""

# --- library soname links ---
# Either a regular file or a symlink is accepted for each <lib>.so.1.
echo "-- lib symlinks --"
for lib in libnvidia-ml libcuda; do
    soname="/usr/lib/${lib}.so.1"
    if [ ! -f "$soname" ] && [ ! -L "$soname" ]; then
        fail "$soname MISSING (nvidia-smi will fail)"
    else
        ok "$soname exists"
    fi
done
echo ""

# --- glibc loader provided by gcompat ---
echo "-- gcompat (glibc compat for nvidia-smi) --"
loader=/lib64/ld-linux-x86-64.so.2
if [ ! -L "$loader" ] && [ ! -f "$loader" ]; then
    fail "gcompat: $loader MISSING — nvidia-smi will fail to exec"
else
    ok "gcompat: $loader present"
fi
echo ""
# --- OpenRC services ---
# bee-nvidia and bee-network must report a good status. The remaining services are
# checked only if their init script exists, and a bad status is just a warning.
echo "-- openrc services --"
for svc in bee-nvidia bee-network bee-audit-debug dropbear bee-sshsetup; do
    case "$svc" in
        bee-nvidia|bee-network)
            if rc-service "$svc" status >/dev/null 2>&1; then
                ok "service running: $svc"
            else
                fail "service NOT running: $svc"
            fi
            ;;
        *)
            [ -f "/etc/init.d/$svc" ] || continue
            if rc-service "$svc" status >/dev/null 2>&1; then
                ok "service running: $svc"
            else
                warn "service not running: $svc (may be one-shot)"
            fi
            ;;
    esac
done
echo ""
# --- audit binary ---
# The binary must be executable; its version is queried with "--version" first,
# then "version", and reported as "unknown" if both fail.
echo "-- audit binary --"
AUDIT=/usr/local/bin/audit
if [ ! -x "$AUDIT" ]; then
    fail "audit binary: NOT FOUND at $AUDIT"
else
    ok "audit binary: present"
    ver=$("$AUDIT" --version 2>/dev/null || "$AUDIT" version 2>/dev/null || echo "unknown")
    info "audit version: $ver"
fi
echo ""
# --- audit last run ---
# Inspects /var/log/bee-audit.log for the start/completion markers and for the
# "nvidia: enrichment skipped" message.
echo "-- audit last run --"
AUDIT_LOG=/var/log/bee-audit.log
if [ -f "$AUDIT_LOG" ]; then
    last_line=$(tail -1 "$AUDIT_LOG")
    info "last log line: $last_line"

    audit_done=0
    if grep -q "audit completed" "$AUDIT_LOG" 2>/dev/null; then
        audit_done=1
        ok "audit: completed successfully"
    elif grep -q "audit started" "$AUDIT_LOG" 2>/dev/null; then
        warn "audit: started but may not have completed"
    else
        # A log with neither marker used to pass through without any report.
        warn "audit: log has neither a start nor a completion marker"
    fi

    # check for nvidia enrichment
    if grep -q "nvidia: enrichment skipped" "$AUDIT_LOG" 2>/dev/null; then
        reason=$(grep "nvidia: enrichment skipped" "$AUDIT_LOG" | tail -1)
        fail "audit: nvidia enrichment skipped — $reason"
    elif [ "$audit_done" = "1" ]; then
        ok "audit: nvidia enrichment OK"
    else
        # The absence of a "skipped" line proves nothing if the audit never
        # finished, so do not count it as a pass.
        warn "audit: nvidia enrichment not verified (audit did not complete)"
    fi
else
    warn "audit: no log found at /var/log/bee-audit.log"
fi
echo ""
# --- network ---
# Requires a default route and a reply to a single ping of 1.1.1.1.
echo "-- network --"
if ! ip route show default 2>/dev/null | grep -q "default"; then
    fail "no default route"
else
    gw=$(ip route show default | awk '{print $3}' | head -1)
    ok "default route: $gw"
fi

if ! ping -c1 -W3 1.1.1.1 >/dev/null 2>&1; then
    fail "internet: unreachable"
else
    ok "internet: reachable (1.1.1.1)"
fi
echo ""
# --- login profile PATH ---
# /etc/profile.d/bee.sh is expected to mention /usr/local/bin.
echo "-- /etc/profile.d/bee.sh PATH --"
if ! grep -q "/usr/local/bin" /etc/profile.d/bee.sh 2>/dev/null; then
    fail "/etc/profile.d/bee.sh does not add /usr/local/bin to PATH"
else
    ok "/etc/profile.d/bee.sh exports /usr/local/bin"
fi
echo ""

# --- summary ---
# Exit status is 0 only when no required check failed; warnings do not affect it.
echo "========================================"
echo " Results: OK=$PASS FAIL=$FAIL WARN=$WARN"
echo "========================================"
if [ "$FAIL" -eq 0 ]; then
    exit 0
fi
exit 1