feat: build NVIDIA open kernel modules during ISO build

- build-nvidia-module.sh: downloads nvidia open-gpu-kernel-modules source,
  builds against linux-lts headers, extracts nvidia-smi from .run installer
- modules cached by driver version + kernel version (rebuild only on update)
- .ko files injected into ISO overlay at /lib/modules/<kver>/extra/nvidia/
- bee-nvidia init script loads nvidia/nvidia-modeset/nvidia-uvm at boot
- NVIDIA_DRIVER_VERSION=550.54.15 (Turing+, H100/A100 supported)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-05 18:01:11 +03:00
parent 5475a0aa77
commit ec9c65e20e
4 changed files with 141 additions and 2 deletions

View File

@@ -2,8 +2,7 @@
# build-debug.sh — build bee debug ISO with SSH access
#
# Debug ISO purpose: test audit binary on real hardware.
# Includes dropbear SSH, all audit packages, audit binary.
# Does NOT include NVIDIA driver (added in production build).
# Includes dropbear SSH, all audit packages, audit binary, NVIDIA open kernel modules.
#
# Run on Alpine builder VM as root after setup-builder.sh.
# Usage:
@@ -101,6 +100,26 @@ mkdir -p "${OVERLAY_DIR}/usr/local/bin"
cp "${DIST_DIR}/bee-audit-linux-amd64" "${OVERLAY_DIR}/usr/local/bin/audit"
chmod +x "${OVERLAY_DIR}/usr/local/bin/audit"
# --- build NVIDIA kernel modules and inject into overlay ---
# Runs build-nvidia-module.sh (which fills a per-driver/per-kernel cache under
# DIST_DIR) and copies the cached .ko files, nvidia-smi and its libraries into
# the ISO overlay. Reads BUILDER_DIR, DIST_DIR, OVERLAY_DIR and
# NVIDIA_DRIVER_VERSION, which are set earlier in this script.
echo ""
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
# Stop here if the module build fails: everything below copies from its cache.
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" || {
  echo "ERROR: build-nvidia-module.sh failed" >&2
  exit 1
}

# Determine kernel version (same as what goes into the ISO — both use linux-lts from same Alpine).
# This pipeline must stay identical to the one in build-nvidia-module.sh,
# because the cache directory name embeds its result.
KVER=$(ls /usr/src/ 2>/dev/null | grep '^linux-headers-' | sed 's/linux-headers-//' | head -1)
if [ -z "${KVER}" ]; then
  # An empty KVER would yield paths such as lib/modules//extra/nvidia.
  echo "ERROR: no /usr/src/linux-headers-* found; cannot determine kernel version" >&2
  exit 1
fi
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"

# Inject .ko files into overlay at /lib/modules/<kver>/extra/nvidia/
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/lib/modules/${KVER}/extra/nvidia"
mkdir -p "${OVERLAY_KMOD_DIR}"
_nvidia_ko_found=0
for _nvidia_ko in "${NVIDIA_CACHE}/modules/"*.ko; do
  # An unmatched glob stays literal; skip it instead of handing it to cp.
  [ -f "${_nvidia_ko}" ] || continue
  cp "${_nvidia_ko}" "${OVERLAY_KMOD_DIR}/" || exit 1
  _nvidia_ko_found=1
done
if [ "${_nvidia_ko_found}" -ne 1 ]; then
  echo "ERROR: no .ko files in ${NVIDIA_CACHE}/modules" >&2
  exit 1
fi

# Inject nvidia-smi and libnvidia-ml
mkdir -p "${OVERLAY_DIR}/usr/local/bin" "${OVERLAY_DIR}/usr/lib"
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_DIR}/usr/local/bin/"
chmod +x "${OVERLAY_DIR}/usr/local/bin/nvidia-smi"
# Libraries remain best-effort (the build script treats them as optional),
# but a miss is now reported instead of being silently ignored.
if ! cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_DIR}/usr/lib/" 2>/dev/null; then
  echo "WARNING: no NVIDIA libraries copied from ${NVIDIA_CACHE}/lib; nvidia-smi may not run" >&2
fi
# --- build ISO using mkimage ---
mkdir -p "${DIST_DIR}"
echo ""

View File

@@ -0,0 +1,96 @@
#!/bin/sh
# build-nvidia-module.sh — build NVIDIA open kernel modules and extract nvidia-smi
#
# Builds NVIDIA open-gpu-kernel-modules from source against the installed
# linux-lts kernel headers. Output is cached in DIST_DIR/nvidia-<version>-<kver>/
# so subsequent builds are instant unless NVIDIA_DRIVER_VERSION or the kernel
# changes.
#
# Usage: build-nvidia-module.sh <nvidia-version> <dist-dir>
#
# Output layout:
#   $CACHE_DIR/modules/ — nvidia*.ko files (debug info stripped)
#   $CACHE_DIR/bin/     — nvidia-smi
#   $CACHE_DIR/lib/     — libnvidia-ml.so.* incl. a libnvidia-ml.so.1 name
#                         (best effort; only needed to run nvidia-smi)
#
# Side effects: installs packages with apk, keeps downloaded archives in
# /var/tmp for reuse, and rebuilds /var/tmp/nvidia-build from scratch.
# Exit status: 0 on success or cache hit, non-zero on any failure.
set -e

# Print an error to stderr and abort.
die() {
  echo "ERROR: $*" >&2
  exit 1
}

# Print the kernel version taken from the first /usr/src/linux-headers-* entry.
# build-debug.sh derives the cache directory name with the same pipeline, so
# the two must stay in sync.
detect_kver() {
  ls /usr/src/ 2>/dev/null | grep '^linux-headers-' | sed 's/linux-headers-//' | head -1
}

# Succeed if directory $1 contains at least one regular *.ko file.
has_ko() {
  for _ko in "$1"/*.ko; do
    if [ -f "$_ko" ]; then
      return 0
    fi
  done
  return 1
}

# Print the number of *.ko files directly inside directory $1.
count_ko() {
  find "$1" -maxdepth 1 -type f -name '*.ko' 2>/dev/null | wc -l | tr -d ' '
}

# fetch <label> <url> <dest>: download <url> to <dest> unless a non-empty
# <dest> already exists. The data goes to <dest>.part first, so an interrupted
# download is never mistaken for a complete file on the next run.
fetch() {
  if [ -s "$3" ]; then
    return 0
  fi
  echo "=== downloading $1 ==="
  rm -f "$3" "$3.part"
  if ! wget -q -O "$3.part" "$2"; then
    rm -f "$3.part"
    die "download failed: $2"
  fi
  mv "$3.part" "$3"
}

NVIDIA_VERSION="$1"
DIST_DIR="$2"
if [ -z "$NVIDIA_VERSION" ] || [ -z "$DIST_DIR" ]; then
  echo "usage: $0 <nvidia-version> <dist-dir>" >&2
  exit 1
fi

# Detect kernel version from installed headers
KVER=$(detect_kver)
if [ -z "$KVER" ]; then
  echo "=== installing linux-lts-dev ==="
  apk add --quiet linux-lts-dev
  KVER=$(detect_kver)
fi
[ -n "$KVER" ] || die "no /usr/src/linux-headers-* found after installing linux-lts-dev"
KDIR="/usr/src/linux-headers-${KVER}"
[ -d "$KDIR" ] || die "kernel headers directory not found: $KDIR"
echo "=== NVIDIA ${NVIDIA_VERSION} for kernel ${KVER} ==="

CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
# The cache only counts when it really holds modules: a bare modules/
# directory left by a failed build must not short-circuit the rebuild.
if has_ko "$CACHE_DIR/modules" && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then
  echo "=== NVIDIA modules cached, skipping build ==="
  echo "cache: $CACHE_DIR"
  echo "modules: $(count_ko "$CACHE_DIR/modules") .ko files"
  exit 0
fi

# Install build dependencies
echo "=== installing build deps ==="
apk add --quiet gcc make perl linux-lts-dev wget tar

# Download and build open kernel modules
BUILD_TMP="/var/tmp/nvidia-build"
rm -rf "$BUILD_TMP"
mkdir -p "$BUILD_TMP"
SRC_TGZ="/var/tmp/nvidia-open-${NVIDIA_VERSION}.tar.gz"
fetch "NVIDIA open kernel modules source" \
  "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${NVIDIA_VERSION}.tar.gz" \
  "$SRC_TGZ"

echo "=== extracting source ==="
tar -xzf "$SRC_TGZ" -C "$BUILD_TMP"
SRC_DIR="$BUILD_TMP/open-gpu-kernel-modules-${NVIDIA_VERSION}"
[ -d "$SRC_DIR" ] || die "expected source directory missing after extraction: $SRC_DIR"

NPROC=$(nproc)
BUILD_LOG="$BUILD_TMP/build.log"
echo "=== building kernel modules (${NPROC} cores) ==="
cd "$SRC_DIR"
# The output goes to a log file instead of `make | tail`: without pipefail the
# pipeline's status is tail's, so `set -e` would never see a failed build.
# NOTE(review): SYSSRC is the kernel-tree variable read by NVIDIA's
# kernel-open Makefile; KERNEL_SOURCE_PATH is kept as originally passed.
# Confirm against the Makefile of the pinned driver version.
if ! make -j"${NPROC}" \
  SYSSRC="$KDIR" \
  KERNEL_SOURCE_PATH="$KDIR" \
  IGNORE_MISSING_MODULE_SYMVERS=1 \
  modules >"$BUILD_LOG" 2>&1; then
  tail -n 40 "$BUILD_LOG" >&2
  die "kernel module build failed (full log: $BUILD_LOG)"
fi
tail -n 5 "$BUILD_LOG"

# Populate a staging directory and move it into place only at the very end,
# so an interrupted run never leaves a half-filled cache behind.
STAGE_DIR="${CACHE_DIR}.tmp"
rm -rf "$STAGE_DIR"
mkdir -p "$STAGE_DIR/modules" "$STAGE_DIR/bin" "$STAGE_DIR/lib"

# Collect .ko files
find "$SRC_DIR" -type f -name '*.ko' -exec cp {} "$STAGE_DIR/modules/" \;
has_ko "$STAGE_DIR/modules" || die "build finished but produced no .ko files (log: $BUILD_LOG)"

# Strip debug info from modules to reduce size (best effort: an unstripped
# module is larger but still loadable).
for ko in "$STAGE_DIR"/modules/*.ko; do
  strip --strip-debug "$ko" 2>/dev/null || echo "WARNING: could not strip $ko" >&2
done
echo "modules: $(count_ko "$STAGE_DIR/modules") .ko files"

# Extract nvidia-smi and required libraries from the .run installer
RUN_FILE="/var/tmp/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
fetch "NVIDIA installer (for nvidia-smi)" \
  "https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run" \
  "$RUN_FILE"

echo "=== extracting nvidia-smi ==="
chmod +x "$RUN_FILE"
EXTRACT_DIR="/var/tmp/nvidia-extract-${NVIDIA_VERSION}"
rm -rf "$EXTRACT_DIR"
# The installer's exit status is still ignored, as before; success is judged
# by whether nvidia-smi actually appeared in the target directory.
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR" 2>/dev/null || true
[ -f "$EXTRACT_DIR/nvidia-smi" ] || die "nvidia-smi not found after extracting $RUN_FILE"
cp "$EXTRACT_DIR/nvidia-smi" "$STAGE_DIR/bin/"

# Libraries are best effort, but a miss is reported rather than hidden.
cp "$EXTRACT_DIR"/libnvidia-ml.so.* "$STAGE_DIR/lib/" 2>/dev/null ||
  echo "WARNING: libnvidia-ml not found in installer payload; nvidia-smi may not run" >&2
# Provide the libnvidia-ml.so.1 name promised in the output layout when the
# payload only carries the fully versioned file.
if [ ! -e "$STAGE_DIR/lib/libnvidia-ml.so.1" ] &&
  [ -f "$STAGE_DIR/lib/libnvidia-ml.so.${NVIDIA_VERSION}" ]; then
  ln -s "libnvidia-ml.so.${NVIDIA_VERSION}" "$STAGE_DIR/lib/libnvidia-ml.so.1"
fi

# Publish the finished cache.
rm -rf "${CACHE_DIR:?}"
mv "$STAGE_DIR" "$CACHE_DIR"

echo "=== NVIDIA build complete ==="
echo "cache: $CACHE_DIR"
ls -lh "$CACHE_DIR/bin/" "$CACHE_DIR/modules/"

View File

@@ -67,6 +67,7 @@ rc_add savecache shutdown
# Register the bee services in the "default" runlevel.
# NOTE(review): rc_add is defined outside this excerpt; from its call shape it
# takes <service> <runlevel> — confirm against its definition.
rc_add bee-sshsetup default
rc_add bee-network default
rc_add dropbear default
# bee-nvidia is the init script that loads the NVIDIA kernel modules.
rc_add bee-nvidia default
rc_add bee-audit-debug default
# Copy the contents of $OVERLAY/etc into $tmp/etc, but only when the overlay
# actually provides an etc directory.
if [ -d "$OVERLAY/etc" ]; then cp -r "$OVERLAY/etc/." "$tmp/etc/"; fi

View File

@@ -0,0 +1,23 @@
#!/sbin/openrc-run
# bee-nvidia — OpenRC service that loads the NVIDIA kernel modules.
# Loading is best effort: a module that fails to load produces a warning,
# and the service still reports success.

description="Bee: load NVIDIA kernel modules"

# Requires local filesystems to be mounted; ordered ahead of bee-audit-debug.
depend() {
  need localmount
  before bee-audit-debug
}

# Refresh the module dependency index, then try each NVIDIA module in turn.
start() {
  ebegin "Loading NVIDIA modules"

  # Run depmod so modprobe can find the modules under /lib/modules/.../extra/;
  # a depmod failure is ignored.
  depmod -a 2>/dev/null || :

  for kmod in nvidia nvidia-modeset nvidia-uvm; do
    modprobe "$kmod" 2>/dev/null || {
      ewarn "failed to load: $kmod"
      continue
    }
    einfo "loaded: $kmod"
  done

  # Always report success.
  eend 0
}