From ec9c65e20e5bde237b4ca9ae06e0f74b6fdcec2a Mon Sep 17 00:00:00 2001
From: Michael Chus
Date: Thu, 5 Mar 2026 18:01:11 +0300
Subject: [PATCH] feat: build NVIDIA open kernel modules during ISO build

- build-nvidia-module.sh: downloads nvidia open-gpu-kernel-modules source,
  builds against linux-lts headers, extracts nvidia-smi from .run installer
- modules cached by driver version + kernel version (rebuild only on update)
- .ko files injected into ISO overlay at /lib/modules/<kver>/extra/nvidia/
- bee-nvidia init script loads nvidia/nvidia-modeset/nvidia-uvm at boot
- NVIDIA_DRIVER_VERSION=550.54.15 (Turing+, H100/A100 supported)

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (below the "---" marker, so not part of the commit message):
- build-nvidia-module.sh: make is no longer piped into tail (the pipeline
  hid make failures from "set -e"); downloads go through a .part file; the
  cache is staged in <cache>.partial and renamed once complete; a cache hit
  now requires modules/nvidia.ko.
- build-debug.sh: stops when the module build fails or no kernel headers
  are found.
- bee-nvidia is created with mode 100755; no chmod for it is visible in
  this patch and the script is run through its #!/sbin/openrc-run line.
- Blob ids on the "index" lines were not recomputed for the changed files.

 iso/builder/build-debug.sh              |  32 +++++-
 iso/builder/build-nvidia-module.sh      | 159 ++++++++++++++++++++++++
 iso/builder/genapkovl-bee_debug.sh      |   1 +
 iso/overlay-debug/etc/init.d/bee-nvidia |  23 ++++
 4 files changed, 213 insertions(+), 2 deletions(-)
 create mode 100644 iso/builder/build-nvidia-module.sh
 create mode 100755 iso/overlay-debug/etc/init.d/bee-nvidia

diff --git a/iso/builder/build-debug.sh b/iso/builder/build-debug.sh
index 340788d..0529ad9 100644
--- a/iso/builder/build-debug.sh
+++ b/iso/builder/build-debug.sh
@@ -2,8 +2,7 @@
 # build-debug.sh — build bee debug ISO with SSH access
 #
 # Debug ISO purpose: test audit binary on real hardware.
-# Includes dropbear SSH, all audit packages, audit binary.
-# Does NOT include NVIDIA driver (added in production build).
+# Includes dropbear SSH, all audit packages, audit binary, NVIDIA open kernel modules.
 #
 # Run on Alpine builder VM as root after setup-builder.sh.
 # Usage:
@@ -101,6 +100,35 @@ mkdir -p "${OVERLAY_DIR}/usr/local/bin"
 cp "${DIST_DIR}/bee-audit-linux-amd64" "${OVERLAY_DIR}/usr/local/bin/audit"
 chmod +x "${OVERLAY_DIR}/usr/local/bin/audit"
 
+# --- build NVIDIA kernel modules and inject into overlay ---
+: "${NVIDIA_DRIVER_VERSION:?NVIDIA_DRIVER_VERSION must be set}"
+echo ""
+echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
+sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" || exit 1
+
+# Determine kernel version (same as what goes into the ISO — both use linux-lts from same Alpine).
+# Uses the same first-match rule as build-nvidia-module.sh so both agree on the cache path.
+KVER=""
+for _hdr in /usr/src/linux-headers-*; do
+    [ -d "${_hdr}" ] || continue
+    KVER="${_hdr#/usr/src/linux-headers-}"
+    break
+done
+[ -n "${KVER}" ] || { echo "ERROR: no /usr/src/linux-headers-* found" >&2; exit 1; }
+NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
+
+# Inject .ko files into overlay at /lib/modules/<kver>/extra/nvidia/
+OVERLAY_KMOD_DIR="${OVERLAY_DIR}/lib/modules/${KVER}/extra/nvidia"
+mkdir -p "${OVERLAY_KMOD_DIR}"
+cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
+
+# Inject nvidia-smi and libnvidia-ml
+mkdir -p "${OVERLAY_DIR}/usr/local/bin" "${OVERLAY_DIR}/usr/lib"
+cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_DIR}/usr/local/bin/"
+chmod +x "${OVERLAY_DIR}/usr/local/bin/nvidia-smi"
+# libnvidia-ml is optional: a missing or empty lib directory is ignored
+cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_DIR}/usr/lib/" 2>/dev/null || true
+
 # --- build ISO using mkimage ---
 mkdir -p "${DIST_DIR}"
 echo ""
diff --git a/iso/builder/build-nvidia-module.sh b/iso/builder/build-nvidia-module.sh
new file mode 100644
index 0000000..3bd3da4
--- /dev/null
+++ b/iso/builder/build-nvidia-module.sh
@@ -0,0 +1,159 @@
+#!/bin/sh
+# build-nvidia-module.sh — build NVIDIA open kernel modules and extract nvidia-smi
+#
+# Builds NVIDIA open-gpu-kernel-modules from source against the installed linux-lts
+# kernel headers. Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so
+# subsequent builds are instant unless NVIDIA_DRIVER_VERSION or kernel changes.
+#
+# Usage: build-nvidia-module.sh <nvidia-version> <dist-dir>
+#
+# Output layout:
+#   $CACHE_DIR/modules/ — nvidia*.ko files (stripped)
+#   $CACHE_DIR/bin/     — nvidia-smi
+#   $CACHE_DIR/lib/     — libnvidia-ml.so.* (for nvidia-smi; copied best-effort)
+#
+# The cache is assembled in "$CACHE_DIR.partial" and renamed into place only
+# after every step succeeded, so a failed run never leaves a half-filled
+# directory that a later run would accept as a valid cache.
+
+set -eu
+
+NVIDIA_VERSION="${1:-}"
+DIST_DIR="${2:-}"
+
+# Print an error message to stderr and abort the script.
+die() {
+    echo "ERROR: $*" >&2
+    exit 1
+}
+
+# Print the kernel version of the first /usr/src/linux-headers-* tree
+# (prints nothing when no headers are installed).
+detect_kver() {
+    for hdr in /usr/src/linux-headers-*; do
+        [ -d "$hdr" ] || continue
+        echo "${hdr#/usr/src/linux-headers-}"
+        return 0
+    done
+}
+
+# Print the number of .ko files below directory $1.
+count_ko() {
+    find "$1" -name '*.ko' | wc -l
+}
+
+# fetch URL FILE LABEL — download URL to FILE unless FILE already exists.
+# Data is written to FILE.part first so an interrupted or failed download
+# is never mistaken for a complete file on the next run.
+fetch() {
+    if [ -f "$2" ]; then
+        return 0
+    fi
+    echo "=== downloading $3 ==="
+    rm -f "$2.part"
+    wget -q -O "$2.part" "$1" || { rm -f "$2.part"; die "download failed: $1"; }
+    mv "$2.part" "$2"
+}
+
+[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir>" >&2; exit 1; }
+[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir>" >&2; exit 1; }
+
+# Detect kernel version from installed headers
+KVER=$(detect_kver)
+if [ -z "$KVER" ]; then
+    echo "=== installing linux-lts-dev ==="
+    apk add --quiet linux-lts-dev
+    KVER=$(detect_kver)
+fi
+[ -n "$KVER" ] || die "no /usr/src/linux-headers-* tree found"
+KDIR="/usr/src/linux-headers-${KVER}"
+echo "=== NVIDIA ${NVIDIA_VERSION} for kernel ${KVER} ==="
+
+CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
+# Require nvidia.ko itself: a bare modules/ directory may be empty.
+if [ -f "$CACHE_DIR/modules/nvidia.ko" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then
+    echo "=== NVIDIA modules cached, skipping build ==="
+    echo "cache: $CACHE_DIR"
+    echo "modules: $(count_ko "$CACHE_DIR/modules") .ko files"
+    exit 0
+fi
+
+# Install build dependencies
+echo "=== installing build deps ==="
+apk add --quiet gcc make perl linux-lts-dev wget tar
+
+# Scratch directory for the build and staging directory for the new cache
+BUILD_TMP="/var/tmp/nvidia-build"
+rm -rf "$BUILD_TMP"
+mkdir -p "$BUILD_TMP"
+STAGE_DIR="${CACHE_DIR}.partial"
+rm -rf "$STAGE_DIR"
+mkdir -p "$STAGE_DIR/modules" "$STAGE_DIR/bin" "$STAGE_DIR/lib"
+
+# Download and build open kernel modules
+SRC_TGZ="/var/tmp/nvidia-open-${NVIDIA_VERSION}.tar.gz"
+fetch "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${NVIDIA_VERSION}.tar.gz" \
+    "$SRC_TGZ" "NVIDIA open kernel modules source"
+
+echo "=== extracting source ==="
+tar -xzf "$SRC_TGZ" -C "$BUILD_TMP"
+SRC_DIR="$BUILD_TMP/open-gpu-kernel-modules-${NVIDIA_VERSION}"
+
+NPROC=$(nproc)
+BUILD_LOG="$BUILD_TMP/build.log"
+echo "=== building kernel modules (${NPROC} cores) ==="
+# Log to a file instead of piping into tail: in a pipeline the exit status is
+# tail's, so "set -e" would not notice a failed make.
+if ! make -C "$SRC_DIR" -j"$NPROC" \
+        KERNEL_SOURCE_PATH="$KDIR" \
+        IGNORE_MISSING_MODULE_SYMVERS=1 \
+        modules >"$BUILD_LOG" 2>&1; then
+    tail -n 50 "$BUILD_LOG" >&2
+    die "kernel module build failed (full log: $BUILD_LOG)"
+fi
+tail -n 5 "$BUILD_LOG"
+
+# Collect .ko files
+find "$SRC_DIR" -name '*.ko' -exec cp {} "$STAGE_DIR/modules/" \;
+[ -f "$STAGE_DIR/modules/nvidia.ko" ] || die "build finished but produced no nvidia.ko"
+# Strip debug info from modules to reduce size
+for ko in "$STAGE_DIR"/modules/*.ko; do
+    strip --strip-debug "$ko" 2>/dev/null || true
+done
+echo "modules: $(count_ko "$STAGE_DIR/modules") .ko files"
+
+# Extract nvidia-smi and required libraries from the .run installer
+RUN_FILE="/var/tmp/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
+fetch "https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run" \
+    "$RUN_FILE" "NVIDIA installer (for nvidia-smi)"
+
+echo "=== extracting nvidia-smi ==="
+chmod +x "$RUN_FILE"
+EXTRACT_DIR="/var/tmp/nvidia-extract-${NVIDIA_VERSION}"
+EXTRACT_LOG="$BUILD_TMP/extract.log"
+rm -rf "$EXTRACT_DIR"
+# The installer's exit status is still ignored; success is judged by whether
+# nvidia-smi was unpacked, and the installer output is shown if it was not.
+"$RUN_FILE" --extract-only --target "$EXTRACT_DIR" >"$EXTRACT_LOG" 2>&1 || true
+if [ ! -f "$EXTRACT_DIR/nvidia-smi" ]; then
+    tail -n 20 "$EXTRACT_LOG" >&2
+    die "nvidia-smi not found in $EXTRACT_DIR after extraction"
+fi
+
+cp "$EXTRACT_DIR/nvidia-smi" "$STAGE_DIR/bin/"
+# libnvidia-ml is copied best-effort; a missing library is not fatal here.
+cp "$EXTRACT_DIR/libnvidia-ml.so."* "$STAGE_DIR/lib/" 2>/dev/null || true
+# The installer is assumed to ship only libnvidia-ml.so.<version>; add the
+# libnvidia-ml.so.1 name as a symlink when that file exists and .so.1 does not.
+if [ -f "$STAGE_DIR/lib/libnvidia-ml.so.${NVIDIA_VERSION}" ] &&
+        [ ! -e "$STAGE_DIR/lib/libnvidia-ml.so.1" ]; then
+    ln -s "libnvidia-ml.so.${NVIDIA_VERSION}" "$STAGE_DIR/lib/libnvidia-ml.so.1"
+fi
+
+# Publish the finished cache in one step
+rm -rf "$CACHE_DIR"
+mv "$STAGE_DIR" "$CACHE_DIR"
+
+echo "=== NVIDIA build complete ==="
+echo "cache: $CACHE_DIR"
+ls -lh "$CACHE_DIR/bin/" "$CACHE_DIR/modules/"
diff --git a/iso/builder/genapkovl-bee_debug.sh b/iso/builder/genapkovl-bee_debug.sh
index 64b1ec8..1378d42 100755
--- a/iso/builder/genapkovl-bee_debug.sh
+++ b/iso/builder/genapkovl-bee_debug.sh
@@ -67,6 +67,7 @@ rc_add savecache shutdown
 rc_add bee-sshsetup default
 rc_add bee-network default
 rc_add dropbear default
+rc_add bee-nvidia default
 rc_add bee-audit-debug default
 
 if [ -d "$OVERLAY/etc" ]; then cp -r "$OVERLAY/etc/." "$tmp/etc/"; fi
diff --git a/iso/overlay-debug/etc/init.d/bee-nvidia b/iso/overlay-debug/etc/init.d/bee-nvidia
new file mode 100755
index 0000000..f4f6fe3
--- /dev/null
+++ b/iso/overlay-debug/etc/init.d/bee-nvidia
@@ -0,0 +1,23 @@
+#!/sbin/openrc-run
+
+description="Bee: load NVIDIA kernel modules"
+
+depend() {
+    need localmount
+    before bee-audit-debug
+}
+
+start() {
+    ebegin "Loading NVIDIA modules"
+    # Run depmod so kernel can locate our modules in /lib/modules/.../extra/
+    depmod -a 2>/dev/null || true
+
+    for mod in nvidia nvidia-modeset nvidia-uvm; do
+        if modprobe "$mod" 2>/dev/null; then
+            einfo "loaded: $mod"
+        else
+            ewarn "failed to load: $mod"
+        fi
+    done
+    eend 0
+}