Files
bee/iso/builder/build-nvidia-module.sh
Mikhail Chusavitin 1768bb58dd Merge debug/prod into single ISO build, fix NVIDIA module loading
## ISO build consolidation
- Remove separate debug/prod split: overlay-debug/, build-debug.sh,
  mkimg.bee_debug.sh, genapkovl-bee_debug.sh all deleted
- Single overlay: iso/overlay/ (was overlay-debug content)
- Single build script: build.sh (SSH, TUI, NVIDIA, vendor tools, bee-release)
- Single mkimage profile: bee (with dropbear, dialog, strace, gcompat, etc.)

## NVIDIA fixes
- Modules now stored at /usr/local/lib/nvidia/ instead of
  /lib/modules/<kver>/extra/nvidia/ — modloop squashfs mounts over that
  path at boot making overlay content there inaccessible
- bee-nvidia init: load via insmod (absolute path), not modprobe
- bee-nvidia init: create libnvidia-ml.so.1/libcuda.so.1 symlinks in /usr/lib/
- build-nvidia-module.sh: always install linux-lts-dev (not conditional) —
  stale 6.6.x headers caused wrong-kernel modules that never loaded at runtime
- build-nvidia-module.sh: create soname symlinks in cache
- KERNEL_VERSION in VERSIONS updated 6.6 → 6.12
- gcompat added to ISO packages (nvidia-smi is a glibc binary on musl Alpine)

## Service ordering
- bee-audit: add `after bee-nvidia` so NVIDIA enrichment always succeeds

## New tooling
- iso/builder/smoketest.sh: SSH smoke test for post-boot ISO validation
- iso/builder/build-gpu-burn.sh: builds gpu_burn vendor binary (CUDA 12.8+)
- vendor/gpu_burn included automatically if placed in iso/vendor/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 20:14:18 +03:00

123 lines
4.6 KiB
Bash

#!/bin/sh
# build-nvidia-module.sh — install NVIDIA proprietary driver into ISO overlay
#
# Downloads the official NVIDIA .run installer, extracts kernel modules and
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
#
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
#
# Output layout:
# $CACHE_DIR/modules/ — nvidia*.ko files
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
set -e
NVIDIA_VERSION="$1"
DIST_DIR="$2"
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir>"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir>"; exit 1; }
# Always install linux-lts-dev to ensure headers match the ISO's kernel (Alpine 3.21 = 6.12.x).
# Without this, a builder with stale 6.6.x headers produces modules for the wrong kernel version.
echo "=== installing linux-lts-dev ==="
apk add --quiet linux-lts-dev
# Detect kernel version from installed headers (pick highest version if multiple).
detect_kver() {
ls /usr/src/ 2>/dev/null \
| grep '^linux-headers-' \
| sed 's/linux-headers-//' \
| sort -V \
| tail -1
}
KVER="$(detect_kver)"
KDIR="/usr/src/linux-headers-${KVER}"
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then
echo "=== NVIDIA cached, skipping build ==="
echo "cache: $CACHE_DIR"
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
exit 0
fi
# Install build dependencies
apk add --quiet gcc make perl linux-lts-dev wget
# Download official NVIDIA .run installer (proprietary) with sha256 verification
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
RUN_FILE="/var/tmp/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
SHA_FILE="/var/tmp/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run.sha256sum"
verify_run() {
[ -s "$SHA_FILE" ] || return 1
[ -s "$RUN_FILE" ] || return 1
cd /var/tmp
sha256sum -c "$SHA_FILE" --status 2>/dev/null
}
if ! verify_run; then
rm -f "$RUN_FILE" "$SHA_FILE"
echo "=== downloading NVIDIA ${NVIDIA_VERSION} installer ==="
wget -q -O "$SHA_FILE" "${BASE_URL}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run.sha256sum"
echo "sha256: $(cat "$SHA_FILE")"
wget --show-progress -O "$RUN_FILE" "${BASE_URL}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
echo "=== verifying sha256 ==="
cd /var/tmp && sha256sum -c "$SHA_FILE" || { echo "ERROR: sha256 mismatch"; rm -f "$RUN_FILE"; exit 1; }
echo "sha256 OK"
else
echo "=== NVIDIA installer verified from cache ==="
fi
# Extract installer contents
echo "=== extracting installer ==="
chmod +x "$RUN_FILE"
EXTRACT_DIR="/var/tmp/nvidia-extract-${NVIDIA_VERSION}"
rm -rf "$EXTRACT_DIR"
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
KERNEL_SRC=""
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
done
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
echo "kernel source: $KERNEL_SRC"
# Build kernel modules from extracted source
echo "=== building kernel modules ($(nproc) cores) ==="
cd "$KERNEL_SRC"
make -j$(nproc) KERNEL_UNAME="$KVER" SYSSRC="$KDIR" modules 2>&1 | tail -5
# Collect outputs
mkdir -p "$CACHE_DIR/modules" "$CACHE_DIR/bin" "$CACHE_DIR/lib"
find "$EXTRACT_DIR/kernel" -name '*.ko' -exec cp {} "$CACHE_DIR/modules/" \;
for ko in "$CACHE_DIR/modules/"*.ko; do
strip --strip-debug "$ko" 2>/dev/null || true
done
cp "$EXTRACT_DIR/nvidia-smi" "$CACHE_DIR/bin/"
cp "$EXTRACT_DIR/libnvidia-ml.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
# libcuda stub needed by nvidia-smi at runtime
cp "$EXTRACT_DIR/libcuda.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat)
for lib in libnvidia-ml libcuda; do
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9]* 2>/dev/null | head -1)
[ -n "$versioned" ] || continue
base=$(basename "$versioned")
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1" 2>/dev/null || true
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
done
echo "=== NVIDIA build complete ==="
echo "cache: $CACHE_DIR"
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko | wc -l) .ko files"
ls -lh "$CACHE_DIR/bin/"