Files
bee/iso/builder/build-nvidia-module.sh

159 lines
6.3 KiB
Bash

#!/bin/sh
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
#
# Downloads the official NVIDIA .run installer, extracts kernel modules and
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
#
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
#
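# Output layout:
# $CACHE_DIR/modules/ — nvidia*.ko files
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-bug-report.sh (if present in the installer)
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
# $CACHE_DIR/firmware/ — contents of the installer's firmware/ dir (if present)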
# Abort as soon as any unhandled command fails.
set -e

# Positional arguments: driver version, output root, Debian kernel ABI string.
NVIDIA_VERSION="$1"
DIST_DIR="$2"
DEBIAN_KERNEL_ABI="$3"

# All three arguments are mandatory; print usage and exit 1 if any is empty.
if [ -z "$NVIDIA_VERSION" ] || [ -z "$DIST_DIR" ] || [ -z "$DEBIAN_KERNEL_ABI" ]; then
  echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"
  exit 1
fi
# Full kernel version string: the Debian ABI with the amd64 flavour suffix.
KVER="${DEBIAN_KERNEL_ABI}-amd64"

# Debian ships kernel headers as two packages:
#   linux-headers-<kver>         arch-specific (generated files, Makefile)
#   linux-headers-<kver>-common  shared source headers (linux/, asm-generic/, ...)
# NVIDIA's conftest wants SYSSRC at the common tree (source headers such as
# linux/mm.h) and SYSOUT at the amd64 tree (generated headers such as
# autoconf.h and asm/).
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"

echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="

# Install the headers (plus the build toolchain) when either tree is absent.
headers_present=yes
for hdr_dir in "$KDIR_ARCH" "$KDIR_COMMON"; do
  [ -d "$hdr_dir" ] || headers_present=no
done
if [ "$headers_present" = no ]; then
  echo "=== installing linux-headers-${KVER} ==="
  DEBIAN_FRONTEND=noninteractive apt-get install -y \
    "linux-headers-${KVER}" \
    gcc make perl
fi

echo "kernel headers (arch): $KDIR_ARCH"
echo "kernel headers (common): $KDIR_COMMON"
# Later steps change the working directory (checksum verification and the
# module build), so a relative DIST_DIR would make the output land somewhere
# other than where this cache check looked. Anchor it to the directory the
# script was started from.
case "$DIST_DIR" in
  /*) ;;
  *) DIST_DIR="$(pwd)/${DIST_DIR}" ;;
esac
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"

# The cache only counts as complete when every artifact a successful run
# produces is present: at least one .ko, nvidia-smi, and both userspace
# libraries. A bare modules/ directory is not enough, because an aborted run
# can leave a partially populated cache behind.
cache_complete=yes
[ -f "$CACHE_DIR/bin/nvidia-smi" ] || cache_complete=no

cached_ko_count=0
for cached_ko in "$CACHE_DIR/modules/"*.ko; do
  [ -e "$cached_ko" ] || continue   # unmatched glob stays literal
  cached_ko_count=$((cached_ko_count + 1))
done
[ "$cached_ko_count" -gt 0 ] || cache_complete=no

for cached_lib in libnvidia-ml libcuda; do
  cached_lib_found=no
  for cached_file in "$CACHE_DIR/lib/${cached_lib}.so."*; do
    [ -e "$cached_file" ] || continue
    cached_lib_found=yes
  done
  [ "$cached_lib_found" = yes ] || cache_complete=no
done

if [ "$cache_complete" = yes ]; then
  echo "=== NVIDIA cached, skipping build ==="
  echo "cache: $CACHE_DIR"
  echo "modules: $cached_ko_count .ko files"
  exit 0
fi
# Where the official NVIDIA .run installer and its sha256 checksum file are
# fetched from, and where both are stored locally.
installer_name="NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
RUN_FILE="/var/tmp/${installer_name}"
SHA_FILE="${RUN_FILE}.sha256sum"
# verify_run — check the downloaded installer against its checksum file.
# Globals:  RUN_FILE (read), SHA_FILE (read)
# Returns:  0 when both files are non-empty and sha256sum accepts the
#           checksum file; non-zero otherwise.
# The check runs in a subshell so the caller's working directory is left
# untouched, and a failed cd counts as a verification failure.
# Presumably the checksum file names the installer by bare filename, which is
# why sha256sum is run from the installer's own directory — TODO confirm.
verify_run() {
  [ -s "$SHA_FILE" ] || return 1
  [ -s "$RUN_FILE" ] || return 1
  (
    cd "$(dirname "$RUN_FILE")" || exit 1
    sha256sum --status -c "$SHA_FILE" 2>/dev/null
  )
}
# Reuse a previously downloaded installer when it still matches its checksum;
# otherwise discard both files, download them again and verify the result.
if verify_run; then
  echo "=== NVIDIA installer verified from cache ==="
else
  rm -f "$RUN_FILE" "$SHA_FILE"
  echo "=== downloading NVIDIA ${NVIDIA_VERSION} installer ==="
  wget -q -O "$SHA_FILE" "${BASE_URL}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run.sha256sum"
  echo "sha256: $(cat "$SHA_FILE")"
  wget --show-progress -O "$RUN_FILE" "${BASE_URL}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
  echo "=== verifying sha256 ==="
  # sha256sum is run from /var/tmp, where the installer was saved.
  if ! { cd /var/tmp && sha256sum -c "$SHA_FILE"; }; then
    echo "ERROR: sha256 mismatch"
    rm -f "$RUN_FILE"
    exit 1
  fi
  echo "sha256 OK"
fi
# Unpack the installer into a fresh directory (any earlier extraction is removed).
echo "=== extracting installer ==="
EXTRACT_DIR="/var/tmp/nvidia-extract-${NVIDIA_VERSION}"
chmod +x "$RUN_FILE"
rm -rf "$EXTRACT_DIR"
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"

# Locate the kernel module sources: the first candidate directory that
# contains a Makefile wins.
KERNEL_SRC=""
for candidate in \
  "$EXTRACT_DIR/kernel" \
  "$EXTRACT_DIR/kernel-modules-sources" \
  "$EXTRACT_DIR/kernel-source"; do
  if [ -f "$candidate/Makefile" ]; then
    KERNEL_SRC="$candidate"
    break
  fi
done

# Without a source directory nothing can be built; list what was extracted
# to help diagnose, then give up.
if [ -z "$KERNEL_SRC" ]; then
  echo "ERROR: kernel source dir not found in:"
  ls "$EXTRACT_DIR/"
  exit 1
fi
echo "kernel source: $KERNEL_SRC"
# Build kernel modules.
# NOTE(review): an earlier comment here described passing CFLAGS_MODULE with
# GCC's include dir (so nv_stdarg.h can find stdarg.h under -nostdinc), but no
# such flag is passed to make below — confirm whether it is still needed.
echo "=== building kernel modules ($(nproc) cores) ==="
cd "$KERNEL_SRC"
# SYSSRC=common: conftest finds real kernel headers (linux/mm.h etc.)
# SYSOUT=amd64: generated headers (autoconf.h, asm/) from arch package
# Without this split, conftest uses amd64/include/ which is nearly empty,
# all compile-tests fail silently, and NVIDIA assumes all APIs present → link errors.
#
# The output goes to a log file instead of being piped straight into tail:
# in a pipeline the exit status is tail's, so a failed make would slip past
# `set -e` and the script would carry on with no modules built.
build_log="$KERNEL_SRC/nvidia-module-build.log"
build_status=0
make -j"$(nproc)" \
  KERNEL_UNAME="$KVER" \
  SYSSRC="$KDIR_COMMON" \
  SYSOUT="$KDIR_ARCH" \
  modules >"$build_log" 2>&1 || build_status=$?
# Same console summary as before: the last 10 lines of make output.
tail -10 "$build_log"
if [ "$build_status" -ne 0 ]; then
  echo "ERROR: kernel module build failed (make exit status ${build_status}); full log: $build_log"
  exit 1
fi
# Gather build results into the cache directory.
for out_sub in modules bin lib; do
  mkdir -p "$CACHE_DIR/$out_sub"
done

# Every .ko anywhere under the kernel source tree goes into modules/.
find "$KERNEL_SRC" -name '*.ko' -exec cp {} "$CACHE_DIR/modules/" \;

# Drop debug sections from the copied modules; a strip failure is tolerated.
for module_file in "$CACHE_DIR/modules/"*.ko; do
  strip --strip-debug "$module_file" 2>/dev/null || :
done

# nvidia-smi is required; nvidia-bug-report.sh is copied only if available.
cp "$EXTRACT_DIR/nvidia-smi" "$CACHE_DIR/bin/"
cp "$EXTRACT_DIR/nvidia-bug-report.sh" "$CACHE_DIR/bin/" 2>/dev/null || :
# GSP firmware (required for Hopper/Ada GPUs — H100, H800, etc.).
# The target directory is always created; it is populated only when the
# installer actually ships a firmware/ directory.
mkdir -p "$CACHE_DIR/firmware"
if [ ! -d "$EXTRACT_DIR/firmware" ]; then
  echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
else
  cp -r "$EXTRACT_DIR/firmware/." "$CACHE_DIR/firmware/"
  echo "firmware: $(ls "$CACHE_DIR/firmware/" | wc -l) files"
fi
# Copy every versioned userspace library file (<lib>.so.*) found at the top
# level of the extracted installer into the cache. Each library must yield at
# least one copied file, otherwise the script aborts.
# A quoted glob is used rather than `for f in $(find ...)`, whose output is
# word-split and glob-expanded and so breaks on paths containing whitespace
# or pattern characters.
for lib in libnvidia-ml libcuda; do
  count=0
  for f in "$EXTRACT_DIR/${lib}.so."*; do
    [ -e "$f" ] || continue   # unmatched glob stays literal
    # A failed copy is not counted and does not abort the loop.
    if cp "$f" "$CACHE_DIR/lib/"; then
      count=$((count+1))
    fi
  done
  if [ "$count" -eq 0 ]; then
    echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
    # Best-effort hint: show which shared objects the installer does contain.
    ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
    exit 1
  fi
done
# Sanity check: the cache must contain at least one kernel module.
# ko_count is also reported in the final summary.
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
if [ "$ko_count" -le 0 ]; then
  echo "ERROR: no .ko files built in $CACHE_DIR/modules/"
  exit 1
fi
# Soname symlinks: <lib>.so.1 -> fully versioned file, <lib>.so -> <lib>.so.1.
# The [0-9][0-9]* pattern requires at least two digits after ".so.", so the
# single-digit .so.1 link itself is never picked as the target (which would
# make the link circular).
for lib in libnvidia-ml libcuda; do
  versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
  if [ -z "$versioned" ]; then
    # Nothing to link against for this library.
    continue
  fi
  base=${versioned##*/}
  ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
  # The unversioned link is best-effort.
  ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || :
  echo "${lib}: .so.1 -> $base"
done
# Final report: cache location, module count, and a listing of the
# collected tools and libraries.
printf '%s\n' "=== NVIDIA build complete ==="
echo "cache: $CACHE_DIR"
echo "modules: $ko_count .ko files"
ls -lh \
  "$CACHE_DIR/bin/" \
  "$CACHE_DIR/lib/"