The static KERNEL_PKG_VERSION pin was the root cause of nvidia-smi never working: the modules were compiled for the pinned version (e.g. 6.12.76-r0), but the ISO kernel was unpinned (the latest from the repo at build time). When Alpine updated linux-lts, the two diverged silently. Fix: both steps now use whatever linux-lts is current in Alpine 3.21 main at build time. build-nvidia-module.sh uses `apk add --update linux-lts-dev` (no version pin), and mkimage gets the same package from the same mirror. The module cache is still keyed by the detected KVER, so rebuilds remain fast. Removed: KERNEL_VERSION and KERNEL_PKG_VERSION from VERSIONS, and all pin references from build.sh and build-nvidia-module.sh. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
131 lines
5.2 KiB
Bash
131 lines
5.2 KiB
Bash
#!/bin/sh
# build-nvidia-module.sh — install NVIDIA proprietary driver into ISO overlay
#
# Usage: build-nvidia-module.sh <nvidia-version> <dist-dir> <alpine-version>
#
# Downloads the official NVIDIA .run installer, extracts kernel modules and
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
#
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
#
# Output layout:
#   $CACHE_DIR/modules/ — nvidia*.ko files
#   $CACHE_DIR/bin/     — nvidia-smi, nvidia-bug-report.sh
#   $CACHE_DIR/lib/     — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)

set -e

# Print the usage line on stderr and abort with status 1.
usage() {
  echo "usage: $0 <nvidia-version> <dist-dir> <alpine-version>" >&2
  exit 1
}

# Positional arguments. ${N:-} makes a missing argument an empty string so the
# checks below report it through usage().
NVIDIA_VERSION="${1:-}"
DIST_DIR="${2:-}"
ALPINE_VERSION="${3:-}"

# All three arguments are mandatory.
[ -n "$NVIDIA_VERSION" ] || usage
[ -n "$DIST_DIR" ] || usage
[ -n "$ALPINE_VERSION" ] || usage

# Install linux-lts-dev without a version pin: always take whatever is current
# in the main repository of the Alpine release given by ALPINE_VERSION.
# This ensures modules are compiled for the same kernel that mkimage will
# install in the ISO. Both use dl-cdn.alpinelinux.org, so they see the same
# package state at build time.
echo "=== installing linux-lts-dev (latest from dl-cdn) ==="
apk add --quiet --update \
  --repository "https://dl-cdn.alpinelinux.org/alpine/v${ALPINE_VERSION}/main" \
  linux-lts-dev

# detect_kver — print the kernel version of the installed kernel headers.
#
# Looks for directories named linux-headers-<version> and prints the highest
# <version> (version-sorted) on stdout. Prints nothing when none exist.
#
# Arguments: $1 - (optional) directory to scan; defaults to /usr/src
# Outputs:   the version string, e.g. the part after "linux-headers-"
# Returns:   0 even when nothing is found (callers check for empty output)
detect_kver() {
  # Glob instead of parsing `ls`. The loop runs inside the pipeline's
  # subshell, so the loop variable does not leak into the caller.
  for hdr_dir in "${1:-/usr/src}"/linux-headers-*; do
    # An unmatched glob stays literal; -d filters it out along with any
    # stray non-directory entries.
    [ -d "$hdr_dir" ] || continue
    printf '%s\n' "${hdr_dir##*/linux-headers-}"
  done \
    | sort -V \
    | tail -n 1
}

# Make DIST_DIR absolute while the working directory is still the caller's.
# Later steps change directory, which would otherwise make a relative cache
# path point somewhere else when the outputs are written.
case "$DIST_DIR" in
  /*) ;;
  *) DIST_DIR="$PWD/$DIST_DIR" ;;
esac

KVER="$(detect_kver)"
# Without headers there is nothing to build against; fail now rather than
# continuing with KDIR=/usr/src/linux-headers- .
if [ -z "$KVER" ]; then
  echo "ERROR: no linux-headers-* tree found in /usr/src" >&2
  exit 1
fi
KDIR="/usr/src/linux-headers-${KVER}"
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="

# The cache is keyed by driver version AND kernel version.
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"

# Count cached modules with a glob (no `ls` parsing). A cache directory that
# holds no .ko files is treated as a miss so a previously failed build is
# retried instead of being reused.
cached_ko=0
for ko in "$CACHE_DIR/modules/"*.ko; do
  if [ -f "$ko" ]; then
    cached_ko=$((cached_ko + 1))
  fi
done

if [ "$cached_ko" -gt 0 ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then
  echo "=== NVIDIA cached, skipping build ==="
  echo "cache: $CACHE_DIR"
  echo "modules: ${cached_ko} .ko files"
  exit 0
fi

# Install build dependencies (linux-lts-dev already at correct version from above).
# wget is installed here because the download step below uses its
# --show-progress option.
apk add --quiet \
  --repository "https://dl-cdn.alpinelinux.org/alpine/v${ALPINE_VERSION}/main" \
  gcc make perl linux-lts-dev wget

# Download official NVIDIA .run installer (proprietary) with sha256 verification.
# The installer and its checksum file are kept in /var/tmp so they can be
# reused by later builds.
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
RUN_FILE="/var/tmp/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
SHA_FILE="/var/tmp/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run.sha256sum"

# verify_run — check the downloaded installer against its checksum file.
#
# Globals: RUN_FILE (read) - absolute path of the installer
#          SHA_FILE (read) - absolute path of the sha256sum file
# Returns: 0 when both files are non-empty and the checksum matches,
#          non-zero otherwise. Prints nothing.
verify_run() {
  [ -s "$SHA_FILE" ] || return 1
  [ -s "$RUN_FILE" ] || return 1
  # The check runs from the installer's directory (presumably the checksum
  # file names the installer by bare file name — confirm against NVIDIA's
  # file). The subshell keeps the caller's working directory untouched.
  (
    cd "$(dirname "$RUN_FILE")" || exit 1
    # Silence via redirection rather than the --status long option, which
    # not every sha256sum implementation accepts; the exit status is the
    # only result that matters here.
    sha256sum -c "$SHA_FILE" >/dev/null 2>&1
  )
}

# Fetch the installer unless a copy that passes verify_run is already present.
if ! verify_run; then
  # Drop whatever partial or stale files are left before downloading again.
  rm -f "$RUN_FILE" "$SHA_FILE"
  echo "=== downloading NVIDIA ${NVIDIA_VERSION} installer ==="
  wget -q -O "$SHA_FILE" "${BASE_URL}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run.sha256sum"
  echo "sha256: $(cat "$SHA_FILE")"
  wget --show-progress -O "$RUN_FILE" "${BASE_URL}/NVIDIA-Linux-x86_64-${NVIDIA_VERSION}.run"
  echo "=== verifying sha256 ==="
  # Run the check in a subshell so the script's working directory is not
  # changed as a side effect.
  if ! ( cd /var/tmp && sha256sum -c "$SHA_FILE" ); then
    echo "ERROR: sha256 mismatch" >&2
    # Remove the bad download so the next run fetches it again.
    rm -f "$RUN_FILE"
    exit 1
  fi
  echo "sha256 OK"
else
  echo "=== NVIDIA installer verified from cache ==="
fi

# Extract installer contents into a fresh directory (any previous extraction
# for this driver version is removed first).
echo "=== extracting installer ==="
chmod +x "$RUN_FILE"
EXTRACT_DIR="/var/tmp/nvidia-extract-${NVIDIA_VERSION}"
rm -rf "$EXTRACT_DIR"
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"

# Find the kernel module source directory: the first candidate that contains
# a Makefile wins. Only the directories listed here are probed; kernel-open/
# is not among them.
KERNEL_SRC=""
for candidate in \
  "$EXTRACT_DIR/kernel" \
  "$EXTRACT_DIR/kernel-modules-sources" \
  "$EXTRACT_DIR/kernel-source"; do
  if [ -f "$candidate/Makefile" ]; then
    KERNEL_SRC="$candidate"
    break
  fi
done
if [ -z "$KERNEL_SRC" ]; then
  # List what was extracted to make the failure diagnosable.
  echo "ERROR: kernel source dir not found in:" >&2
  ls "$EXTRACT_DIR/" >&2
  exit 1
fi
echo "kernel source: $KERNEL_SRC"

# Build kernel modules from extracted source.
echo "=== building kernel modules ($(nproc) cores) ==="
# The build output goes to a log file instead of being piped into `tail`:
# with `make | tail` the pipeline's exit status is tail's, so `set -e` would
# never see a failed build.
BUILD_LOG="/var/tmp/nvidia-build-${NVIDIA_VERSION}-${KVER}.log"
# The subshell confines the directory change to the build itself.
if ( cd "$KERNEL_SRC" && make -j"$(nproc)" KERNEL_UNAME="$KVER" SYSSRC="$KDIR" modules ) >"$BUILD_LOG" 2>&1; then
  # Same visible output as before on success: the last five lines.
  tail -n 5 "$BUILD_LOG"
else
  echo "ERROR: kernel module build failed (full log: $BUILD_LOG)" >&2
  tail -n 50 "$BUILD_LOG" >&2
  exit 1
fi

# Collect outputs into the cache directory.
mkdir -p "$CACHE_DIR/modules" "$CACHE_DIR/bin" "$CACHE_DIR/lib"

find "$KERNEL_SRC" -name '*.ko' -exec cp {} "$CACHE_DIR/modules/" \;

# Strip debug info from every collected module and count them on the way.
ko_count=0
for ko in "$CACHE_DIR/modules/"*.ko; do
  # An unmatched glob stays literal; skip it.
  [ -f "$ko" ] || continue
  ko_count=$((ko_count + 1))
  # Best effort: an unstripped module is still usable.
  strip --strip-debug "$ko" 2>/dev/null || true
done

# Stop before nvidia-smi is copied: the cache check treats the presence of
# bin/nvidia-smi as "build finished", so a module-less cache must never get it.
if [ "$ko_count" -eq 0 ]; then
  echo "ERROR: no .ko files were produced in $KERNEL_SRC" >&2
  exit 1
fi

cp "$EXTRACT_DIR/nvidia-smi" "$CACHE_DIR/bin/"
# The remaining files are optional; missing ones are ignored on purpose.
cp "$EXTRACT_DIR/nvidia-bug-report.sh" "$CACHE_DIR/bin/" 2>/dev/null || true
cp "$EXTRACT_DIR/libnvidia-ml.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
# libcuda stub needed by nvidia-smi at runtime
cp "$EXTRACT_DIR/libcuda.so."* "$CACHE_DIR/lib/" 2>/dev/null || true

# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat)
for lib in libnvidia-ml libcuda; do
  # Pick the first real versioned library file. Symlinks are skipped so that
  # a <lib>.so.1 link left by an earlier run is never chosen as the link
  # target (which would turn it into a link to itself).
  versioned=""
  for candidate in "$CACHE_DIR/lib/${lib}.so."[0-9]*; do
    if [ ! -f "$candidate" ] || [ -L "$candidate" ]; then
      continue
    fi
    versioned="$candidate"
    break
  done
  [ -n "$versioned" ] || continue
  base=${versioned##*/}
  # Never replace a real <lib>.so.1 file with a link to itself.
  if [ "$base" != "${lib}.so.1" ]; then
    ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1" 2>/dev/null || true
  fi
  ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
done

echo "=== NVIDIA build complete ==="
echo "cache: $CACHE_DIR"
echo "modules: ${ko_count} .ko files"
ls -lh "$CACHE_DIR/bin/"