# Patch summary (leading text before the first "diff --git" header; two files touched).
#
# iso/builder/build-nvidia-module.sh
#   - Cache-hit test: in addition to the existing checks for $CACHE_DIR/modules
#     and $CACHE_DIR/bin/nvidia-smi, the "NVIDIA cached, skipping build" branch
#     is now taken only if at least one
#     $CACHE_DIR/lib/libnvidia-ptxjitcompiler.so.* file is listed.
#   - libnvidia-ptxjitcompiler is appended to the list of libraries copied from
#     $EXTRACT_DIR into $CACHE_DIR/lib/.
#   - libnvidia-ptxjitcompiler is appended to the list of libraries that get
#     soname symlinks.
#
# iso/overlay/usr/local/bin/bee-nvidia-load
#   - After the `if [ -n "$uvm_major" ]` block, runs ldconfig and then logs
#     "ldconfig refreshed" before the final "done" log line.
#
# NOTE(review): ldconfig's stderr is discarded and its exit status is masked by
#   `|| true`, yet "ldconfig refreshed" is logged unconditionally, so the log
#   line does not show that the refresh succeeded. Confirm that best-effort
#   behavior is intended, or log only on success.
# NOTE(review): the new cache test counts matches with `ls ... | wc -l`, the
#   same idiom as the existing .ko count in this script.
# NOTE(review): the added comment ends "without it
#   CUDA_ERROR_JIT_COMPILER_NOT_FOUND" with no verb; presumably it means the
#   call fails with that error. Confirm and reword in the source script.
# NOTE(review): line breaks in this copy of the patch appear to be collapsed;
#   the original formatting must be restored before the patch can be applied.
diff --git a/iso/builder/build-nvidia-module.sh b/iso/builder/build-nvidia-module.sh index f1ab741..853ba43 100644 --- a/iso/builder/build-nvidia-module.sh +++ b/iso/builder/build-nvidia-module.sh @@ -46,7 +46,8 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}" CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}" DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads" EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract" -if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then +if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \ + && [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then echo "=== NVIDIA cached, skipping build ===" echo "cache: $CACHE_DIR" echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files" @@ -129,8 +130,10 @@ else echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)" fi -# Copy ALL userspace library files -for lib in libnvidia-ml libcuda; do +# Copy ALL userspace library files. +# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation +# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND. 
+for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do count=0 for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do cp "$f" "$CACHE_DIR/lib/" && count=$((count+1)) @@ -147,7 +150,7 @@ ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) [ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; } # Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit) -for lib in libnvidia-ml libcuda; do +for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1) [ -n "$versioned" ] || continue base=$(basename "$versioned") diff --git a/iso/overlay/usr/local/bin/bee-nvidia-load b/iso/overlay/usr/local/bin/bee-nvidia-load index 5b182c3..4af1448 100755 --- a/iso/overlay/usr/local/bin/bee-nvidia-load +++ b/iso/overlay/usr/local/bin/bee-nvidia-load @@ -100,4 +100,9 @@ if [ -n "$uvm_major" ]; then mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true fi +# Refresh dynamic linker cache so that NVIDIA/NCCL libs injected into /usr/lib/ +# are visible to dlopen() calls (libcuda, libnvidia-ptxjitcompiler, libnccl, etc.) +ldconfig 2>/dev/null || true +log "ldconfig refreshed" + log "done"