1 Commits

Author SHA1 Message Date
Mikhail Chusavitin
bd94b6c792 fix(iso): add libnvidia-ptxjitcompiler + ldconfig for PTX JIT and NCCL
- build-nvidia-module.sh: copy libnvidia-ptxjitcompiler.so.* alongside
  libcuda/libnvidia-ml — required by cuModuleLoadDataEx for PTX JIT.
  Without it: CUDA_ERROR_JIT_COMPILER_NOT_FOUND at runtime.
  Cache check updated to force rebuild when ptxjitcompiler is missing.
- bee-nvidia-load: run ldconfig after module load so that NVIDIA/NCCL
  libs injected into /usr/lib/ are visible to dlopen() callers.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 10:37:27 +03:00
2 changed files with 12 additions and 4 deletions

View File

@@ -46,7 +46,8 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}" CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads" DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract" EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
echo "=== NVIDIA cached, skipping build ===" echo "=== NVIDIA cached, skipping build ==="
echo "cache: $CACHE_DIR" echo "cache: $CACHE_DIR"
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files" echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
@@ -129,8 +130,10 @@ else
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)" echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
fi fi
# Copy ALL userspace library files # Copy ALL userspace library files.
for lib in libnvidia-ml libcuda; do # libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
count=0 count=0
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1)) cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
@@ -147,7 +150,7 @@ ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; } [ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit) # Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
for lib in libnvidia-ml libcuda; do for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1) versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
[ -n "$versioned" ] || continue [ -n "$versioned" ] || continue
base=$(basename "$versioned") base=$(basename "$versioned")

View File

@@ -100,4 +100,9 @@ if [ -n "$uvm_major" ]; then
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true
fi fi
# Refresh dynamic linker cache so that NVIDIA/NCCL libs injected into /usr/lib/
# are visible to dlopen() calls (libcuda, libnvidia-ptxjitcompiler, libnccl, etc.)
ldconfig 2>/dev/null || true
log "ldconfig refreshed"
log "done" log "done"