diff --git a/iso/builder/build-nvidia-module.sh b/iso/builder/build-nvidia-module.sh
index 318aacc..bfc311e 100644
--- a/iso/builder/build-nvidia-module.sh
+++ b/iso/builder/build-nvidia-module.sh
@@ -46,7 +46,10 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
 EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
+CACHE_LAYOUT_VERSION="2"
+CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
 if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
+    && [ -f "$CACHE_LAYOUT_MARKER" ] \
     && [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
     echo "=== NVIDIA cached, skipping build ==="
     echo "cache: $CACHE_DIR"
@@ -130,24 +133,30 @@ else
     echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
 fi
 
-# Copy ALL userspace library files.
-# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
-# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
+# Copy every NVIDIA userspace library rather than a short allowlist of names.
+# Newer driver branches pull in additional runtime dependencies (for example
+# OpenCL/compiler helper libraries). With a narrow allowlist, clinfo/John can
+# find nvidia.icd yet still report "no OpenCL platforms" when one .so is missing.
+copied_libs=0
+for f in "$EXTRACT_DIR"/libnvidia*.so.* "$EXTRACT_DIR"/libcuda.so.*; do  # quoted glob: safe if the path has spaces
+    if [ -f "$f" ] && [ ! -L "$f" ]; then cp "$f" "$CACHE_DIR/lib/"; else continue; fi  # regular files only
+    copied_libs=$((copied_libs+1))
+done
+
+if [ "$copied_libs" -eq 0 ]; then
+    echo "ERROR: no NVIDIA userspace libraries found in $EXTRACT_DIR"
+    ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -40 || true
+    exit 1
+fi
+
 for lib in \
     libnvidia-ml \
     libcuda \
     libnvidia-ptxjitcompiler \
-    libnvidia-opencl \
-    libnvidia-compiler \
-    libnvidia-nvvm \
-    libnvidia-fatbinaryloader; do
-    count=0
-    for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
-        cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
-    done
-    if [ "$count" -eq 0 ]; then
-        echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
-        ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
+    libnvidia-opencl; do
+    if ! ls "$CACHE_DIR/lib/${lib}.so."* >/dev/null 2>&1; then  # hard requirement: at least one versioned file
+        echo "ERROR: required ${lib}.so.* not found in extracted userspace libs"
+        ls "$CACHE_DIR/lib/" >&2 || true  # show what was copied, to aid debugging
         exit 1
     fi
 done
@@ -156,23 +165,17 @@ done
 ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
 [ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
 
-# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
-for lib in \
-    libnvidia-ml \
-    libcuda \
-    libnvidia-ptxjitcompiler \
-    libnvidia-opencl \
-    libnvidia-compiler \
-    libnvidia-nvvm \
-    libnvidia-fatbinaryloader; do
-    versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
-    [ -n "$versioned" ] || continue
+# Create soname symlinks for every copied versioned library.
+for versioned in "$CACHE_DIR"/lib/*.so.*; do
+    if [ -L "$versioned" ] || [ ! -f "$versioned" ]; then continue; fi  # skip symlinks and unmatched glob
     base=$(basename "$versioned")
-    ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
-    ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
-    echo "${lib}: .so.1 -> $base"
+    stem=${base%%.so.*}
+    [ "$base" = "${stem}.so.1" ] || ln -sf "$base" "$CACHE_DIR/lib/${stem}.so.1"  # never link a file onto itself
+    ln -sf "${stem}.so.1" "$CACHE_DIR/lib/${stem}.so" 2>/dev/null || true
 done
 
+touch "$CACHE_LAYOUT_MARKER"
+
 echo "=== NVIDIA build complete ==="
 echo "cache: $CACHE_DIR"
 echo "modules: $ko_count .ko files"
diff --git a/iso/builder/smoketest.sh b/iso/builder/smoketest.sh
index 9f0d78c..b458523 100644
--- a/iso/builder/smoketest.sh
+++ b/iso/builder/smoketest.sh
@@ -109,6 +109,40 @@ else
     fail "nvidia-smi: not found in PATH"
 fi
 
+echo ""
+echo "-- OpenCL / John --"
+if [ -f /etc/OpenCL/vendors/nvidia.icd ]; then
+    ok "OpenCL ICD present: /etc/OpenCL/vendors/nvidia.icd"
+else
+    fail "OpenCL ICD missing: /etc/OpenCL/vendors/nvidia.icd"
+fi
+
+if ldconfig -p 2>/dev/null | grep -qF "libnvidia-opencl.so.1"; then
+    ok "libnvidia-opencl.so.1 present in linker cache"
+else
+    fail "libnvidia-opencl.so.1 missing from linker cache"
+fi
+
+if command -v clinfo >/dev/null 2>&1; then
+    if clinfo -l 2>/dev/null | grep -q "Platform"; then
+        ok "clinfo: OpenCL platform detected"
+    else
+        fail "clinfo: no OpenCL platform detected"
+    fi
+else
+    fail "clinfo: not found in PATH"
+fi
+
+if command -v john >/dev/null 2>&1; then
+    if john --list=opencl-devices 2>/dev/null | grep -q "Device #"; then
+        ok "john: OpenCL devices detected"
+    else
+        fail "john: no OpenCL devices detected"
+    fi
+else
+    fail "john: not found in PATH"
+fi
+
 echo ""
 echo "-- lib symlinks --"
 for lib in libnvidia-ml libcuda; do