fix: fail loudly on missing NVIDIA libs and .ko, improve mknod logging
build-nvidia-module.sh:
- Replace silent glob cp for libnvidia-ml/libcuda with find + explicit error
  if the library is not found in the extract dir (catches installer layout changes)
- Fix circular symlink bug: don't create .so.1 -> .so.1 if the versioned file
  is already named .so.1
- Verify .ko count > 0 after build, fail loudly if none produced
- Show lib cache in final summary

bee-nvidia:
- mknod failures are now logged with ewarn instead of silently suppressed
- If nvidia is not in /proc/devices (no GPU hardware), log clearly and exit clean

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -111,20 +111,36 @@ done
|
||||
|
||||
cp "$EXTRACT_DIR/nvidia-smi" "$CACHE_DIR/bin/"
|
||||
cp "$EXTRACT_DIR/nvidia-bug-report.sh" "$CACHE_DIR/bin/" 2>/dev/null || true
|
||||
cp "$EXTRACT_DIR/libnvidia-ml.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
|
||||
# libcuda stub needed by nvidia-smi at runtime
|
||||
cp "$EXTRACT_DIR/libcuda.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
|
||||
|
||||
# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat)
|
||||
# Copy userspace libraries — use find to handle any versioning scheme (libnvidia-ml.so.X.Y.Z or .so.1)
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9]* 2>/dev/null | head -1)
|
||||
found=$(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" | head -1)
|
||||
if [ -z "$found" ]; then
|
||||
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
||||
exit 1
|
||||
fi
|
||||
cp "$found" "$CACHE_DIR/lib/"
|
||||
done
|
||||
|
||||
# Verify .ko files were actually built
|
||||
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat + libc6-compat)
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."* 2>/dev/null | grep -v '\.so\.1$' | head -1)
|
||||
[ -n "$versioned" ] || versioned=$(ls "$CACHE_DIR/lib/${lib}.so."* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1" 2>/dev/null || true
|
||||
# Only create .so.1 if versioned file is not already named .so.1
|
||||
if [ "$base" != "${lib}.so.1" ]; then
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
||||
fi
|
||||
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
||||
done
|
||||
|
||||
echo "=== NVIDIA build complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko | wc -l) .ko files"
|
||||
ls -lh "$CACHE_DIR/bin/"
|
||||
echo "modules: $ko_count .ko files"
|
||||
ls -lh "$CACHE_DIR/bin/" "$CACHE_DIR/lib/"
|
||||
|
||||
@@ -52,20 +52,27 @@ start() {
|
||||
# Without /dev/nvidiactl nvidia-smi returns NVML_ERROR_LIBRARY_NOT_FOUND (exit 12).
|
||||
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices 2>/dev/null | awk '{print $1}')
|
||||
if [ -n "$nvidia_major" ]; then
|
||||
mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 2>/dev/null || true
|
||||
mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 2>/dev/null \
|
||||
&& einfo "created /dev/nvidiactl (major $nvidia_major)" \
|
||||
|| ewarn "/dev/nvidiactl already exists or mknod failed"
|
||||
for i in 0 1 2 3 4 5 6 7; do
|
||||
mknod -m 666 "/dev/nvidia$i" c "$nvidia_major" "$i" 2>/dev/null || true
|
||||
done
|
||||
einfo "created /dev/nvidiactl and /dev/nvidia{0-7} (major $nvidia_major)"
|
||||
einfo "created /dev/nvidia{0-7}"
|
||||
else
|
||||
ewarn "/dev/nvidiactl: nvidia not in /proc/devices — no GPU hardware?"
|
||||
ewarn "/dev/nvidiactl: nvidia not in /proc/devices — no GPU hardware present?"
|
||||
eend 0
|
||||
return 0
|
||||
fi
|
||||
|
||||
uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices 2>/dev/null | awk '{print $1}')
|
||||
if [ -n "$uvm_major" ]; then
|
||||
mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 2>/dev/null || true
|
||||
mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 2>/dev/null \
|
||||
&& einfo "created /dev/nvidia-uvm (major $uvm_major)" \
|
||||
|| ewarn "/dev/nvidia-uvm already exists or mknod failed"
|
||||
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 2>/dev/null || true
|
||||
einfo "created /dev/nvidia-uvm (major $uvm_major)"
|
||||
else
|
||||
ewarn "/dev/nvidia-uvm: nvidia-uvm not in /proc/devices"
|
||||
fi
|
||||
|
||||
eend 0
|
||||
|
||||
Reference in New Issue
Block a user