fix: fail loudly on missing NVIDIA libs and .ko, improve mknod logging
build-nvidia-module.sh:
- Replace silent glob cp for libnvidia-ml/libcuda with find + explicit error if the library is not found in the extract dir (catches installer layout changes)
- Fix circular symlink bug: don't create .so.1 -> .so.1 if the versioned file is already named .so.1
- Verify .ko count > 0 after build; fail loudly if none were produced
- Show lib cache in final summary

bee-nvidia:
- mknod failures are now logged with ewarn instead of silently suppressed
- If nvidia is not in /proc/devices (no GPU hardware), log clearly and exit clean

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -109,22 +109,38 @@ for ko in "$CACHE_DIR/modules/"*.ko; do
|
|||||||
strip --strip-debug "$ko" 2>/dev/null || true
|
strip --strip-debug "$ko" 2>/dev/null || true
|
||||||
done
|
done
|
||||||
|
|
||||||
cp "$EXTRACT_DIR/nvidia-smi" "$CACHE_DIR/bin/"
|
cp "$EXTRACT_DIR/nvidia-smi" "$CACHE_DIR/bin/"
|
||||||
cp "$EXTRACT_DIR/nvidia-bug-report.sh" "$CACHE_DIR/bin/" 2>/dev/null || true
|
cp "$EXTRACT_DIR/nvidia-bug-report.sh" "$CACHE_DIR/bin/" 2>/dev/null || true
|
||||||
cp "$EXTRACT_DIR/libnvidia-ml.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
|
|
||||||
# libcuda stub needed by nvidia-smi at runtime
|
|
||||||
cp "$EXTRACT_DIR/libcuda.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
|
|
||||||
|
|
||||||
# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat)
|
# Copy userspace libraries — use find to handle any versioning scheme (libnvidia-ml.so.X.Y.Z or .so.1)
|
||||||
for lib in libnvidia-ml libcuda; do
|
for lib in libnvidia-ml libcuda; do
|
||||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9]* 2>/dev/null | head -1)
|
found=$(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" | head -1)
|
||||||
|
if [ -z "$found" ]; then
|
||||||
|
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
||||||
|
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
cp "$found" "$CACHE_DIR/lib/"
|
||||||
|
done
|
||||||
|
|
||||||
|
# Verify .ko files were actually built
|
||||||
|
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||||
|
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||||
|
|
||||||
|
# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat + libc6-compat)
|
||||||
|
for lib in libnvidia-ml libcuda; do
|
||||||
|
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."* 2>/dev/null | grep -v '\.so\.1$' | head -1)
|
||||||
|
[ -n "$versioned" ] || versioned=$(ls "$CACHE_DIR/lib/${lib}.so."* 2>/dev/null | head -1)
|
||||||
[ -n "$versioned" ] || continue
|
[ -n "$versioned" ] || continue
|
||||||
base=$(basename "$versioned")
|
base=$(basename "$versioned")
|
||||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1" 2>/dev/null || true
|
# Only create .so.1 if versioned file is not already named .so.1
|
||||||
|
if [ "$base" != "${lib}.so.1" ]; then
|
||||||
|
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
||||||
|
fi
|
||||||
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "=== NVIDIA build complete ==="
|
echo "=== NVIDIA build complete ==="
|
||||||
echo "cache: $CACHE_DIR"
|
echo "cache: $CACHE_DIR"
|
||||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko | wc -l) .ko files"
|
echo "modules: $ko_count .ko files"
|
||||||
ls -lh "$CACHE_DIR/bin/"
|
ls -lh "$CACHE_DIR/bin/" "$CACHE_DIR/lib/"
|
||||||
|
|||||||
@@ -52,20 +52,27 @@ start() {
|
|||||||
# Without /dev/nvidiactl nvidia-smi returns NVML_ERROR_LIBRARY_NOT_FOUND (exit 12).
|
# Without /dev/nvidiactl nvidia-smi returns NVML_ERROR_LIBRARY_NOT_FOUND (exit 12).
|
||||||
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices 2>/dev/null | awk '{print $1}')
|
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices 2>/dev/null | awk '{print $1}')
|
||||||
if [ -n "$nvidia_major" ]; then
|
if [ -n "$nvidia_major" ]; then
|
||||||
mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 2>/dev/null || true
|
mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 2>/dev/null \
|
||||||
|
&& einfo "created /dev/nvidiactl (major $nvidia_major)" \
|
||||||
|
|| ewarn "/dev/nvidiactl already exists or mknod failed"
|
||||||
for i in 0 1 2 3 4 5 6 7; do
|
for i in 0 1 2 3 4 5 6 7; do
|
||||||
mknod -m 666 "/dev/nvidia$i" c "$nvidia_major" "$i" 2>/dev/null || true
|
mknod -m 666 "/dev/nvidia$i" c "$nvidia_major" "$i" 2>/dev/null || true
|
||||||
done
|
done
|
||||||
einfo "created /dev/nvidiactl and /dev/nvidia{0-7} (major $nvidia_major)"
|
einfo "created /dev/nvidia{0-7}"
|
||||||
else
|
else
|
||||||
ewarn "/dev/nvidiactl: nvidia not in /proc/devices — no GPU hardware?"
|
ewarn "/dev/nvidiactl: nvidia not in /proc/devices — no GPU hardware present?"
|
||||||
|
eend 0
|
||||||
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices 2>/dev/null | awk '{print $1}')
|
uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices 2>/dev/null | awk '{print $1}')
|
||||||
if [ -n "$uvm_major" ]; then
|
if [ -n "$uvm_major" ]; then
|
||||||
mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 2>/dev/null || true
|
mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 2>/dev/null \
|
||||||
|
&& einfo "created /dev/nvidia-uvm (major $uvm_major)" \
|
||||||
|
|| ewarn "/dev/nvidia-uvm already exists or mknod failed"
|
||||||
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 2>/dev/null || true
|
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 2>/dev/null || true
|
||||||
einfo "created /dev/nvidia-uvm (major $uvm_major)"
|
else
|
||||||
|
ewarn "/dev/nvidia-uvm: nvidia-uvm not in /proc/devices"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
eend 0
|
eend 0
|
||||||
|
|||||||
Reference in New Issue
Block a user