fix: fail loudly on missing NVIDIA libs and .ko, improve mknod logging

build-nvidia-module.sh:
- Replace silent glob cp for libnvidia-ml/libcuda with find + explicit error
  if library not found in extract dir (catches installer layout changes)
- Fix circular symlink bug: don't create .so.1 -> .so.1 if versioned file
  is already named .so.1
- Verify .ko count > 0 after build, fail loudly if none produced
- Show lib cache in final summary

bee-nvidia:
- mknod failures are now logged with ewarn instead of silently suppressed
- If nvidia is not in /proc/devices (no GPU hardware), log clearly and exit cleanly

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-08 17:07:47 +03:00
parent 11e001cafa
commit d952e10dbb
2 changed files with 37 additions and 14 deletions

View File

@@ -111,20 +111,36 @@ done
# Copy the NVIDIA userspace tools and libraries from the extracted installer
# ($EXTRACT_DIR) into the build cache ($CACHE_DIR/bin and $CACHE_DIR/lib).
# NOTE(review): this listing looks like a diff rendered without +/- markers,
# so superseded and replacement lines appear side by side — confirm against
# the real script before reading it as one runnable sequence.
cp "$EXTRACT_DIR/nvidia-smi" "$CACHE_DIR/bin/"
# Best effort: stderr is discarded and a failed copy is ignored.
cp "$EXTRACT_DIR/nvidia-bug-report.sh" "$CACHE_DIR/bin/" 2>/dev/null || true
# Glob copy; any failure (including an unmatched glob) is silenced by
# "2>/dev/null || true".
cp "$EXTRACT_DIR/libnvidia-ml.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
# libcuda stub needed by nvidia-smi at runtime
cp "$EXTRACT_DIR/libcuda.so."* "$CACHE_DIR/lib/" 2>/dev/null || true
# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat)
# Copy userspace libraries — use find to handle any versioning scheme (libnvidia-ml.so.X.Y.Z or .so.1)
for lib in libnvidia-ml libcuda; do
# First entry in the cache lib dir whose suffix after ".so." starts with a digit
# (empty when nothing matches, because ls errors are discarded).
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9]* 2>/dev/null | head -1)
# Search only the top level of the extract dir and keep a single match;
# "head -1" means at most one file per library is selected.
found=$(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" | head -1)
# No match: report it, list up to 20 *.so* entries from the extract dir as a
# diagnostic aid, and abort with exit status 1.
if [ -z "$found" ]; then
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
exit 1
fi
cp "$found" "$CACHE_DIR/lib/"
done
# Verify .ko files were actually built
# Count the *.ko entries in the cache modules dir. ls errors are discarded, so
# an unmatched glob yields a count of 0 rather than an error message.
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
# Abort with exit status 1 when the count is not greater than zero.
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
# Create soname symlinks required by nvidia-smi on Alpine (musl/glibc via gcompat + libc6-compat)
# For each library this produces <lib>.so.1 -> <versioned file> and
# <lib>.so -> <lib>.so.1 inside $CACHE_DIR/lib.
# NOTE(review): both an unconditional and a guarded "ln -sf ... .so.1" appear
# below; this looks like old and new diff lines shown together — confirm
# which one the real script contains.
for lib in libnvidia-ml libcuda; do
# Prefer a cached file whose name does not end in ".so.1".
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."* 2>/dev/null | grep -v '\.so\.1$' | head -1)
# Otherwise fall back to the first cached <lib>.so.* entry of any name.
[ -n "$versioned" ] || versioned=$(ls "$CACHE_DIR/lib/${lib}.so."* 2>/dev/null | head -1)
# Nothing cached for this library: move on to the next one.
[ -n "$versioned" ] || continue
# Use the bare file name so the link target is relative to the lib dir.
base=$(basename "$versioned")
# Unconditional link; errors are discarded and ignored.
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1" 2>/dev/null || true
# Only create .so.1 if versioned file is not already named .so.1
if [ "$base" != "${lib}.so.1" ]; then
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
fi
# Unversioned name points at the .so.1 name; failures are ignored.
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
done
# Final summary: print the cache location, the number of kernel modules, and
# a listing of the cached files.
# NOTE(review): the two "modules:" lines and the two "ls -lh" lines look like
# old/new pairs from a diff — confirm which of each the real script contains.
echo "=== NVIDIA build complete ==="
echo "cache: $CACHE_DIR"
# Recounts the *.ko files with a fresh ls instead of using $ko_count.
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko | wc -l) .ko files"
ls -lh "$CACHE_DIR/bin/"
# Reports the count stored in $ko_count.
echo "modules: $ko_count .ko files"
# Lists both the bin and lib cache directories.
ls -lh "$CACHE_DIR/bin/" "$CACHE_DIR/lib/"

View File

@@ -52,20 +52,27 @@ start() {
# Without /dev/nvidiactl nvidia-smi returns NVML_ERROR_LIBRARY_NOT_FOUND (exit 12).
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices 2>/dev/null | awk '{print $1}')
if [ -n "$nvidia_major" ]; then
mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 2>/dev/null || true
mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 2>/dev/null \
&& einfo "created /dev/nvidiactl (major $nvidia_major)" \
|| ewarn "/dev/nvidiactl already exists or mknod failed"
for i in 0 1 2 3 4 5 6 7; do
mknod -m 666 "/dev/nvidia$i" c "$nvidia_major" "$i" 2>/dev/null || true
done
einfo "created /dev/nvidiactl and /dev/nvidia{0-7} (major $nvidia_major)"
einfo "created /dev/nvidia{0-7}"
else
ewarn "/dev/nvidiactl: nvidia not in /proc/devices — no GPU hardware?"
ewarn "/dev/nvidiactl: nvidia not in /proc/devices — no GPU hardware present?"
eend 0
return 0
fi
uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices 2>/dev/null | awk '{print $1}')
if [ -n "$uvm_major" ]; then
mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 2>/dev/null || true
mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 2>/dev/null \
&& einfo "created /dev/nvidia-uvm (major $uvm_major)" \
|| ewarn "/dev/nvidia-uvm already exists or mknod failed"
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 2>/dev/null || true
einfo "created /dev/nvidia-uvm (major $uvm_major)"
else
ewarn "/dev/nvidia-uvm: nvidia-uvm not in /proc/devices"
fi
eend 0