fix: fail loudly on missing NVIDIA libs and .ko, improve mknod logging

build-nvidia-module.sh:
- Replace silent glob cp for libnvidia-ml/libcuda with find + explicit error
  if library not found in extract dir (catches installer layout changes)
- Fix circular symlink bug: don't create .so.1 -> .so.1 if versioned file
  is already named .so.1
- Verify .ko count > 0 after build, fail loudly if none produced
- Show lib cache in final summary

bee-nvidia:
- mknod failures for /dev/nvidiactl and /dev/nvidia-uvm are now logged with ewarn
  instead of being silently suppressed
- If nvidia is not in /proc/devices (no GPU hardware), log clearly and exit cleanly

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-08 17:07:47 +03:00
parent 11e001cafa
commit d952e10dbb
2 changed files with 37 additions and 14 deletions

View File

@@ -52,20 +52,27 @@ start() {
# Without /dev/nvidiactl nvidia-smi returns NVML_ERROR_LIBRARY_NOT_FOUND (exit 12).
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices 2>/dev/null | awk '{print $1}')
if [ -n "$nvidia_major" ]; then
mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 2>/dev/null || true
mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 2>/dev/null \
&& einfo "created /dev/nvidiactl (major $nvidia_major)" \
|| ewarn "/dev/nvidiactl already exists or mknod failed"
for i in 0 1 2 3 4 5 6 7; do
mknod -m 666 "/dev/nvidia$i" c "$nvidia_major" "$i" 2>/dev/null || true
done
einfo "created /dev/nvidiactl and /dev/nvidia{0-7} (major $nvidia_major)"
einfo "created /dev/nvidia{0-7}"
else
ewarn "/dev/nvidiactl: nvidia not in /proc/devices — no GPU hardware?"
ewarn "/dev/nvidiactl: nvidia not in /proc/devices — no GPU hardware present?"
eend 0
return 0
fi
uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices 2>/dev/null | awk '{print $1}')
if [ -n "$uvm_major" ]; then
mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 2>/dev/null || true
mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 2>/dev/null \
&& einfo "created /dev/nvidia-uvm (major $uvm_major)" \
|| ewarn "/dev/nvidia-uvm already exists or mknod failed"
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 2>/dev/null || true
einfo "created /dev/nvidia-uvm (major $uvm_major)"
else
ewarn "/dev/nvidia-uvm: nvidia-uvm not in /proc/devices"
fi
eend 0