Alpine uses mdev, which has no rules for NVIDIA devices. Without /dev/nvidiactl
and /dev/nvidia{0-7}, nvidia-smi returns NVML_ERROR_LIBRARY_NOT_FOUND (exit 12)
even though the kernel modules are loaded and the libraries are present.
Fix: after insmod, read major numbers from /proc/devices and mknod the required
character devices (/dev/nvidiactl, /dev/nvidia{0-7}, /dev/nvidia-uvm).
Add /dev/nvidia* node checks to smoketest for earlier failure detection.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
73 lines
2.5 KiB
Plaintext
Executable File
73 lines
2.5 KiB
Plaintext
Executable File
#!/sbin/openrc-run
# OpenRC service script that loads the NVIDIA kernel modules found in
# NVIDIA_KO_DIR (see start()).

# Human-readable service description used by OpenRC.
description="Bee: load NVIDIA kernel modules"

# Directory containing the NVIDIA *.ko files that start() loads with insmod.
NVIDIA_KO_DIR="/usr/local/lib/nvidia"

# OpenRC dependency hook: declares this service's ordering constraints.
depend() {
  # Local filesystems have to be mounted before the service starts.
  need localmount
  # Run ahead of the bee-audit service.
  before bee-audit
}

# Print the character-device major registered under the first matching driver
# name, or print nothing when none of the names is present.
#   $1    - path of a /proc/devices-formatted file
#   $2... - driver names to look up, in order of preference
_nvidia_chardev_major() {
  local devices_file="$1" name major
  shift
  [ -r "$devices_file" ] || return 0
  for name in "$@"; do
    major=$(awk -v n="$name" '$2 == n { print $1; exit }' "$devices_file" 2>/dev/null)
    if [ -n "$major" ]; then
      printf '%s\n' "$major"
      return 0
    fi
  done
  return 0
}

# Make sure character device $1 exists with major $2 and minor $3.
# A node that is already present counts as success, so a service restart is
# idempotent; a real mknod failure is reported instead of being swallowed.
_nvidia_ensure_node() {
  local node="$1" major="$2" minor="$3"
  [ -c "$node" ] && return 0
  if mknod -m 666 "$node" c "$major" "$minor" 2>/dev/null; then
    return 0
  fi
  ewarn "failed to create $node (c $major $minor)"
  return 1
}

# OpenRC start hook: load the NVIDIA kernel modules and set up what userspace
# needs around them.
#
# Steps:
#   1. create the libnvidia-ml / libcuda soname symlinks,
#   2. insmod nvidia, nvidia-modeset and nvidia-uvm from NVIDIA_KO_DIR,
#   3. mknod /dev/nvidiactl, /dev/nvidia{0-7}, /dev/nvidia-uvm{,-tools}.
#
# Globals:
#   NVIDIA_KO_DIR   (read) directory holding the *.ko files
#   BEE_NVIDIA_ROOT (read, optional) prefix prepended to /usr/lib, /dev,
#                   /proc/devices and /sys/module; empty by default. Lets the
#                   function run against a staged tree.
# Returns:
#   1 when NVIDIA_KO_DIR is missing; otherwise the status of `eend 0`.
#   Module-load and mknod failures are only warned about (best effort).
start() {
  local root lib_dir dev_dir proc_devices sys_module_dir
  local ko lib soname candidate versioned mod mod_us
  local nvidia_major uvm_major nodes_ok i

  root="${BEE_NVIDIA_ROOT:-}"
  lib_dir="${root}/usr/lib"
  dev_dir="${root}/dev"
  proc_devices="${root}/proc/devices"
  sys_module_dir="${root}/sys/module"

  ebegin "Loading NVIDIA modules"
  einfo "kernel: $(uname -r)"

  if [ ! -d "$NVIDIA_KO_DIR" ]; then
    ewarn "NVIDIA module dir missing: $NVIDIA_KO_DIR"
    eend 1
    return 1
  fi

  einfo "module dir: $NVIDIA_KO_DIR"
  # List the available modules with a glob rather than by parsing ls output.
  for ko in "$NVIDIA_KO_DIR"/*.ko; do
    [ -e "$ko" ] || continue
    printf ' %s\n' "$ko"
  done

  # Create the libnvidia-ml / libcuda soname symlinks needed by nvidia-smi
  # (glibc binary on Alpine/musl).
  for lib in libnvidia-ml libcuda; do
    soname="${lib}.so.1"
    versioned=
    for candidate in "$lib_dir/${lib}.so."[0-9]*; do
      # Skips an unmatched glob and dangling or self-referential links.
      [ -e "$candidate" ] || continue
      # Never pick the soname itself: linking it to itself would replace a
      # working link with a symlink loop on every restart.
      [ "${candidate##*/}" = "$soname" ] && continue
      versioned=${candidate##*/}
      break
    done
    # Only (re)point the soname when it is a symlink or absent; a regular
    # file at that path is left untouched.
    if [ -n "$versioned" ] && { [ -L "$lib_dir/$soname" ] || [ ! -e "$lib_dir/$soname" ]; }; then
      ln -sf "$versioned" "$lib_dir/$soname" 2>/dev/null || true
    fi
    if [ -e "$lib_dir/$soname" ]; then
      ln -sf "$soname" "$lib_dir/${lib}.so" 2>/dev/null || true
    fi
  done

  # Load modules via insmod (bypasses modules.dep — modloop squashfs is read-only)
  for mod in nvidia nvidia-modeset nvidia-uvm; do
    # Underscore spelling, used for /sys/module and as the fallback file name.
    # tr keeps this POSIX; ${var//-/_} is not available in every /bin/sh.
    mod_us=$(printf '%s' "$mod" | tr '-' '_')

    # insmod fails on a module that is already loaded; that is not an error.
    if [ -d "$sys_module_dir/$mod_us" ]; then
      einfo "already loaded: $mod"
      continue
    fi

    ko="$NVIDIA_KO_DIR/${mod}.ko"
    [ -f "$ko" ] || ko="$NVIDIA_KO_DIR/${mod_us}.ko"
    if [ ! -f "$ko" ]; then
      ewarn "not found: $ko"
      continue
    fi

    if insmod "$ko" 2>/dev/null; then
      einfo "loaded: $mod"
    else
      ewarn "failed to load: $mod"
      dmesg | tail -n 5 | sed 's/^/ dmesg: /' || true
    fi
  done

  # Create /dev/nvidia* device nodes — mdev on Alpine does not have NVIDIA rules,
  # so the kernel hotplug events are not handled and nodes are never created.
  # Without /dev/nvidiactl nvidia-smi returns NVML_ERROR_LIBRARY_NOT_FOUND (exit 12).
  # The major is looked up as "nvidiactl" first, then as "nvidia-frontend",
  # the name used by older driver releases.
  nvidia_major=$(_nvidia_chardev_major "$proc_devices" nvidiactl nvidia-frontend)
  if [ -n "$nvidia_major" ]; then
    nodes_ok=1
    _nvidia_ensure_node "$dev_dir/nvidiactl" "$nvidia_major" 255 || nodes_ok=0
    for i in 0 1 2 3 4 5 6 7; do
      _nvidia_ensure_node "$dev_dir/nvidia$i" "$nvidia_major" "$i" || nodes_ok=0
    done
    if [ "$nodes_ok" -eq 1 ]; then
      einfo "created $dev_dir/nvidiactl and $dev_dir/nvidia{0-7} (major $nvidia_major)"
    fi
  else
    ewarn "/dev/nvidiactl: nvidia not in /proc/devices — no GPU hardware?"
  fi

  uvm_major=$(_nvidia_chardev_major "$proc_devices" nvidia-uvm)
  if [ -n "$uvm_major" ]; then
    nodes_ok=1
    _nvidia_ensure_node "$dev_dir/nvidia-uvm" "$uvm_major" 0 || nodes_ok=0
    _nvidia_ensure_node "$dev_dir/nvidia-uvm-tools" "$uvm_major" 1 || nodes_ok=0
    if [ "$nodes_ok" -eq 1 ]; then
      einfo "created $dev_dir/nvidia-uvm (major $uvm_major)"
    fi
  fi

  eend 0
}