- bee-nvidia-load: run insmod in background, poll /proc/devices for nvidiactl; if GSP init doesn't complete in 90s, kill insmod and retry with NVreg_EnableGpuFirmware=0. Handles EBUSY case with clear error. - Write /run/bee-nvidia-mode (gsp-on/gsp-off/gsp-stuck) for audit layer - Show GSP mode badge in sidebar: yellow for gsp-off, red for gsp-stuck - Report NvidiaGSPMode in RuntimeHealth with issue entries - Simplify GRUB menu: default (KMS+GSP), advanced submenu (GSP=off, nomodeset, fail-safe), remove load-to-RAM entry - Add pcmanfm, ristretto, mupdf, mousepad to desktop packages Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
247 lines
8.3 KiB
Bash
Executable File
247 lines
8.3 KiB
Bash
Executable File
#!/bin/sh
|
|
# bee-nvidia-load — load NVIDIA kernel modules and create device nodes
|
|
# Called by bee-nvidia.service at boot.
|
|
|
|
NVIDIA_KO_DIR="/usr/local/lib/nvidia"

# log MESSAGE...
# Print the arguments, joined by spaces, on stdout with a "[bee-nvidia] " prefix.
# printf '%s\n' is used instead of echo because some messages embed values read
# from the kernel command line, and echo's handling of backslash sequences
# differs between /bin/sh implementations (it may expand "\n", "\c", ...).
log() {
  printf '%s\n' "[bee-nvidia] $*"
}
|
|
|
|
log "kernel: $(uname -r)"

# Nothing to do when lspci shows no device with the NVIDIA PCI vendor ID (10de).
lspci -nn 2>/dev/null | grep -qi '10de:' || {
  log "no NVIDIA GPU detected — skipping module load"
  exit 0
}

# The directory holding the .ko files must exist before any load attempt.
[ -d "$NVIDIA_KO_DIR" ] || {
  log "ERROR: NVIDIA module dir missing: $NVIDIA_KO_DIR"
  exit 1
}

log "module dir: $NVIDIA_KO_DIR"
# Informational listing of the available modules; an empty directory is not
# treated as an error at this point.
ls "$NVIDIA_KO_DIR"/*.ko 2>/dev/null | sed 's/^/ /' || true
|
|
|
|
# cmdline_param KEY [FILE]
# Print the value of the first KEY=VALUE token found in the kernel command line.
#   KEY   parameter name to look for (matched literally, up to the first '=')
#   FILE  optional file to read instead of /proc/cmdline
# Returns 0 and prints VALUE when found; returns 1 and prints nothing when the
# key is absent, has no '=', or the file cannot be read.
#
# The body runs in a subshell so that 'set -f' and the working variables do not
# leak into the caller. 'set -f' matters: the command line is split into words
# by the shell, and without it a token containing '*' or '?' would be replaced
# by matching file names from the current directory.
cmdline_param() (
  key="$1"
  cmdline_file="${2:-/proc/cmdline}"
  set -f
  for token in $(cat "$cmdline_file" 2>/dev/null); do
    case "$token" in
      "$key"=*)
        # Strip everything up to and including the first '='.
        printf '%s\n' "${token#*=}"
        exit 0
        ;;
    esac
  done
  exit 1
)
|
|
|
|
# Boot mode comes from bee.nvidia.mode=<value> on the kernel command line;
# an absent or empty value means "normal".
nvidia_mode="$(cmdline_param bee.nvidia.mode || true)"
: "${nvidia_mode:=normal}"
log "boot mode: $nvidia_mode"
|
|
|
|
# load_module NAME [PARAM=VALUE...]
# Insert $NVIDIA_KO_DIR/NAME.ko with insmod, passing any extra arguments through
# as module parameters. If NAME.ko does not exist, the variant with every '-'
# replaced by '_' is tried (nvidia-uvm -> nvidia_uvm.ko).
# insmod is run under a 90 second timeout.
# Returns 0 on success; 1 if no .ko file was found or insmod failed, in which
# case the last 10 dmesg lines are printed for diagnosis.
load_module() {
  mod="$1"
  shift
  ko="$NVIDIA_KO_DIR/${mod}.ko"
  if [ ! -f "$ko" ]; then
    # tr instead of ${mod//-/_}: pattern substitution is a bashism and is a
    # fatal "Bad substitution" error in strict POSIX shells under #!/bin/sh.
    ko="$NVIDIA_KO_DIR/$(printf '%s' "$mod" | tr '-' '_').ko"
  fi
  if [ ! -f "$ko" ]; then
    log "WARN: not found: $ko"
    return 1
  fi

  # Capture the status right away: reading $? after a completed 'if ... fi'
  # yields the status of the if statement (0), not of insmod.
  _lm_rc=0
  timeout 90 insmod "$ko" "$@" || _lm_rc=$?
  if [ "$_lm_rc" -eq 0 ]; then
    log "loaded: $mod $*"
    return 0
  fi

  log "WARN: failed to load: $mod (exit $_lm_rc)"
  dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
  return 1
}
|
|
|
|
# nvidia_is_functional
# Succeeds when /proc/devices contains a line ending in " nvidiactl", which the
# rest of this script uses as the signal that the nvidia module has initialised.
# Produces no output; -s keeps grep quiet if the file cannot be read.
nvidia_is_functional() {
  grep -qs ' nvidiactl$' /proc/devices
}
|
|
|
|
# load_module_with_gsp_fallback
# Load $NVIDIA_KO_DIR/nvidia.ko with default parameters (GSP firmware enabled)
# and, if initialisation does not complete within ~90 polls of one second,
# unload it and retry with NVreg_EnableGpuFirmware=0.
#
# The outcome is recorded in the mode file (default /run/bee-nvidia-mode, can be
# overridden with BEE_NVIDIA_MODE_FILE):
#   gsp-on     loaded with GSP enabled
#   gsp-off    loaded after falling back to NVreg_EnableGpuFirmware=0
#   gsp-stuck  timed out and rmmod could not remove the module
# Returns 0 when nvidiactl is registered, 1 otherwise.
load_module_with_gsp_fallback() {
  ko="$NVIDIA_KO_DIR/nvidia.ko"
  _mode_file="${BEE_NVIDIA_MODE_FILE:-/run/bee-nvidia-mode}"
  if [ ! -f "$ko" ]; then
    log "ERROR: not found: $ko"
    return 1
  fi

  # Run insmod in background — on some converted SXM→PCIe cards GSP enters an
  # infinite crash/reload loop and insmod never returns. Successful
  # initialisation is detected by polling /proc/devices for nvidiactl instead
  # of waiting for insmod to exit.
  log "loading nvidia (GSP enabled, timeout 90s)"
  insmod "$ko" &
  _insmod_pid=$!

  _waited=0
  while [ "$_waited" -lt 90 ]; do
    if nvidia_is_functional; then
      log "loaded: nvidia (GSP enabled, ${_waited}s)"
      printf '%s\n' "gsp-on" > "$_mode_file"
      return 0
    fi

    # insmod is gone: collect its status and decide without waiting further.
    if ! kill -0 "$_insmod_pid" 2>/dev/null; then
      wait "$_insmod_pid"
      _rc=$?
      if [ "$_rc" -ne 0 ]; then
        log "nvidia load failed (exit $_rc)"
        dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
        return 1
      fi
      # insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
      sleep 2
      if nvidia_is_functional; then
        log "loaded: nvidia (GSP enabled, ${_waited}s)"
        # This success path must record the mode too, otherwise consumers of
        # the mode file see no state for a GPU that loaded fine.
        printf '%s\n' "gsp-on" > "$_mode_file"
        return 0
      fi
      log "insmod exited 0 but nvidiactl missing — treating as failure"
      return 1
    fi

    sleep 1
    _waited=$((_waited + 1))
  done

  # GSP init timed out — stop the hanging insmod and attempt gsp-off fallback.
  log "nvidia GSP init timed out after 90s"
  kill "$_insmod_pid" 2>/dev/null || true
  # Bounded reap: an insmod blocked inside the kernel may not react to signals,
  # and an unconditional 'wait' would then hang this script forever.
  _grace=0
  while [ "$_grace" -lt 5 ] && kill -0 "$_insmod_pid" 2>/dev/null; do
    sleep 1
    _grace=$((_grace + 1))
  done
  if kill -0 "$_insmod_pid" 2>/dev/null; then
    kill -KILL "$_insmod_pid" 2>/dev/null || true
    log "WARN: insmod (pid $_insmod_pid) did not exit after SIGTERM — not waiting for it"
  else
    wait "$_insmod_pid" 2>/dev/null || true
  fi

  # Attempt to unload the partially-initialized module
  if ! rmmod nvidia 2>/dev/null; then
    # Module is stuck in the kernel — cannot reload with different params.
    # User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
    log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
    log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
    printf '%s\n' "gsp-stuck" > "$_mode_file"
    return 1
  fi

  sleep 2
  log "retrying with NVreg_EnableGpuFirmware=0"
  log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"

  if insmod "$ko" NVreg_EnableGpuFirmware=0; then
    if nvidia_is_functional; then
      log "loaded: nvidia (GSP disabled)"
      printf '%s\n' "gsp-off" > "$_mode_file"
      return 0
    fi
    log "insmod gsp-off exited 0 but nvidiactl missing"
    return 1
  fi

  log "nvidia load failed (GSP=off)"
  dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
  return 1
}
|
|
|
|
# load_host_module NAME
# Ask modprobe to load NAME, discarding all modprobe output.
# Logs "host module loaded: NAME" and returns 0 on success; returns 1 silently
# when modprobe fails.
load_host_module() {
  mod="$1"
  modprobe "$mod" >/dev/null 2>&1 || return 1
  log "host module loaded: $mod"
  return 0
}
|
|
|
|
# Load the driver stack according to the selected boot mode. Failure to load the
# core nvidia module aborts the script with status 1 in every mode.
case "$nvidia_mode" in
  normal|full)
    load_module_with_gsp_fallback || exit 1
    # nvidia-modeset on some server kernels needs ACPI video helper symbols
    # exported by the generic "video" module. Best-effort only; compute paths
    # remain functional even if display-related modules stay absent.
    load_host_module video || :
    load_module nvidia-modeset || :
    load_module nvidia-uvm || :
    ;;
  gsp-off|safe)
    # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and
    # can be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO
    # on the conservative path for platforms where full boot-time GSP init is
    # unstable.
    load_module nvidia NVreg_EnableGpuFirmware=0 || exit 1
    log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
    ;;
  nomsi|*)
    # nomsi (also the branch taken for any unrecognised mode value): disable
    # MSI-X/MSI interrupts — use when RmInitAdapter fails with
    # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
    # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
    load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0 || exit 1
    log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
    ;;
esac
|
|
|
|
# Create the /dev/nvidia* character devices by hand (udev rules are absent since
# the driver comes from the .run installer). Major numbers are looked up in
# /proc/devices; nvidiactl uses minor 255 and the GPU nodes use minors 0-7.
nvidia_major=$(awk '/ nvidiactl$/ { print $1; exit }' /proc/devices)
if [ -z "$nvidia_major" ]; then
  log "WARN: nvidiactl not in /proc/devices — no GPU hardware present?"
else
  mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 \
    && log "created /dev/nvidiactl (major $nvidia_major)" \
    || log "WARN: /dev/nvidiactl already exists or mknod failed"
  # Individual failures (e.g. node already present) are ignored on purpose.
  gpu_minor=0
  while [ "$gpu_minor" -le 7 ]; do
    mknod -m 666 "/dev/nvidia$gpu_minor" c "$nvidia_major" "$gpu_minor" || true
    gpu_minor=$((gpu_minor + 1))
  done
  log "created /dev/nvidia{0-7}"
fi

# nvidia-uvm has its own major number: minor 0 is the main node, minor 1 the
# tools node. Skipped entirely when the module is not registered.
uvm_major=$(awk '/ nvidia-uvm$/ { print $1; exit }' /proc/devices)
if [ -n "$uvm_major" ]; then
  mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 \
    && log "created /dev/nvidia-uvm (major $uvm_major)" \
    || log "WARN: /dev/nvidia-uvm already exists"
  mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true
fi
|
|
|
|
# Rebuild the dynamic linker cache so the NVIDIA/NCCL libraries injected into
# /usr/lib/ can be found by dlopen() (libcuda, libnvidia-ptxjitcompiler,
# libnccl, etc.). Best-effort: a failing ldconfig does not stop the script, and
# the log line below is printed either way.
ldconfig 2>/dev/null || :
log "ldconfig refreshed"
|
|
|
|
# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can
# keep a stale empty inventory and dcgmi diag later reports no testable entities.
# An already-running instance is therefore asked to stop (up to 10 one-second
# checks) and a fresh one is started once no instance is left.
if ! command -v nv-hostengine >/dev/null 2>&1; then
  log "WARN: nv-hostengine not found — dcgmi diagnostics will not work"
else
  if pgrep -x nv-hostengine >/dev/null 2>&1; then
    if command -v pkill >/dev/null 2>&1; then
      pkill -x nv-hostengine >/dev/null 2>&1 || true
      tries=0
      while pgrep -x nv-hostengine >/dev/null 2>&1; do
        tries=$((tries + 1))
        if [ "${tries}" -ge 10 ]; then
          log "WARN: nv-hostengine is still running after restart request"
          break
        fi
        sleep 1
      done
      if pgrep -x nv-hostengine >/dev/null 2>&1; then
        log "WARN: keeping existing nv-hostengine process"
      else
        log "nv-hostengine restarted"
      fi
    else
      log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
    fi
  fi

  if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
    # Report the real outcome: claiming "started" after a failed launch hides
    # the reason dcgmi later finds no host engine.
    nv-hostengine
    hostengine_rc=$?
    if [ "$hostengine_rc" -eq 0 ]; then
      log "nv-hostengine started"
    else
      log "WARN: nv-hostengine failed to start (exit $hostengine_rc)"
    fi
  fi
fi

log "done"
|