#!/bin/sh
# bee-nvidia-load — load NVIDIA kernel modules and create device nodes
# Called by bee-nvidia.service at boot.

NVIDIA_KO_DIR="/usr/local/lib/nvidia"

log() { echo "[bee-nvidia] $*"; }

read_nvidia_modules_flavor() {
    if [ -f /etc/bee-nvidia-modules-flavor ]; then
        flavor="$(tr -d '[:space:]' < /etc/bee-nvidia-modules-flavor 2>/dev/null)"
        case "$flavor" in
            open|proprietary)
                echo "$flavor"
                return 0
                ;;
        esac
    fi
    echo "proprietary"
}

log "kernel: $(uname -r)"

# Skip if no NVIDIA display/compute GPU is present.
# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
have_nvidia_gpu() {
    lspci -Dn 2>/dev/null | awk '
        $2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit }
        END { exit(found ? 0 : 1) }
    '
}

if ! have_nvidia_gpu; then
    log "no NVIDIA GPU detected — skipping module load"
    exit 0
fi

if [ ! -d "$NVIDIA_KO_DIR" ]; then
    log "ERROR: NVIDIA module dir missing: $NVIDIA_KO_DIR"
    exit 1
fi

log "module dir: $NVIDIA_KO_DIR"
ls "$NVIDIA_KO_DIR"/*.ko 2>/dev/null | sed 's/^/ /' || true

cmdline_param() {
    key="$1"
    for token in $(cat /proc/cmdline 2>/dev/null); do
        case "$token" in
            "$key"=*)
                echo "${token#*=}"
                return 0
                ;;
        esac
    done
    return 1
}

nvidia_mode="$(cmdline_param bee.nvidia.mode || true)"
if [ -z "$nvidia_mode" ]; then
    nvidia_mode="normal"
fi
log "boot mode: $nvidia_mode"

nvidia_modules_flavor="$(read_nvidia_modules_flavor)"
log "modules flavor: $nvidia_modules_flavor"

load_module() {
    mod="$1"
    shift
    ko="$NVIDIA_KO_DIR/${mod}.ko"
    # Fall back to the underscore spelling; use tr instead of the bash-only
    # ${mod//-/_} substitution so the script stays /bin/sh-compatible.
    [ -f "$ko" ] || ko="$NVIDIA_KO_DIR/$(echo "$mod" | tr '-' '_').ko"
    if [ ! -f "$ko" ]; then
        log "WARN: not found: $ko"
        return 1
    fi
    timeout 90 insmod "$ko" "$@"
    rc=$?
    if [ "$rc" -eq 0 ]; then
        log "loaded: $mod $*"
        return 0
    fi
    log "WARN: failed to load: $mod (exit $rc)"
    dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
    return 1
}

nvidia_is_functional() {
    grep -q ' nvidiactl$' /proc/devices 2>/dev/null
}

load_module_with_gsp_fallback() {
    ko="$NVIDIA_KO_DIR/nvidia.ko"
    if [ ! -f "$ko" ]; then
        log "ERROR: not found: $ko"
        return 1
    fi

    # Run insmod in background — on some converted SXM→PCIe cards GSP enters an
    # infinite crash/reload loop and insmod never returns. We check for successful
    # initialization by polling /proc/devices for nvidiactl instead of waiting for
    # insmod to exit.
    log "loading nvidia (GSP enabled, timeout 90s)"
    insmod "$ko" &
    _insmod_pid=$!

    _waited=0
    while [ "$_waited" -lt 90 ]; do
        if nvidia_is_functional; then
            log "loaded: nvidia (GSP enabled, ${_waited}s)"
            echo "gsp-on" > /run/bee-nvidia-mode
            return 0
        fi
        # Check whether insmod exited with an error before the timeout.
        if ! kill -0 "$_insmod_pid" 2>/dev/null; then
            wait "$_insmod_pid"
            _rc=$?
            if [ "$_rc" -ne 0 ]; then
                log "nvidia load failed (exit $_rc)"
                dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
                return 1
            fi
            # insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment.
            sleep 2
            if nvidia_is_functional; then
                log "loaded: nvidia (GSP enabled, ${_waited}s)"
                echo "gsp-on" > /run/bee-nvidia-mode
                return 0
            fi
            log "insmod exited 0 but nvidiactl missing — treating as failure"
            return 1
        fi
        sleep 1
        _waited=$((_waited + 1))
    done

    # GSP init timed out — kill the hanging insmod and attempt gsp-off fallback.
    log "nvidia GSP init timed out after 90s"
    kill "$_insmod_pid" 2>/dev/null || true
    wait "$_insmod_pid" 2>/dev/null || true

    # Attempt to unload the partially-initialized module.
    if ! rmmod nvidia 2>/dev/null; then
        # Module is stuck in the kernel — cannot reload with different params.
        # User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel" log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu" echo "gsp-stuck" > /run/bee-nvidia-mode return 1 fi sleep 2 log "retrying with NVreg_EnableGpuFirmware=0" log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware" if insmod "$ko" NVreg_EnableGpuFirmware=0; then if nvidia_is_functional; then log "loaded: nvidia (GSP disabled)" echo "gsp-off" > /run/bee-nvidia-mode return 0 fi log "insmod gsp-off exited 0 but nvidiactl missing" return 1 fi log "nvidia load failed (GSP=off)" dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true return 1 } load_host_module() { mod="$1" if modprobe "$mod" >/dev/null 2>&1; then log "host module loaded: $mod" return 0 fi return 1 } if [ "$nvidia_modules_flavor" = "open" ]; then case "$nvidia_mode" in gsp-off|safe|nomsi) log "ignoring boot mode ${nvidia_mode} for open NVIDIA modules" ;; esac if ! load_module nvidia; then exit 1 fi # nvidia-modeset on some server kernels needs ACPI video helper symbols # exported by the generic "video" module. Best-effort only; compute paths # remain functional even if display-related modules stay absent. load_host_module video || true load_module nvidia-modeset || true load_module nvidia-uvm || true else case "$nvidia_mode" in normal|full) if ! load_module_with_gsp_fallback; then exit 1 fi # nvidia-modeset on some server kernels needs ACPI video helper symbols # exported by the generic "video" module. Best-effort only; compute paths # remain functional even if display-related modules stay absent. load_host_module video || true load_module nvidia-modeset || true load_module nvidia-uvm || true ;; gsp-off|safe) # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can # be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the # conservative path for platforms where full boot-time GSP init is unstable. if ! load_module nvidia NVreg_EnableGpuFirmware=0; then exit 1 fi log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot" ;; nomsi|*) # nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits). # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs. if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then exit 1 fi log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm" ;; esac fi # Create /dev/nvidia* device nodes (udev rules absent since we use .run installer) nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}') if [ -n "$nvidia_major" ]; then mknod -m 666 /dev/nvidiactl c "$nvidia_major" 255 \ && log "created /dev/nvidiactl (major $nvidia_major)" \ || log "WARN: /dev/nvidiactl already exists or mknod failed" for i in 0 1 2 3 4 5 6 7; do mknod -m 666 "/dev/nvidia$i" c "$nvidia_major" "$i" || true done log "created /dev/nvidia{0-7}" else log "WARN: nvidiactl not in /proc/devices — no GPU hardware present?" fi uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices | awk '{print $1}') if [ -n "$uvm_major" ]; then mknod -m 666 /dev/nvidia-uvm c "$uvm_major" 0 \ && log "created /dev/nvidia-uvm (major $uvm_major)" \ || log "WARN: /dev/nvidia-uvm already exists" mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true fi # Refresh dynamic linker cache so that NVIDIA/NCCL libs injected into /usr/lib/ # are visible to dlopen() calls (libcuda, libnvidia-ptxjitcompiler, libnccl, etc.) 
ldconfig 2>/dev/null || true
log "ldconfig refreshed"

# Keep persistence mode enabled across the session so dcgmi / stress tools do
# not fail with deployment warnings on otherwise healthy GPUs.
if command -v nvidia-smi >/dev/null 2>&1; then
    if nvidia-smi -pm 1 >/dev/null 2>&1; then
        log "enabled NVIDIA persistence mode"
    else
        log "WARN: failed to enable NVIDIA persistence mode"
    fi
else
    log "WARN: nvidia-smi not found — cannot enable persistence mode"
fi

# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
# systems CUDA/DCGM can report "system not yet initialized" until fabric
# training completes under nvidia-fabricmanager.
if command -v systemctl >/dev/null 2>&1 &&
    systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
    if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
        log "nvidia-fabricmanager restarted"
    elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
        log "nvidia-fabricmanager started"
    else
        log "WARN: failed to start nvidia-fabricmanager.service"
        systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
    fi
else
    log "WARN: nvidia-fabricmanager.service not installed"
fi

# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can
# keep a stale empty inventory and dcgmi diag later reports no testable entities.
if command -v nv-hostengine >/dev/null 2>&1; then
    if pgrep -x nv-hostengine >/dev/null 2>&1; then
        if command -v pkill >/dev/null 2>&1; then
            pkill -x nv-hostengine >/dev/null 2>&1 || true
            tries=0
            while pgrep -x nv-hostengine >/dev/null 2>&1; do
                tries=$((tries + 1))
                if [ "${tries}" -ge 10 ]; then
                    log "WARN: nv-hostengine is still running after restart request"
                    break
                fi
                sleep 1
            done
            if pgrep -x nv-hostengine >/dev/null 2>&1; then
                log "WARN: keeping existing nv-hostengine process"
            else
                log "nv-hostengine restarted"
            fi
        else
            log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
        fi
    fi
    if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
        nv-hostengine
        log "nv-hostengine started"
    fi
else
    log "WARN: nv-hostengine not found — dcgmi diagnostics will not work"
fi

log "done"
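
# Note: /run/bee-nvidia-mode (written above on the proprietary GSP path) holds
# "gsp-on", "gsp-off", or "gsp-stuck" for later tooling to inspect.
# Exit 0 explicitly so bee-nvidia.service records success; recoverable problems
# above are only logged as warnings, while fatal ones already exited nonzero.
exit 0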