From 7237e4d3e4aecc48f2cead0bffc61cbe4f7a46f8 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 15 Apr 2026 16:14:26 +0300 Subject: [PATCH] Add fabric manager boot and support diagnostics --- audit/internal/app/support_bundle.go | 43 +++++++++++++++++++ audit/internal/platform/runtime.go | 2 + iso/builder/VERSIONS | 1 + iso/builder/build.sh | 1 + .../hooks/normal/9000-bee-setup.hook.chroot | 1 + .../package-lists/bee-nvidia.list.chroot | 1 + iso/overlay/usr/local/bin/bee-nvidia-load | 16 +++++++ 7 files changed, 65 insertions(+) diff --git a/audit/internal/app/support_bundle.go b/audit/internal/app/support_bundle.go index 7be6e8d..b1a97f8 100644 --- a/audit/internal/app/support_bundle.go +++ b/audit/internal/app/support_bundle.go @@ -22,6 +22,8 @@ var supportBundleServices = []string{ "bee-selfheal.service", "bee-selfheal.timer", "bee-sshsetup.service", + "nvidia-dcgm.service", + "nvidia-fabricmanager.service", } var supportBundleCommands = []struct { @@ -48,6 +50,43 @@ else fi `}}, {name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}}, + {name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", ` +if command -v nvidia-smi >/dev/null 2>&1; then + nvidia-smi topo -m 2>&1 || true +else + echo "nvidia-smi not found" +fi +`}}, + {name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", ` +if ! 
command -v systemctl >/dev/null 2>&1; then + echo "systemctl not found" + exit 0 +fi +echo "=== unit files ===" +systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true +echo +echo "=== active units ===" +systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true +echo +echo "=== failed units ===" +systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units" +`}}, + {name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", ` +for candidate in \ + /usr/bin/nvidia-fabricmanager \ + /usr/bin/nv-fabricmanager \ + /usr/bin/nvidia-fabricmanagerd \ + /usr/bin/nvlsm; do + if [ -e "$candidate" ]; then + echo "=== $candidate ===" + ls -l "$candidate" 2>&1 || true + echo + fi +done +if [ ! -e /usr/bin/nvidia-fabricmanager ] && [ ! -e /usr/bin/nv-fabricmanager ] && [ ! -e /usr/bin/nvidia-fabricmanagerd ] && [ ! -e /usr/bin/nvlsm ]; then # report only when none of the candidates exist + echo "no fabric manager binaries found" +fi +`}}, {name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", ` if ! command -v lspci >/dev/null 2>&1; then echo "lspci not found" @@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct { }{ {name: "system/kern.log", src: "/var/log/kern.log"}, {name: "system/syslog.txt", src: "/var/log/syslog"}, + {name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"}, + {name: "system/nvlsm.log", src: "/var/log/nvlsm.log"}, + {name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"}, + {name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"}, } const supportBundleGlob = "????-??-?? 
(BEE-SP*)*.tar.gz" diff --git a/audit/internal/platform/runtime.go b/audit/internal/platform/runtime.go index 601e676..6fb2df3 100644 --- a/audit/internal/platform/runtime.go +++ b/audit/internal/platform/runtime.go @@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{ "bee-audit", "bee-web", "bee-sshsetup", + "nvidia-dcgm", + "nvidia-fabricmanager", } func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) { diff --git a/iso/builder/VERSIONS b/iso/builder/VERSIONS index 3cd4a93..fd9857d 100644 --- a/iso/builder/VERSIONS +++ b/iso/builder/VERSIONS @@ -1,6 +1,7 @@ DEBIAN_VERSION=12 DEBIAN_KERNEL_ABI=auto NVIDIA_DRIVER_VERSION=590.48.01 +NVIDIA_FABRICMANAGER_VERSION=590.48.01-1 NCCL_VERSION=2.28.9-1 NCCL_CUDA_VERSION=13.0 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186 diff --git a/iso/builder/build.sh b/iso/builder/build.sh index 04de8da..d86b246 100755 --- a/iso/builder/build.sh +++ b/iso/builder/build.sh @@ -1262,6 +1262,7 @@ fi # --- substitute version placeholders in package list and archive --- if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then sed -i \ + -e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \ -e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \ "${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot" elif [ "$BEE_GPU_VENDOR" = "amd" ]; then diff --git a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot index 8fee9b8..35de676 100755 --- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot +++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot @@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true # Enable GPU-vendor specific services if [ "$GPU_VENDOR" = "nvidia" ]; then systemctl enable nvidia-dcgm.service 2>/dev/null || true + systemctl enable nvidia-fabricmanager.service 2>/dev/null || true systemctl enable bee-nvidia.service elif [ "$GPU_VENDOR" = "amd" 
]; then # ROCm symlinks (packages install to /opt/rocm-*/bin/) diff --git a/iso/builder/config/package-lists/bee-nvidia.list.chroot b/iso/builder/config/package-lists/bee-nvidia.list.chroot index 351aff7..13ae433 100644 --- a/iso/builder/config/package-lists/bee-nvidia.list.chroot +++ b/iso/builder/config/package-lists/bee-nvidia.list.chroot @@ -5,6 +5,7 @@ # DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with # CUDA 13 userspace, so install the CUDA 13 build plus proprietary components # explicitly. +nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%% datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%% diff --git a/iso/overlay/usr/local/bin/bee-nvidia-load b/iso/overlay/usr/local/bin/bee-nvidia-load index 048a545..6da8281 100755 --- a/iso/overlay/usr/local/bin/bee-nvidia-load +++ b/iso/overlay/usr/local/bin/bee-nvidia-load @@ -258,6 +258,22 @@ else log "WARN: nvidia-smi not found — cannot enable persistence mode" fi +# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch +# systems CUDA/DCGM can report "system not yet initialized" until fabric +# training completes under nvidia-fabricmanager. +if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then + if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then + log "nvidia-fabricmanager restarted" + elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then + log "nvidia-fabricmanager started" + else + log "WARN: failed to start nvidia-fabricmanager.service" + systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true + fi +else + log "WARN: nvidia-fabricmanager.service not installed" +fi + # Start DCGM host engine so dcgmi can discover GPUs. 
# nv-hostengine must run after the NVIDIA modules and device nodes are ready. # If it started too early (for example via systemd before bee-nvidia-load), it can