Add fabric manager boot and support diagnostics
This commit is contained in:
@@ -22,6 +22,8 @@ var supportBundleServices = []string{
|
|||||||
"bee-selfheal.service",
|
"bee-selfheal.service",
|
||||||
"bee-selfheal.timer",
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
|
"nvidia-dcgm.service",
|
||||||
|
"nvidia-fabricmanager.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
var supportBundleCommands = []struct {
|
var supportBundleCommands = []struct {
|
||||||
@@ -48,6 +50,43 @@ else
|
|||||||
fi
|
fi
|
||||||
`}},
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
nvidia-smi topo -m 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "nvidia-smi not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v systemctl >/dev/null 2>&1; then
|
||||||
|
echo "systemctl not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "=== unit files ==="
|
||||||
|
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== active units ==="
|
||||||
|
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== failed units ==="
|
||||||
|
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||||
|
`}},
|
||||||
|
// system/fabric-manager-paths.txt: list each candidate Fabric Manager binary
// that exists under /usr/bin, or print a single notice when none exist.
// The notice is driven by a flag set inside the loop. A single `ls` over all
// candidates exits non-zero as soon as any one path is missing, which would
// print the notice even after some binaries had already been listed.
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
found=0
for candidate in \
  /usr/bin/nvidia-fabricmanager \
  /usr/bin/nv-fabricmanager \
  /usr/bin/nvidia-fabricmanagerd \
  /usr/bin/nvlsm; do
  if [ -e "$candidate" ]; then
    found=1
    echo "=== $candidate ==="
    ls -l "$candidate" 2>&1 || true
    echo
  fi
done
if [ "$found" -eq 0 ]; then
  echo "no fabric manager binaries found"
fi
`}},
|
||||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
if ! command -v lspci >/dev/null 2>&1; then
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
echo "lspci not found"
|
echo "lspci not found"
|
||||||
@@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
}{
|
}{
|
||||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
|
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||||
|
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||||
|
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||||
|
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||||
|
|||||||
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
|
|||||||
"bee-audit",
|
"bee-audit",
|
||||||
"bee-web",
|
"bee-web",
|
||||||
"bee-sshsetup",
|
"bee-sshsetup",
|
||||||
|
"nvidia-dcgm",
|
||||||
|
"nvidia-fabricmanager",
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
DEBIAN_VERSION=12
|
DEBIAN_VERSION=12
|
||||||
DEBIAN_KERNEL_ABI=auto
|
DEBIAN_KERNEL_ABI=auto
|
||||||
NVIDIA_DRIVER_VERSION=590.48.01
|
NVIDIA_DRIVER_VERSION=590.48.01
|
||||||
|
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||||
NCCL_VERSION=2.28.9-1
|
NCCL_VERSION=2.28.9-1
|
||||||
NCCL_CUDA_VERSION=13.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
|
|||||||
@@ -1262,6 +1262,7 @@ fi
|
|||||||
# --- substitute version placeholders in package list and archive ---
|
# --- substitute version placeholders in package list and archive ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
sed -i \
|
sed -i \
|
||||||
|
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
|||||||
# Enable GPU-vendor specific services
|
# Enable GPU-vendor specific services
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||||
systemctl enable bee-nvidia.service
|
systemctl enable bee-nvidia.service
|
||||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||||
# explicitly.
|
# explicitly.
|
||||||
|
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
|||||||
@@ -258,6 +258,22 @@ else
|
|||||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
|
||||||
|
# systems CUDA/DCGM can report "system not yet initialized" until fabric
|
||||||
|
# training completes under nvidia-fabricmanager.
|
||||||
|
if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
|
||||||
|
if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager restarted"
|
||||||
|
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager started"
|
||||||
|
else
|
||||||
|
log "WARN: failed to start nvidia-fabricmanager.service"
|
||||||
|
systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: nvidia-fabricmanager.service not installed"
|
||||||
|
fi
|
||||||
|
|
||||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||||
|
|||||||
Reference in New Issue
Block a user