Add fabric manager boot and support diagnostics

This commit is contained in:
Mikhail Chusavitin
2026-04-15 16:14:26 +03:00
parent ab3ad77cd6
commit 7237e4d3e4
7 changed files with 65 additions and 0 deletions

View File

@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
# Enable GPU-vendor specific services
if [ "$GPU_VENDOR" = "nvidia" ]; then
systemctl enable nvidia-dcgm.service 2>/dev/null || true
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
systemctl enable bee-nvidia.service
elif [ "$GPU_VENDOR" = "amd" ]; then
# ROCm symlinks (packages install to /opt/rocm-*/bin/)

View File

@@ -5,6 +5,7 @@
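# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
# explicitly.
# The nvidia-fabricmanager pin below is not a DCGM package: it uses its own
# version placeholder, separate from the DCGM one.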
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%