Add fabric manager boot and support diagnostics

This commit is contained in:
Mikhail Chusavitin
2026-04-15 16:14:26 +03:00
parent ab3ad77cd6
commit 7237e4d3e4
7 changed files with 65 additions and 0 deletions

View File

@@ -258,6 +258,22 @@ else
log "WARN: nvidia-smi not found — cannot enable persistence mode"
fi
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
# systems CUDA/DCGM can report "system not yet initialized" until fabric
# training completes under nvidia-fabricmanager.
#
# Unit detection: the unit name is passed to systemctl as a pattern so that at
# most one line is listed, and grep consumes all of its input (no -q). Piping
# the full unit-file list into `grep -q` lets grep exit on the first match,
# which can kill systemctl with SIGPIPE and, under `set -o pipefail`, report an
# installed unit as "not installed".
if command -v systemctl >/dev/null 2>&1 \
  && systemctl list-unit-files --no-legend nvidia-fabricmanager.service 2>/dev/null \
    | grep '^nvidia-fabricmanager\.service' >/dev/null; then
  # Try a restart first (refreshes an already-running instance); if that fails,
  # make a second attempt with a plain start before giving up.
  if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
    log "nvidia-fabricmanager restarted"
  elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
    log "nvidia-fabricmanager started"
  else
    log "WARN: failed to start nvidia-fabricmanager.service"
    # Best-effort diagnostics: dump the unit status with a recognizable prefix.
    # `|| true` keeps a non-zero status exit code from aborting the script.
    systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
  fi
else
  # Reached when systemctl is missing or the unit file is not present.
  log "WARN: nvidia-fabricmanager.service not installed"
fi
# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can