Add fabric manager boot and support diagnostics
This commit is contained in:
@@ -258,6 +258,22 @@ else
|
||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||
fi
|
||||
|
||||
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
# systems CUDA/DCGM can report "system not yet initialized" until fabric
# training completes under nvidia-fabricmanager.
#
# Failures in this section are reported through log as warnings.
if ! command -v systemctl >/dev/null 2>&1; then
  # Without systemctl we cannot tell whether the unit exists, so do not claim
  # that it is "not installed".
  log "WARN: systemctl not found — cannot manage nvidia-fabricmanager.service"
elif systemctl list-unit-files --no-legend nvidia-fabricmanager.service 2>/dev/null \
  | grep -q '^nvidia-fabricmanager\.service'; then
  # Only this one unit is queried (instead of listing every unit file): the
  # output is at most one line, so grep -q cannot exit while systemctl is still
  # writing, which under `set -o pipefail` could make an installed unit look
  # missing.
  if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
    # restart also brings up a unit that was not running yet.
    log "nvidia-fabricmanager restarted"
  elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
    # Second attempt with a plain start after the restart request failed.
    log "nvidia-fabricmanager started"
  else
    log "WARN: failed to start nvidia-fabricmanager.service"
    # Dump the unit status, prefixed, for support diagnostics. systemctl status
    # exits non-zero for a unit that is not active, hence the `|| true`.
    systemctl status nvidia-fabricmanager.service --no-pager 2>&1 \
      | sed 's/^/ fabricmanager: /' || true
  fi
else
  log "WARN: nvidia-fabricmanager.service not installed"
fi
|
||||
|
||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||
|
||||
Reference in New Issue
Block a user