Fix NVIDIA persistence mode and add benchmark results table

This commit is contained in:
Mikhail Chusavitin
2026-04-06 10:47:07 +03:00
parent 6e94216f3b
commit fc5c100a29
6 changed files with 296 additions and 62 deletions

View File

@@ -209,6 +209,18 @@ fi
ldconfig 2>/dev/null || true
log "ldconfig refreshed"
# Keep persistence mode enabled across the session so dcgmi / stress tools do
# not fail with deployment warnings on otherwise healthy GPUs.
if command -v nvidia-smi >/dev/null 2>&1; then
if nvidia-smi -pm 1 >/dev/null 2>&1; then
log "enabled NVIDIA persistence mode"
else
log "WARN: failed to enable NVIDIA persistence mode"
fi
else
log "WARN: nvidia-smi not found — cannot enable persistence mode"
fi
# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can