Add NVIDIA stress loader selection and DCGM 4 support

This commit is contained in:
Mikhail Chusavitin
2026-03-31 11:15:15 +03:00
parent 20f834aa96
commit 6dee8f3509
31 changed files with 789 additions and 111 deletions

View File

@@ -114,4 +114,19 @@ fi
ldconfig 2>/dev/null || true
log "ldconfig refreshed"
# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
# "group is empty" even when GPUs and modules are present.
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
if command -v nv-hostengine >/dev/null 2>&1; then
if pgrep -x nv-hostengine >/dev/null 2>&1; then
log "nv-hostengine already running — skipping"
else
nv-hostengine
log "nv-hostengine started"
fi
else
log "WARN: nv-hostengine not found — dcgmi diagnostics will not work"
fi
log "done"