Stabilize DCGM GPU discovery

2026-04-03 09:50:33 +03:00
parent 7f6386dccc
commit 7a843be6b0
3 changed files with 76 additions and 6 deletions
@@ -128,13 +128,32 @@ ldconfig 2>/dev/null || true
 log "ldconfig refreshed"

 # Start DCGM host engine so dcgmi can discover GPUs.
-# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
-# "group is empty" even when GPUs and modules are present.
-# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
+# nv-hostengine must be started only after the NVIDIA modules and device nodes are
+# ready. If it was started too early (for example, via systemd before bee-nvidia-load),
+# it can keep a stale, empty inventory, and dcgmi diag later reports no testable entities.
 if command -v nv-hostengine >/dev/null 2>&1; then
    if pgrep -x nv-hostengine >/dev/null 2>&1; then
-        log "nv-hostengine already running — skipping"
-    else
+        if command -v pkill >/dev/null 2>&1; then
+            pkill -x nv-hostengine >/dev/null 2>&1 || true
+            tries=0
+            while pgrep -x nv-hostengine >/dev/null 2>&1; do
+                tries=$((tries + 1))
+                if [ "${tries}" -ge 10 ]; then
+                    log "WARN: nv-hostengine is still running after restart request"
+                    break
+                fi
+                sleep 1
+            done
+            if pgrep -x nv-hostengine >/dev/null 2>&1; then
+                log "WARN: keeping existing nv-hostengine process"
+            else
+                log "nv-hostengine restarted"
+            fi
+        else
+            log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
+        fi
+    fi
+    if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
        nv-hostengine
        log "nv-hostengine started"
    fi