diff --git a/audit/internal/app/blackbox.go b/audit/internal/app/blackbox.go index 80f65a9..3bd82be 100644 --- a/audit/internal/app/blackbox.go +++ b/audit/internal/app/blackbox.go @@ -365,7 +365,6 @@ func (w *blackboxWorker) currentFlushPeriod() time.Duration { func (w *blackboxWorker) finishCycle(duration time.Duration, err error) { w.mu.Lock() - defer w.mu.Unlock() w.lastDuration = duration if err != nil { w.status = "degraded" @@ -383,6 +382,10 @@ func (w *blackboxWorker) finishCycle(duration time.Duration, err error) { } w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles) } + w.mu.Unlock() + // persistState must be called without w.mu held: it acquires rt.mu then + // each worker.mu inside persistStateLocked, so holding w.mu here would + // cause a deadlock (w.mu → rt.mu → w.mu). w.runtime.persistState() } diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 35a8594..21387f6 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -182,9 +182,16 @@ func (s *System) DetectGPUVendor() string { return "amd" } if raw, err := exec.Command("lspci", "-nn").Output(); err == nil { - text := strings.ToLower(string(raw)) - if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") { - return "amd" + // Only match AMD GPU device classes [0300]=VGA, [0302]=3D controller, [0380]=Display. + // AMD CPUs also appear in lspci as "Advanced Micro Devices" (Root Complex, IOMMU, etc.) + // so matching vendor alone causes false positives on AMD CPU servers without GPUs. + for _, line := range strings.Split(strings.ToLower(string(raw)), "\n") { + if !strings.Contains(line, "advanced micro devices") && !strings.Contains(line, "amd/ati") { + continue + } + if strings.Contains(line, "[0300]") || strings.Contains(line, "[0302]") || strings.Contains(line, "[0380]") { + return "amd" + } } } return "" diff --git a/iso/builder/build.sh b/iso/builder/build.sh index 95e7e7c..04c7998 100755 --- a/iso/builder/build.sh +++ b/iso/builder/build.sh @@ -1419,6 +1419,13 @@ rm -rf \ if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load" rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service" + rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" + rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" + rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" + rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-recover" + rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-dcgmproftester-staggered" + rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-check-nvswitch" + rm -rf "${OVERLAY_STAGE_DIR}/etc/systemd/system/nvidia-fabricmanager.service.d" fi # --- inject authorized_keys for SSH access --- diff --git a/iso/overlay/usr/local/bin/bee-selfheal b/iso/overlay/usr/local/bin/bee-selfheal index a2b1325..1a5c11d 100644 --- a/iso/overlay/usr/local/bin/bee-selfheal +++ b/iso/overlay/usr/local/bin/bee-selfheal @@ -67,7 +67,8 @@ if ! mkdir "${LOCK_DIR}" 2>/dev/null; then fi trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT -if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then +GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo "") +if [ "$GPU_VENDOR" = "nvidia" ] && have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then log_event "NVIDIA GPU detected but /dev/nvidia0 is missing" restart_service bee-nvidia.service || true fi