Fix AMD GPU false detection, blackbox deadlock, and NOGPU build bloat

- sat.go: DetectGPUVendor lspci fallback now checks GPU device classes ([0300]/[0302]/[0380]) per line instead of scanning the whole output for the vendor name; AMD EPYC servers have dozens of AMD-branded PCIe entries (Root Complex, IOMMU, Host Bridge) that were triggering the old check
- blackbox.go: fix deadlock in finishCycle — it held w.mu while calling persistState(), which acquires rt.mu then re-acquires w.mu inside persistStateLocked(); now w.mu is released before persistState()
- build.sh: remove NVIDIA-specific overlay files (bee-gpu-burn, bee-john-gpu-stress, bee-nccl-gpu-stress, bee-nvidia-recover, bee-dcgmproftester-staggered, bee-check-nvswitch, nvidia-fabricmanager.service.d/) for non-nvidia build variants
- bee-selfheal: gate NVIDIA recovery on BEE_GPU_VENDOR=nvidia so the script does not attempt to restart bee-nvidia.service on NOGPU builds

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-19 09:37:26 +03:00
parent cf29131116
commit 7f27b9aa38
4 changed files with 23 additions and 5 deletions
--- a/audit/internal/app/blackbox.go
+++ b/audit/internal/app/blackbox.go
@@ -365,7 +365,6 @@ func (w *blackboxWorker) currentFlushPeriod() time.Duration {

 func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
 	w.mu.Lock()
-	defer w.mu.Unlock()
 	w.lastDuration = duration
 	if err != nil {
 		w.status = "degraded"
@@ -383,6 +382,10 @@ func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
 		}
 		w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
 	}
+	w.mu.Unlock()
+	// persistState must be called without w.mu held: it acquires rt.mu then
+	// each worker.mu inside persistStateLocked, so holding w.mu here would
+	// cause a deadlock (w.mu → rt.mu → w.mu).
 	w.runtime.persistState()
 }