Fix AMD GPU false detection, blackbox deadlock, and NOGPU build bloat
- sat.go: the lspci fallback in DetectGPUVendor now checks for GPU device classes ([0300]/[0302]/[0380]) on each line instead of scanning the whole output for the vendor name; AMD EPYC servers have dozens of AMD-branded PCIe entries (Root Complex, IOMMU, Host Bridge) that were triggering the old check.
- blackbox.go: fix a deadlock in finishCycle — it held w.mu while calling persistState(), which acquires rt.mu and then re-acquires w.mu inside persistStateLocked(); w.mu is now released before persistState() is called.
- build.sh: remove NVIDIA-specific overlay files (bee-gpu-burn, bee-john-gpu-stress, bee-nccl-gpu-stress, bee-nvidia-recover, bee-dcgmproftester-staggered, bee-check-nvswitch, nvidia-fabricmanager.service.d/) for non-nvidia build variants.
- bee-selfheal: gate NVIDIA recovery on BEE_GPU_VENDOR=nvidia so the script does not attempt to restart bee-nvidia.service on NOGPU builds.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -365,7 +365,6 @@ func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
|||||||
|
|
||||||
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||||
w.mu.Lock()
|
w.mu.Lock()
|
||||||
defer w.mu.Unlock()
|
|
||||||
w.lastDuration = duration
|
w.lastDuration = duration
|
||||||
if err != nil {
|
if err != nil {
|
||||||
w.status = "degraded"
|
w.status = "degraded"
|
||||||
@@ -383,6 +382,10 @@ func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
|||||||
}
|
}
|
||||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||||
}
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
// persistState must be called without w.mu held: it acquires rt.mu then
|
||||||
|
// each worker.mu inside persistStateLocked, so holding w.mu here would
|
||||||
|
// cause a deadlock (w.mu → rt.mu → w.mu).
|
||||||
w.runtime.persistState()
|
w.runtime.persistState()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -182,11 +182,18 @@ func (s *System) DetectGPUVendor() string {
|
|||||||
return "amd"
|
return "amd"
|
||||||
}
|
}
|
||||||
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||||
text := strings.ToLower(string(raw))
|
// Only match AMD GPU device classes [0300]=VGA, [0302]=3D controller, [0380]=Display.
|
||||||
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
// AMD CPUs also appear in lspci as "Advanced Micro Devices" (Root Complex, IOMMU, etc.)
|
||||||
|
// so matching vendor alone causes false positives on AMD CPU servers without GPUs.
|
||||||
|
for _, line := range strings.Split(strings.ToLower(string(raw)), "\n") {
|
||||||
|
if !strings.Contains(line, "advanced micro devices") && !strings.Contains(line, "amd/ati") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.Contains(line, "[0300]") || strings.Contains(line, "[0302]") || strings.Contains(line, "[0380]") {
|
||||||
return "amd"
|
return "amd"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1419,6 +1419,13 @@ rm -rf \
|
|||||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
||||||
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn"
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress"
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress"
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-recover"
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-dcgmproftester-staggered"
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-check-nvswitch"
|
||||||
|
rm -rf "${OVERLAY_STAGE_DIR}/etc/systemd/system/nvidia-fabricmanager.service.d"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- inject authorized_keys for SSH access ---
|
# --- inject authorized_keys for SSH access ---
|
||||||
|
|||||||
@@ -67,7 +67,8 @@ if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
|
|||||||
fi
|
fi
|
||||||
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
|
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
|
||||||
|
|
||||||
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo "")
|
||||||
|
if [ "$GPU_VENDOR" = "nvidia" ] && have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
||||||
log_event "NVIDIA GPU detected but /dev/nvidia0 is missing"
|
log_event "NVIDIA GPU detected but /dev/nvidia0 is missing"
|
||||||
restart_service bee-nvidia.service || true
|
restart_service bee-nvidia.service || true
|
||||||
fi
|
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user