feat(dcgm): add NVIDIA DCGM diagnostics, fix KVM console
- Add 9002-nvidia-dcgm.hook.chroot: installs datacenter-gpu-manager from the NVIDIA apt repo during live-build
- Enable nvidia-dcgm.service in the chroot setup hook
- Replace bee-gpu-stress with dcgmi diag (levels 1-4) in NVIDIA SAT
- TUI: replace GPU checkbox + duration UI with DCGM level selection
- Remove console=tty2 from boot params: KVM/VGA now shows tty1 where bee-tui runs, fixing unresponsive console

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,6 @@ lb config noauto \
|
||||
--memtest none \
|
||||
--iso-volume "EASY-BEE" \
|
||||
--iso-application "EASY-BEE" \
|
||||
--bootappend-live "boot=live components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live components console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--apt-recommends false \
|
||||
"${@}"
|
||||
|
||||
@@ -21,6 +21,7 @@ ensure_bee_console_user() {
|
||||
ensure_bee_console_user
|
||||
|
||||
# Enable bee services
|
||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||
systemctl enable bee-network.service
|
||||
systemctl enable bee-nvidia.service
|
||||
systemctl enable bee-preflight.service
|
||||
|
||||
66
iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot
Executable file
66
iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot
Executable file
@@ -0,0 +1,66 @@
|
||||
#!/bin/sh
# 9002-nvidia-dcgm.hook.chroot — install NVIDIA DCGM inside the live-build chroot.
# DCGM (Data Center GPU Manager) provides dcgmi diag for acceptance testing.
# Adds NVIDIA's CUDA apt repository (debian12/x86_64) and installs datacenter-gpu-manager.

# Abort on any unhandled command failure. Commands run as part of an `if`
# condition (or followed by `||`) are exempt from this rule.
set -e

# Where the de-armored NVIDIA signing key is stored.
NVIDIA_KEYRING="/usr/share/keyrings/nvidia-cuda.gpg"
# Temporary apt source entry for the NVIDIA repository.
NVIDIA_LIST="/etc/apt/sources.list.d/nvidia-cuda.list"
# Base URL of NVIDIA's CUDA apt repository (debian12/x86_64).
NVIDIA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/"
# ASCII-armored signing key published inside that repository.
NVIDIA_KEY_URL="${NVIDIA_REPO}3bf863cc.pub"
# 0 until this script has run `apt-get update`; the helper functions flip it to 1.
APT_UPDATED=0
|
||||
|
||||
# Make sure the directories that will hold the keyring (NVIDIA_KEYRING) and the
# apt source entry (NVIDIA_LIST) exist before anything is written into them.
mkdir -p /usr/share/keyrings /etc/apt/sources.list.d
|
||||
|
||||
# ensure_tool TOOL PKG — make sure command TOOL exists, installing PKG if it does not.
#   $1  command name, looked up with `command -v`
#   $2  apt package to install when the command is missing
# Globals: APT_UPDATED (read/written) — set to 1 only after `apt-get update`
#          has actually succeeded, so a failed refresh is retried by the next call.
#          `tool` and `pkg` are plain (global) variables: POSIX sh has no `local`.
# Returns: 0 if the tool is already present or the install succeeded,
#          otherwise the exit status of `apt-get install`.
ensure_tool() {
    tool="$1"
    pkg="$2"

    if command -v "${tool}" >/dev/null 2>&1; then
        return 0
    fi

    if [ "${APT_UPDATED}" -eq 0 ]; then
        # Check the result explicitly: this function is called from an `if !`
        # condition where `set -e` is inactive, so an unchecked failure would
        # be silently recorded as "already updated".
        if apt-get update -qq; then
            APT_UPDATED=1
        fi
    fi

    # Still attempt the install after a failed refresh (existing package lists
    # may be sufficient); its exit status is what the caller sees.
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}"
}
|
||||
|
||||
# ensure_cert_bundle — make sure a non-empty CA bundle exists at
# /etc/ssl/certs/ca-certificates.crt, installing ca-certificates if it does not.
# Globals: APT_UPDATED (read/written) — set to 1 only after `apt-get update`
#          has actually succeeded, so a failed refresh is retried by the next call.
# Returns: 0 if the bundle is already present or the install succeeded,
#          otherwise the exit status of `apt-get install`.
ensure_cert_bundle() {
    if [ -s /etc/ssl/certs/ca-certificates.crt ]; then
        return 0
    fi

    if [ "${APT_UPDATED}" -eq 0 ]; then
        # Check the result explicitly: this function is called from an `if !`
        # condition where `set -e` is inactive, so an unchecked failure would
        # be silently recorded as "already updated".
        if apt-get update -qq; then
            APT_UPDATED=1
        fi
    fi

    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates
}
|
||||
|
||||
# --- Main flow: best effort -------------------------------------------------
# Every failure below only prints a warning and lets the hook finish with
# status 0, so a missing DCGM never breaks the ISO build.

# The download needs CA certificates and wget; importing the key needs gpg.
if ! ensure_cert_bundle || ! ensure_tool wget wget || ! ensure_tool gpg gpg; then
    echo "WARN: prerequisites missing — skipping DCGM install"
    exit 0
fi

# Download and import NVIDIA GPG key.
# The key is fetched into a temp file first: in a `wget | gpg` pipeline plain
# sh reports only gpg's exit status, so a failed download would go unnoticed.
key_tmp="$(mktemp)" || key_tmp=""
if [ -z "${key_tmp}" ] \
    || ! wget -qO "${key_tmp}" "${NVIDIA_KEY_URL}" \
    || [ ! -s "${key_tmp}" ] \
    || ! gpg --dearmor --yes --output "${NVIDIA_KEYRING}" < "${key_tmp}"; then
    [ -z "${key_tmp}" ] || rm -f "${key_tmp}"
    # Do not leave an empty or partial keyring behind in the image.
    rm -f "${NVIDIA_KEYRING}"
    echo "WARN: failed to fetch NVIDIA GPG key — skipping DCGM install"
    exit 0
fi
rm -f "${key_tmp}"

# Register the NVIDIA repository, pinned to the keyring imported above.
cat > "${NVIDIA_LIST}" <<EOF
deb [signed-by=${NVIDIA_KEYRING}] ${NVIDIA_REPO} /
EOF

# The index refresh is part of the condition so that a failure neither aborts
# the hook under `set -e` nor skips the cleanup at the end.
if ! apt-get update -qq; then
    echo "WARN: apt-get update failed for NVIDIA repo — DCGM unavailable"
elif DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends datacenter-gpu-manager; then
    echo "=== DCGM: datacenter-gpu-manager installed ==="
    # Informational only; a missing or failing dcgmi must not fail the hook.
    dcgmi --version 2>/dev/null || true
else
    echo "WARN: datacenter-gpu-manager install failed — DCGM unavailable"
fi

# Remove the temporary NVIDIA source entry and the downloaded package cache
# to keep ISO size down. The keyring is left in place.
rm -f "${NVIDIA_LIST}"
apt-get clean
|
||||
Reference in New Issue
Block a user