Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 |
@@ -22,6 +22,8 @@ var supportBundleServices = []string{
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
"nvidia-dcgm.service",
|
||||
"nvidia-fabricmanager.service",
|
||||
}
|
||||
|
||||
var supportBundleCommands = []struct {
|
||||
@@ -48,6 +50,43 @@ else
|
||||
fi
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
nvidia-smi topo -m 2>&1 || true
|
||||
else
|
||||
echo "nvidia-smi not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v systemctl >/dev/null 2>&1; then
|
||||
echo "systemctl not found"
|
||||
exit 0
|
||||
fi
|
||||
echo "=== unit files ==="
|
||||
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== active units ==="
|
||||
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== failed units ==="
|
||||
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||
`}},
|
||||
// system/fabric-manager-paths.txt lists every Fabric Manager candidate binary
// that exists under /usr/bin. The "none found" line is driven by a flag set
// inside the loop: running a single `ls` over all candidates exits non-zero
// as soon as any one path is missing, which reported "no fabric manager
// binaries found" even after some binaries had been listed.
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
found=0
for candidate in \
  /usr/bin/nvidia-fabricmanager \
  /usr/bin/nv-fabricmanager \
  /usr/bin/nvidia-fabricmanagerd \
  /usr/bin/nvlsm; do
  if [ -e "$candidate" ]; then
    found=1
    echo "=== $candidate ==="
    ls -l "$candidate" 2>&1 || true
    echo
  fi
done
if [ "$found" -eq 0 ]; then
  echo "no fabric manager binaries found"
fi
`}},
|
||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
@@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||
}
|
||||
|
||||
// supportBundleGlob is a shell-style glob: ten characters shaped like a
// YYYY-MM-DD date, a space, a parenthesised tag starting with "BEE-SP", and a
// ".tar.gz" suffix. NOTE(review): presumably matches support bundle archive
// names — confirm against the code that uses it.
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||
|
||||
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
|
||||
"bee-audit",
|
||||
"bee-web",
|
||||
"bee-sshsetup",
|
||||
"nvidia-dcgm",
|
||||
"nvidia-fabricmanager",
|
||||
}
|
||||
|
||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
DEBIAN_VERSION=12
|
||||
DEBIAN_KERNEL_ABI=auto
|
||||
NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
|
||||
125
iso/builder/build-dcgm.sh
Executable file
125
iso/builder/build-dcgm.sh
Executable file
@@ -0,0 +1,125 @@
|
||||
#!/bin/sh
|
||||
# build-dcgm.sh — pre-download DCGM and nvidia-fabricmanager .deb packages
|
||||
# from the NVIDIA CUDA apt repository (Debian 12, x86_64) on the build host,
|
||||
# then place them into config/packages.chroot/ so live-build creates a local
|
||||
# apt repository inside the chroot. This avoids requiring the NVIDIA CUDA
|
||||
# HTTPS source to be reachable from within the live-build container chroot.
|
||||
|
||||
set -e

DCGM_VERSION="$1"
FABRICMANAGER_VERSION="$2"
LB_DIR="$3"

# usage prints the invocation synopsis to stderr and aborts with status 1.
usage() {
    echo "usage: $0 <dcgm-version> <fabricmanager-version> <lb-work-dir>" >&2
    exit 1
}

# All three positional arguments are mandatory; an empty value is rejected
# the same way as a missing one.
[ -n "$DCGM_VERSION" ] || usage
[ -n "$FABRICMANAGER_VERSION" ] || usage
[ -n "$LB_DIR" ] || usage
|
||||
|
||||
# NVIDIA CUDA apt repository for Debian 12 / x86_64.
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"

# Download cache root: BEE_CACHE_DIR when it is set and non-empty, otherwise a
# "cache" directory next to the live-build work dir.
if [ -n "${BEE_CACHE_DIR:-}" ]; then
    CACHE_ROOT="${BEE_CACHE_DIR}"
else
    CACHE_ROOT="$(dirname "$LB_DIR")/cache"
fi
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/dcgm-downloads"
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
PKG_CHROOT_DIR="${LB_DIR}/config/packages.chroot"

# Whitespace-separated "<package>=<version>" pins. The DCGM packages carry
# epoch 1 in their version.
PACKAGES="
datacenter-gpu-manager-4-core=1:${DCGM_VERSION}
datacenter-gpu-manager-4-cuda13=1:${DCGM_VERSION}
datacenter-gpu-manager-4-proprietary=1:${DCGM_VERSION}
datacenter-gpu-manager-4-proprietary-cuda13=1:${DCGM_VERSION}
nvidia-fabricmanager=${FABRICMANAGER_VERSION}
"
|
||||
|
||||
echo "=== DCGM ${DCGM_VERSION} / nvidia-fabricmanager ${FABRICMANAGER_VERSION} ==="
|
||||
|
||||
# Skip the download when every pinned package is already staged in
# packages.chroot. The match is version-specific so that a .deb left over from
# an earlier version pin does not satisfy the check. Debian file names omit
# the epoch ("1:" prefix), so it is stripped before matching.
# NOTE(review): assumes repo file names follow <name>_<version>_<arch>.deb; if
# one does not, the only effect is that the checksum-guarded download path
# below runs again.
all_cached=1
for entry in $PACKAGES; do
    pkg="${entry%%=*}"
    ver="${entry#*=}"
    if ! ls "${PKG_CHROOT_DIR}/${pkg}_${ver#*:}_"*.deb >/dev/null 2>&1; then
        all_cached=0
        break
    fi
done
if [ "$all_cached" = "1" ]; then
    echo "=== DCGM packages already in packages.chroot, skipping download ==="
    ls "${PKG_CHROOT_DIR}/datacenter-gpu-manager-4"*.deb "${PKG_CHROOT_DIR}/nvidia-fabricmanager_"*.deb 2>/dev/null || true
    exit 0
fi
|
||||
|
||||
# Make sure the download cache and the live-build package drop directory exist.
for dir in "${DOWNLOAD_CACHE_DIR}" "${PKG_CHROOT_DIR}"; do
    mkdir -p "$dir"
done

# Refresh the repository index; it is overwritten on every run.
echo "=== downloading Packages.gz ==="
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
|
||||
|
||||
# lookup_pkg <package> [version]
# Scan the downloaded Packages.gz index and print "<Filename> <SHA256>" for the
# first stanza whose Package field equals <package> and, when <version> is
# non-empty, whose Version field equals <version>. Prints nothing when no
# stanza matches. Carriage returns are stripped from field values.
lookup_pkg() {
    pkg="$1"
    ver="$2"
    gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
        function strip_cr(text) { gsub(/\r/, "", text); return text }
        function stanza_matches() { return cur_pkg == pkg && (ver == "" || cur_ver == ver) }
        function emit() { print cur_file " " cur_sha }

        /^Package: /  { cur_pkg  = strip_cr($2) }
        /^Version: /  { cur_ver  = strip_cr($2) }
        /^Filename: / { cur_file = strip_cr($2) }
        /^SHA256: /   { cur_sha  = strip_cr($2) }

        # A blank line closes the current stanza.
        /^$/ {
            if (stanza_matches()) {
                emit()
                printed = 1
                exit
            }
            cur_pkg = ""; cur_ver = ""; cur_file = ""; cur_sha = ""
        }

        # The last stanza may not be followed by a blank line.
        END {
            if (!printed && stanza_matches()) {
                emit()
            }
        }'
}
|
||||
|
||||
# download_deb <package> <version>
# Resolve <package>/<version> through lookup_pkg, fetch the .deb into
# DOWNLOAD_CACHE_DIR unless a copy with the advertised SHA256 is already
# there, verify the checksum, then copy the file into PKG_CHROOT_DIR.
# Exits the script with status 1 on any lookup, download or checksum failure;
# every diagnostic goes to stderr.
download_deb() {
    pkg="$1"
    ver="$2"

    meta="$(lookup_pkg "$pkg" "$ver")"
    [ -n "$meta" ] || { echo "ERROR: package not found in repo: ${pkg} ${ver}" >&2; exit 1; }

    repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
    repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
    [ -n "$repo_file" ] || { echo "ERROR: filename missing for ${pkg}" >&2; exit 1; }
    [ -n "$repo_sha" ] || { echo "ERROR: sha256 missing for ${pkg}" >&2; exit 1; }

    deb_name="$(basename "$repo_file")"
    cached="${DOWNLOAD_CACHE_DIR}/${deb_name}"

    # Drop a cached copy whose checksum no longer matches the index.
    if [ -f "$cached" ]; then
        actual_sha="$(sha256sum "$cached" | awk '{print $1}')"
        if [ "$actual_sha" = "$repo_sha" ]; then
            echo "=== cached: ${deb_name} ==="
        else
            echo "=== removing stale: ${deb_name} (sha256 mismatch) ==="
            rm -f "$cached"
        fi
    fi

    if [ ! -f "$cached" ]; then
        # The index Filename is a path relative to the repository base; strip
        # a leading "./" and keep any directory component.
        if ! wget --show-progress -O "$cached" "${REPO_BASE}/${repo_file#./}"; then
            # wget -O creates the output file up front; do not leave a
            # truncated file behind in the cache.
            rm -f "$cached"
            echo "ERROR: download failed for ${deb_name}" >&2
            exit 1
        fi
    fi

    actual_sha="$(sha256sum "$cached" | awk '{print $1}')"
    if [ "$actual_sha" != "$repo_sha" ]; then
        echo "ERROR: sha256 mismatch for ${deb_name}" >&2
        echo " expected: $repo_sha" >&2
        echo " actual: $actual_sha" >&2
        rm -f "$cached"
        exit 1
    fi
    echo "sha256 OK: ${deb_name}"

    cp -f "$cached" "${PKG_CHROOT_DIR}/${deb_name}"
}
|
||||
|
||||
# Fetch every pinned package; each entry has the form "<package>=<version>".
for entry in $PACKAGES; do
    download_deb "${entry%%=*}" "${entry#*=}"
done

# Show what ended up staged for live-build (listing is best-effort).
echo "=== DCGM packages ready in ${PKG_CHROOT_DIR} ==="
ls "${PKG_CHROOT_DIR}/datacenter-gpu-manager-4"*.deb "${PKG_CHROOT_DIR}/nvidia-fabricmanager_"*.deb 2>/dev/null || true
|
||||
@@ -1262,6 +1262,7 @@ fi
|
||||
# --- substitute version placeholders in package list and archive ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
sed -i \
|
||||
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
@@ -1295,6 +1296,18 @@ if [ -f "${LB_INCLUDES}/root/.ssh/authorized_keys" ]; then
|
||||
chmod 600 "${LB_INCLUDES}/root/.ssh/authorized_keys"
|
||||
fi
|
||||
|
||||
# --- pre-download NVIDIA apt packages into config/packages.chroot ---
|
||||
# live-build creates a local apt repo from config/packages.chroot/*.deb so the
|
||||
# chroot can install them without reaching the NVIDIA CUDA HTTPS source.
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
run_step "download DCGM ${DCGM_VERSION} / fabricmanager ${NVIDIA_FABRICMANAGER_VERSION} packages" \
|
||||
"25-dcgm" \
|
||||
sh "${BUILDER_DIR}/build-dcgm.sh" \
|
||||
"${DCGM_VERSION}" \
|
||||
"${NVIDIA_FABRICMANAGER_VERSION}" \
|
||||
"${LB_DIR}"
|
||||
fi
|
||||
|
||||
# --- build ISO using live-build ---
|
||||
echo ""
|
||||
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||
|
||||
@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||
# Enable GPU-vendor specific services
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||
systemctl enable bee-nvidia.service
|
||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||
# explicitly.
|
||||
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||
|
||||
@@ -258,6 +258,22 @@ else
|
||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||
fi
|
||||
|
||||
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
|
||||
# systems CUDA/DCGM can report "system not yet initialized" until fabric
|
||||
# training completes under nvidia-fabricmanager.
|
||||
# Warn when systemctl is missing or the unit file is not listed; otherwise try
# a restart first, then a plain start, and dump the unit status if both fail.
if ! command -v systemctl >/dev/null 2>&1 || ! systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
    log "WARN: nvidia-fabricmanager.service not installed"
elif systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
    log "nvidia-fabricmanager restarted"
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
    log "nvidia-fabricmanager started"
else
    log "WARN: failed to start nvidia-fabricmanager.service"
    systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
fi
|
||||
|
||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||
|
||||
Reference in New Issue
Block a user