From ace1a9dba6af8524a623b267d3aec00268b07c7f Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Mon, 30 Mar 2026 22:24:37 +0300 Subject: [PATCH] feat(iso): split into nvidia and amd variants, fix KVM graphics and PATH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - build.sh: add --variant nvidia|amd; separate work dirs per variant (live-build-work-nvidia / live-build-work-amd); GPU-specific steps (modules, NCCL, cuBLAS, nccl-tests) run only for nvidia; deb package cache synced back to shared location after each lb build so second variant reuses downloaded packages; ISO output named easy-bee-{variant}-v{ver}-amd64.iso - build-in-container.sh: add --variant nvidia|amd|all (default: all); runs build.sh twice in one container for 'all'; --clean-build wipes both variant work dirs - package-lists: remove GPU packages from bee.list.chroot; add bee-nvidia.list.chroot (DCGM) and bee-amd.list.chroot (ROCm) - 9000-bee-setup hook: read /etc/bee-gpu-vendor; enable bee-nvidia.service and DCGM only for nvidia; set up ROCm symlinks only for amd - auto/config: --iso-volume uses BEE_GPU_VENDOR_UPPER env var - grub.cfg: add nomodeset to EASY-BEE and EASY-BEE (load to RAM) entries — fixes X/lightdm on BMC KVM (ASPEED AST chip requires nomodeset for fbdev to work; NVIDIA H100 compute does not need KMS) - bee.sh / smoketest.sh: add /usr/sbin to PATH so dmidecode, smartctl, nvme are found - 9100-memtest hook: add diagnostic listing of chroot/boot/memtest* files Co-Authored-By: Claude Sonnet 4.6 --- iso/builder/auto/config | 4 +- iso/builder/build-in-container.sh | 102 ++++-- iso/builder/build.sh | 302 +++++++++++------- .../config/bootloaders/grub-pc/grub.cfg | 4 +- .../hooks/normal/9000-bee-setup.hook.chroot | 35 +- .../hooks/normal/9100-memtest.hook.binary | 3 + .../config/package-lists/bee-amd.list.chroot | 9 + .../package-lists/bee-nvidia.list.chroot | 2 + .../config/package-lists/bee.list.chroot | 13 - iso/builder/smoketest.sh | 2 +- iso/overlay/etc/profile.d/bee.sh | 2 +- 11 files changed, 305 insertions(+), 173 deletions(-) create mode 100644 iso/builder/config/package-lists/bee-amd.list.chroot create mode 100644 iso/builder/config/package-lists/bee-nvidia.list.chroot diff --git a/iso/builder/auto/config b/iso/builder/auto/config index 5569221..85fdcbf 100755 --- a/iso/builder/auto/config +++ b/iso/builder/auto/config @@ -30,8 +30,8 @@ lb config noauto \ --linux-flavours "amd64" \ --linux-packages "${LB_LINUX_PACKAGES}" \ --memtest none \ - --iso-volume "EASY-BEE" \ - --iso-application "EASY-BEE" \ + --iso-volume "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ + --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --apt-recommends false \ --chroot-squashfs-compression-type zstd \ diff --git a/iso/builder/build-in-container.sh b/iso/builder/build-in-container.sh index 176fa64..9f33778 100755 --- a/iso/builder/build-in-container.sh +++ b/iso/builder/build-in-container.sh @@ -12,6 +12,7 @@ CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}" AUTH_KEYS="" REBUILD_IMAGE=0 CLEAN_CACHE=0 +VARIANT="all" . "${BUILDER_DIR}/VERSIONS" @@ -34,14 +35,23 @@ while [ $# -gt 0 ]; do REBUILD_IMAGE=1 shift ;; + --variant) + VARIANT="$2" + shift 2 + ;; *) echo "unknown arg: $1" >&2 - echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys]" >&2 + echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2 exit 1 ;; esac done +case "$VARIANT" in + nvidia|amd|all) ;; + *) echo "unknown variant: $VARIANT (expected nvidia, amd, or all)" >&2; exit 1 ;; +esac + if [ "$CLEAN_CACHE" = "1" ]; then echo "=== cleaning build cache: ${CACHE_DIR} ===" rm -rf "${CACHE_DIR:?}/go-build" \ @@ -49,8 +59,9 @@ if [ "$CLEAN_CACHE" = "1" ]; then "${CACHE_DIR:?}/tmp" \ "${CACHE_DIR:?}/bee" \ "${CACHE_DIR:?}/lb-packages" - echo "=== cleaning live-build work dir: ${REPO_ROOT}/dist/live-build-work ===" - rm -rf "${REPO_ROOT}/dist/live-build-work" + echo "=== cleaning live-build work dirs ===" + rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia" + rm -rf "${REPO_ROOT}/dist/live-build-work-amd" echo "=== caches cleared, proceeding with build ===" fi @@ -108,34 +119,71 @@ else echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ===" fi -set -- \ - run --rm --privileged \ - --platform "${BUILDER_PLATFORM}" \ - -v "${REPO_ROOT}:/work" \ - -v "${CACHE_DIR}:/cache" \ - -e BEE_CONTAINER_BUILD=1 \ - -e GOCACHE=/cache/go-build \ - -e GOMODCACHE=/cache/go-mod \ - -e TMPDIR=/cache/tmp \ - -e BEE_CACHE_DIR=/cache/bee \ - -w /work \ - "${IMAGE_REF}" \ - sh /work/iso/builder/build.sh - -if [ -n "$AUTH_KEYS" ]; then - set -- run --rm --privileged \ - --platform "${BUILDER_PLATFORM}" \ - -v "${REPO_ROOT}:/work" \ - -v "${CACHE_DIR}:/cache" \ - -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \ +# Build base docker run args (without --authorized-keys) +build_run_args() { + _variant="$1" + _auth_arg="" + if [ -n "$AUTH_KEYS" ]; then + _auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}" + fi + echo "run --rm --privileged \ + --platform ${BUILDER_PLATFORM} \ + -v ${REPO_ROOT}:/work \ + -v ${CACHE_DIR}:/cache \ + ${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \ -e BEE_CONTAINER_BUILD=1 \ -e GOCACHE=/cache/go-build \ -e GOMODCACHE=/cache/go-mod \ -e TMPDIR=/cache/tmp \ -e BEE_CACHE_DIR=/cache/bee \ -w /work \ - "${IMAGE_REF}" \ - sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}" -fi + ${IMAGE_REF} \ + sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}" +} -"$CONTAINER_TOOL" "$@" +run_variant() { + _v="$1" + echo "=== building variant: ${_v} ===" + if [ -n "$AUTH_KEYS" ]; then + "$CONTAINER_TOOL" run --rm --privileged \ + --platform "${BUILDER_PLATFORM}" \ + -v "${REPO_ROOT}:/work" \ + -v "${CACHE_DIR}:/cache" \ + -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \ + -e BEE_CONTAINER_BUILD=1 \ + -e GOCACHE=/cache/go-build \ + -e GOMODCACHE=/cache/go-mod \ + -e TMPDIR=/cache/tmp \ + -e BEE_CACHE_DIR=/cache/bee \ + -w /work \ + "${IMAGE_REF}" \ + sh /work/iso/builder/build.sh --variant "${_v}" \ + --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}" + else + "$CONTAINER_TOOL" run --rm --privileged \ + --platform "${BUILDER_PLATFORM}" \ + -v "${REPO_ROOT}:/work" \ + -v "${CACHE_DIR}:/cache" \ + -e BEE_CONTAINER_BUILD=1 \ + -e GOCACHE=/cache/go-build \ + -e GOMODCACHE=/cache/go-mod \ + -e TMPDIR=/cache/tmp \ + -e BEE_CACHE_DIR=/cache/bee \ + -w /work \ + "${IMAGE_REF}" \ + sh /work/iso/builder/build.sh --variant "${_v}" + fi +} + +case "$VARIANT" in + nvidia) + run_variant nvidia + ;; + amd) + run_variant amd + ;; + all) + run_variant nvidia + run_variant amd + ;; +esac diff --git a/iso/builder/build.sh b/iso/builder/build.sh index 88a175a..3a7541a 100755 --- a/iso/builder/build.sh +++ b/iso/builder/build.sh @@ -13,19 +13,29 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder" OVERLAY_DIR="${REPO_ROOT}/iso/overlay" DIST_DIR="${REPO_ROOT}/dist" VENDOR_DIR="${REPO_ROOT}/iso/vendor" -BUILD_WORK_DIR="${DIST_DIR}/live-build-work" -OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage" CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}" AUTH_KEYS="" +BEE_GPU_VENDOR="nvidia" # parse args while [ $# -gt 0 ]; do case "$1" in --authorized-keys) AUTH_KEYS="$2"; shift 2 ;; + --variant) BEE_GPU_VENDOR="$2"; shift 2 ;; *) echo "unknown arg: $1"; exit 1 ;; esac done +case "$BEE_GPU_VENDOR" in + nvidia|amd) ;; + *) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia or amd)" >&2; exit 1 ;; +esac + +BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}" +OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}" + +export BEE_GPU_VENDOR + . "${BUILDER_DIR}/VERSIONS" export PATH="$PATH:/usr/local/go/bin" @@ -132,7 +142,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then apt-get install -y "linux-headers-${KVER}" fi -echo "=== bee ISO build ===" +echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ===" echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}" echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}" echo "" @@ -141,8 +151,8 @@ echo "=== syncing git submodules ===" git -C "${REPO_ROOT}" submodule update --init --recursive # --- compile bee binary (static, Linux amd64) --- +# Shared between variants — built once, reused on second pass. BEE_BIN="${DIST_DIR}/bee-linux-amd64" -GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64" NEED_BUILD=1 if [ -f "$BEE_BIN" ]; then NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1) @@ -172,37 +182,41 @@ else echo "=== bee binary up to date, skipping build ===" fi -echo "" -echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ===" -sh "${BUILDER_DIR}/build-cublas.sh" \ - "${CUBLAS_VERSION}" \ - "${CUDA_USERSPACE_VERSION}" \ - "${NCCL_CUDA_VERSION}" \ - "${DIST_DIR}" +# --- NVIDIA-only build steps --- +GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64" +if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then + echo "" + echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ===" + sh "${BUILDER_DIR}/build-cublas.sh" \ + "${CUBLAS_VERSION}" \ + "${CUDA_USERSPACE_VERSION}" \ + "${NCCL_CUDA_VERSION}" \ + "${DIST_DIR}" -CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}" + CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}" -GPU_STRESS_NEED_BUILD=1 -if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then - GPU_STRESS_NEED_BUILD=0 + GPU_STRESS_NEED_BUILD=1 + if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then + GPU_STRESS_NEED_BUILD=0 + fi + + if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then + echo "=== building bee-gpu-stress ===" + gcc -O2 -s -Wall -Wextra \ + -I"${CUBLAS_CACHE}/include" \ + -o "$GPU_STRESS_BIN" \ + "${BUILDER_DIR}/bee-gpu-stress.c" \ + -ldl -lm + echo "binary: $GPU_STRESS_BIN" + else + echo "=== bee-gpu-stress up to date, skipping build ===" + fi fi -if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then - echo "=== building bee-gpu-stress ===" - gcc -O2 -s -Wall -Wextra \ - -I"${CUBLAS_CACHE}/include" \ - -o "$GPU_STRESS_BIN" \ - "${BUILDER_DIR}/bee-gpu-stress.c" \ - -ldl -lm - echo "binary: $GPU_STRESS_BIN" -else - echo "=== bee-gpu-stress up to date, skipping build ===" -fi - -echo "=== preparing staged overlay ===" -# Sync builder config into work dir, preserving lb cache (chroot + packages). -# We do NOT rm -rf BUILD_WORK_DIR so lb can reuse its chroot on repeat builds. +echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ===" mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}" + +# Sync builder config into variant work dir, preserving lb cache. rsync -a --delete \ --exclude='cache/' \ --exclude='chroot/' \ @@ -212,7 +226,10 @@ rsync -a --delete \ --exclude='*.contents' \ --exclude='*.files' \ "${BUILDER_DIR}/" "${BUILD_WORK_DIR}/" -# Also persist package cache to CACHE_ROOT so it survives a manual wipe of BUILD_WORK_DIR. + +# Share deb package cache across variants. +# Restore: populate work dir cache from shared cache before build. +# Persist: sync back after build (done after lb build below). LB_PKG_CACHE="${CACHE_ROOT}/lb-packages" mkdir -p "${LB_PKG_CACHE}" if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then @@ -221,6 +238,7 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot" rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/" fi + rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/" rm -f \ "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \ @@ -231,6 +249,12 @@ rm -f \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf" +# Remove NVIDIA-specific overlay files for AMD variant +if [ "$BEE_GPU_VENDOR" = "amd" ]; then + rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load" + rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service" +fi + # --- inject authorized_keys for SSH access --- AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh" @@ -268,8 +292,11 @@ fi mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" -cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" -chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" + +if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then + cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" + chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" +fi # --- inject smoketest into overlay so it runs directly on the live CD --- cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" @@ -286,107 +313,143 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do fi done -# --- build NVIDIA kernel modules --- -echo "" -echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ===" -sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" +# --- NVIDIA kernel modules and userspace libs --- +if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then + echo "" + echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ===" + sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" -KVER="${DEBIAN_KERNEL_ABI}-amd64" -NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}" + KVER="${DEBIAN_KERNEL_ABI}-amd64" + NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}" -# Inject .ko files into overlay at /usr/local/lib/nvidia/ -OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia" -OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia" -mkdir -p "${OVERLAY_KMOD_DIR}" -cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/" + # Inject .ko files into overlay at /usr/local/lib/nvidia/ + OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia" + mkdir -p "${OVERLAY_KMOD_DIR}" + cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/" -# Inject nvidia-smi and libnvidia-ml -mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib" -cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/" -chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi" -cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true -chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true -cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true + # Inject nvidia-smi and libnvidia-ml + mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib" + cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/" + chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi" + cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true + chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true + cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true -# Inject GSP firmware into /lib/firmware/nvidia// -if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then - mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}" - cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" - echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ===" + # Inject GSP firmware into /lib/firmware/nvidia// + if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then + mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}" + cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" + echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ===" + fi + + # --- build / download NCCL --- + echo "" + echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ===" + sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}" + + NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" + + # Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs + cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" + echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ===" + + # Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path + cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" + echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ===" + + # --- build nccl-tests --- + echo "" + echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ===" + sh "${BUILDER_DIR}/build-nccl-tests.sh" \ + "${NCCL_TESTS_VERSION}" \ + "${NCCL_VERSION}" \ + "${NCCL_CUDA_VERSION}" \ + "${DIST_DIR}" \ + "${NVCC_VERSION}" \ + "${DEBIAN_VERSION}" + + NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}" + cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf" + chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf" + echo "=== all_reduce_perf injected ===" fi -# --- build / download NCCL --- -echo "" -echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ===" -sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}" - -NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" - -# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs -cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" -echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ===" - -# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path -cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" -echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ===" - -# --- build nccl-tests --- -echo "" -echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ===" -sh "${BUILDER_DIR}/build-nccl-tests.sh" \ - "${NCCL_TESTS_VERSION}" \ - "${NCCL_VERSION}" \ - "${NCCL_CUDA_VERSION}" \ - "${DIST_DIR}" \ - "${NVCC_VERSION}" \ - "${DEBIAN_VERSION}" - -NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}" -cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf" -chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf" -echo "=== all_reduce_perf injected ===" - # --- embed build metadata --- mkdir -p "${OVERLAY_STAGE_DIR}/etc" BUILD_DATE="$(date +%Y-%m-%d)" GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)" -cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" < "${OVERLAY_STAGE_DIR}/etc/bee-release" < "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor" + # Patch motd with build info -BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}" +BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}" if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \ > "${OVERLAY_STAGE_DIR}/etc/motd.patched" mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd" fi -# --- substitute version placeholders in package list --- -sed -i \ - -e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \ - -e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \ - -e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \ - -e "s/%%ROCM_BANDWIDTH_TEST_VERSION%%/${ROCM_BANDWIDTH_TEST_VERSION}/g" \ - -e "s/%%ROCM_VALIDATION_SUITE_VERSION%%/${ROCM_VALIDATION_SUITE_VERSION}/g" \ - -e "s/%%ROCBLAS_VERSION%%/${ROCBLAS_VERSION}/g" \ - -e "s/%%ROCRAND_VERSION%%/${ROCRAND_VERSION}/g" \ - -e "s/%%HIP_RUNTIME_AMD_VERSION%%/${HIP_RUNTIME_AMD_VERSION}/g" \ - -e "s/%%HIPBLASLT_VERSION%%/${HIPBLASLT_VERSION}/g" \ - -e "s/%%COMGR_VERSION%%/${COMGR_VERSION}/g" \ - "${BUILD_WORK_DIR}/config/package-lists/bee.list.chroot" \ - "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" +# --- copy variant-specific package list into work dir --- +cp "${BUILD_WORK_DIR}/config/package-lists/bee-${BEE_GPU_VENDOR}.list.chroot" \ + "${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot" + +# --- remove archives for the other vendor --- +if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then + rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \ + "${BUILD_WORK_DIR}/config/archives/rocm.key.chroot" +else + rm -f "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \ + "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot" +fi + +# --- substitute version placeholders in package list and archive --- +if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then + sed -i \ + -e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \ + "${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot" +else + sed -i \ + -e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \ + -e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \ + -e "s/%%ROCM_BANDWIDTH_TEST_VERSION%%/${ROCM_BANDWIDTH_TEST_VERSION}/g" \ + -e "s/%%ROCM_VALIDATION_SUITE_VERSION%%/${ROCM_VALIDATION_SUITE_VERSION}/g" \ + -e "s/%%ROCBLAS_VERSION%%/${ROCBLAS_VERSION}/g" \ + -e "s/%%ROCRAND_VERSION%%/${ROCRAND_VERSION}/g" \ + -e "s/%%HIP_RUNTIME_AMD_VERSION%%/${HIP_RUNTIME_AMD_VERSION}/g" \ + -e "s/%%HIPBLASLT_VERSION%%/${HIPBLASLT_VERSION}/g" \ + -e "s/%%COMGR_VERSION%%/${COMGR_VERSION}/g" \ + "${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot" + if [ -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" ]; then + sed -i \ + -e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \ + "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" + fi +fi # --- sync overlay into live-build includes.chroot --- LB_DIR="${BUILD_WORK_DIR}" @@ -402,20 +465,31 @@ fi # --- build ISO using live-build --- echo "" -echo "=== building ISO (live-build) ===" +echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ===" + +# Export for auto/config +BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')" +export BEE_GPU_VENDOR_UPPER cd "${LB_DIR}" lb clean 2>&1 | tail -3 lb config 2>&1 | tail -5 lb build 2>&1 +# --- persist deb package cache back to shared location --- +# This allows the second variant to reuse all downloaded packages. +if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then + rsync -a "${BUILD_WORK_DIR}/cache/packages.chroot/" "${LB_PKG_CACHE}/" + echo "=== package cache synced to ${LB_PKG_CACHE} ===" +fi + # live-build outputs live-image-amd64.hybrid.iso in LB_DIR ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso" -ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso" +ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso" if [ -f "$ISO_RAW" ]; then cp "$ISO_RAW" "$ISO_OUT" echo "" - echo "=== done ===" + echo "=== done (${BEE_GPU_VENDOR}) ===" echo "ISO: $ISO_OUT" if command -v stat >/dev/null 2>&1; then ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")" diff --git a/iso/builder/config/bootloaders/grub-pc/grub.cfg b/iso/builder/config/bootloaders/grub-pc/grub.cfg index 97823e1..3d33e47 100644 --- a/iso/builder/config/bootloaders/grub-pc/grub.cfg +++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg @@ -10,12 +10,12 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝ echo "" menuentry "EASY-BEE" { - linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup + linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup initrd @INITRD_LIVE@ } menuentry "EASY-BEE (load to RAM)" { - linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup + linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup initrd @INITRD_LIVE@ } diff --git a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot index 942d4bd..a1db962 100755 --- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot +++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot @@ -5,6 +5,9 @@ set -e echo "=== bee chroot setup ===" +GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia) +echo "=== GPU vendor: ${GPU_VENDOR} ===" + ensure_bee_console_user() { if id bee >/dev/null 2>&1; then usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true @@ -21,10 +24,8 @@ ensure_bee_console_user() { ensure_bee_console_user -# Enable bee services -systemctl enable nvidia-dcgm.service 2>/dev/null || true +# Enable common bee services systemctl enable bee-network.service -systemctl enable bee-nvidia.service systemctl enable bee-preflight.service systemctl enable bee-audit.service systemctl enable bee-web.service @@ -36,25 +37,33 @@ systemctl enable serial-getty@ttyS0.service 2>/dev/null || true systemctl enable serial-getty@ttyS1.service 2>/dev/null || true systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true +# Enable GPU-vendor specific services +if [ "$GPU_VENDOR" = "nvidia" ]; then + systemctl enable nvidia-dcgm.service 2>/dev/null || true + systemctl enable bee-nvidia.service +elif [ "$GPU_VENDOR" = "amd" ]; then + # ROCm symlinks (packages install to /opt/rocm-*/bin/) + for tool in rocm-smi rocm-bandwidth-test rvs; do + if [ ! -e /usr/local/bin/${tool} ]; then + bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)" + [ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool} + fi + done +fi + # Ensure scripts are executable chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true -chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true chmod +x /usr/local/bin/bee 2>/dev/null || true chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true +if [ "$GPU_VENDOR" = "nvidia" ]; then + chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true +fi # Reload udev rules udevadm control --reload-rules 2>/dev/null || true -# rocm symlinks (packages install to /opt/rocm-*/bin/) -for tool in rocm-smi rocm-bandwidth-test rvs; do - if [ ! -e /usr/local/bin/${tool} ]; then - bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)" - [ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool} - fi -done - # Create export directory mkdir -p /appdata/bee/export @@ -62,4 +71,4 @@ if [ -f /etc/sudoers.d/bee ]; then chmod 0440 /etc/sudoers.d/bee fi -echo "=== bee chroot setup complete ===" +echo "=== bee chroot setup complete (${GPU_VENDOR}) ===" diff --git a/iso/builder/config/hooks/normal/9100-memtest.hook.binary b/iso/builder/config/hooks/normal/9100-memtest.hook.binary index 4805d56..2811659 100755 --- a/iso/builder/config/hooks/normal/9100-memtest.hook.binary +++ b/iso/builder/config/hooks/normal/9100-memtest.hook.binary @@ -4,6 +4,9 @@ # not inside the squashfs). set -e +echo "memtest: scanning chroot/boot/ for memtest files:" +ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files found in chroot/boot/" + for f in memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi; do src="chroot/boot/${f}" if [ -f "${src}" ]; then diff --git a/iso/builder/config/package-lists/bee-amd.list.chroot b/iso/builder/config/package-lists/bee-amd.list.chroot new file mode 100644 index 0000000..f7cefca --- /dev/null +++ b/iso/builder/config/package-lists/bee-amd.list.chroot @@ -0,0 +1,9 @@ +# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST) +rocm-smi-lib=%%ROCM_SMI_VERSION%% +rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%% +rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%% +rocblas=%%ROCBLAS_VERSION%% +rocrand=%%ROCRAND_VERSION%% +hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%% +hipblaslt=%%HIPBLASLT_VERSION%% +comgr=%%COMGR_VERSION%% diff --git a/iso/builder/config/package-lists/bee-nvidia.list.chroot b/iso/builder/config/package-lists/bee-nvidia.list.chroot new file mode 100644 index 0000000..df28777 --- /dev/null +++ b/iso/builder/config/package-lists/bee-nvidia.list.chroot @@ -0,0 +1,2 @@ +# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing +datacenter-gpu-manager=1:%%DCGM_VERSION%% diff --git a/iso/builder/config/package-lists/bee.list.chroot b/iso/builder/config/package-lists/bee.list.chroot index a4304db..b9d7816 100644 --- a/iso/builder/config/package-lists/bee.list.chroot +++ b/iso/builder/config/package-lists/bee.list.chroot @@ -72,18 +72,5 @@ firmware-bnx2x firmware-cavium firmware-qlogic -# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing -datacenter-gpu-manager=1:%%DCGM_VERSION%% - -# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST) -rocm-smi-lib=%%ROCM_SMI_VERSION%% -rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%% -rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%% -rocblas=%%ROCBLAS_VERSION%% -rocrand=%%ROCRAND_VERSION%% -hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%% -hipblaslt=%%HIPBLASLT_VERSION%% -comgr=%%COMGR_VERSION%% - # glibc compat helpers (for any external binaries that need it) libc6 diff --git a/iso/builder/smoketest.sh b/iso/builder/smoketest.sh index 7a88bf1..9f0d78c 100644 --- a/iso/builder/smoketest.sh +++ b/iso/builder/smoketest.sh @@ -39,7 +39,7 @@ info "nvidia boot mode: ${NVIDIA_BOOT_MODE}" # --- PATH & binaries --- echo "-- PATH & binaries --" for tool in dmidecode smartctl nvme ipmitool lspci bee; do - if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then + if p=$(PATH="/usr/local/bin:/usr/sbin:/sbin:$PATH" command -v "$tool" 2>/dev/null); then ok "$tool found: $p" else fail "$tool: NOT FOUND" diff --git a/iso/overlay/etc/profile.d/bee.sh b/iso/overlay/etc/profile.d/bee.sh index e9519be..4aadf7d 100644 --- a/iso/overlay/etc/profile.d/bee.sh +++ b/iso/overlay/etc/profile.d/bee.sh @@ -1,4 +1,4 @@ -export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin" +export PATH="$PATH:/usr/local/bin:/usr/sbin:/sbin:/opt/rocm/bin:/opt/rocm/sbin" # Print web UI URLs on the local console at login. if [ -z "${SSH_CONNECTION:-}" ] \