iso: improve burn-in, export, and live boot
This commit is contained in:
@@ -4,5 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=1.0.0
|
||||
|
||||
@@ -32,6 +32,6 @@ lb config noauto \
|
||||
--memtest none \
|
||||
--iso-volume "EASY-BEE" \
|
||||
--iso-application "EASY-BEE" \
|
||||
--bootappend-live "boot=live components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live toram components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--apt-recommends false \
|
||||
"${@}"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
170
iso/builder/build-cublas.sh
Normal file
170
iso/builder/build-cublas.sh
Normal file
@@ -0,0 +1,170 @@
|
||||
#!/bin/sh
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
|
||||
#
|
||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||
# - headers for compiling bee-gpu-stress against cuBLASLt
|
||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||
|
||||
set -e

# Positional arguments (all four are required):
#   $1  cuBLAS package version
#   $2  cuda-cudart package version
#   $3  CUDA series (dots are turned into dashes for package names)
#   $4  dist directory that receives the extracted cache
CUBLAS_VERSION="$1"
CUDA_USERSPACE_VERSION="$2"
CUDA_SERIES="$3"
DIST_DIR="$4"

# Reject a missing or empty argument. The usage text is a diagnostic, so it
# is written to stderr.
for required_arg in "$CUBLAS_VERSION" "$CUDA_USERSPACE_VERSION" "$CUDA_SERIES" "$DIST_DIR"; do
    [ -n "$required_arg" ] || {
        echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>" >&2
        exit 1
    }
done

CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
# Extracted headers and libraries end up here.
CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
# Downloaded .deb files and Packages.gz are kept separately; the root can be
# overridden with BEE_CACHE_DIR.
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"

echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="

# cache_has_lib <glob>: succeeds when ${CACHE_DIR}/lib holds at least one
# entry whose name matches the glob.
cache_has_lib() {
    [ -n "$(find "${CACHE_DIR}/lib" -name "$1" 2>/dev/null | head -n 1)" ]
}

# Reuse the cache only when it is complete: both headers and each of the
# three runtime libraries must be present. Accepting any single library
# would let a partially populated cache skip the download.
if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
    && cache_has_lib 'libcublas.so*' \
    && cache_has_lib 'libcublasLt.so*' \
    && cache_has_lib 'libcudart.so*'; then
    echo "=== cuBLAS cached, skipping download ==="
    echo "cache: $CACHE_DIR"
    exit 0
fi

mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"

echo "=== downloading Packages.gz ==="
# wget runs quietly, so report the failure explicitly instead of exiting
# without any message, and do not leave a truncated index behind.
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz" || {
    echo "ERROR: failed to download ${REPO_BASE}/Packages.gz" >&2
    rm -f "${PACKAGES_GZ}"
    exit 1
}
|
||||
|
||||
# lookup_pkg <package> <version>
#
# Scans the gzip-compressed package index at ${PACKAGES_GZ} for the stanza
# whose "Package:" and "Version:" fields equal the arguments exactly.
# Prints one line "<Filename> <SHA256>" for the first matching stanza and
# nothing when no stanza matches.
lookup_pkg() {
    pkg="$1"
    ver="$2"
    gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
        /^Package: /  { cur_pkg = $2 }
        /^Version: /  { cur_ver = $2 }
        /^Filename: / { cur_file = $2 }
        /^SHA256: /   { cur_sha = $2 }
        # A blank line ends the current stanza.
        /^$/ {
            if (cur_pkg == pkg && cur_ver == ver) {
                print cur_file " " cur_sha
                # "exit" still runs the END rule; remember that the match
                # was already reported so it is not printed a second time.
                reported = 1
                exit
            }
            cur_pkg = ""; cur_ver = ""; cur_file = ""; cur_sha = ""
        }
        # The final stanza may not be followed by a blank line.
        END {
            if (!reported && cur_pkg == pkg && cur_ver == ver) {
                print cur_file " " cur_sha
            }
        }'
}
|
||||
|
||||
# download_verified_pkg <package> <version>
#
# Resolves the package through lookup_pkg, makes sure a copy with the
# SHA256 listed in the package index exists in ${DOWNLOAD_CACHE_DIR}
# (re-using a cached file when its checksum matches, downloading from
# ${REPO_BASE} otherwise), and prints the path of the verified .deb.
#
# Output contract: stdout carries ONLY the path, because callers capture it
# with $(...). Every progress and error message goes to stderr.
# Exits with status 1 when metadata is missing, the download fails, or the
# checksum does not match.
download_verified_pkg() {
    pkg="$1"
    ver="$2"

    meta="$(lookup_pkg "$pkg" "$ver")"
    [ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}" >&2; exit 1; }

    # Only the first metadata line is used, so a repeated line cannot end up
    # inside the filename or the expected checksum.
    repo_file="$(printf '%s\n' "$meta" | awk 'NR == 1 { print $1 }')"
    repo_sha="$(printf '%s\n' "$meta" | awk 'NR == 1 { print $2 }')"
    [ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}" >&2; exit 1; }
    [ -n "$repo_sha" ] || { echo "ERROR: package sha missing for ${pkg}" >&2; exit 1; }

    deb_name="$(basename "$repo_file")"
    out="${DOWNLOAD_CACHE_DIR}/${deb_name}"

    # Re-use a previously downloaded file only if it still matches the index.
    if [ -f "$out" ]; then
        actual_sha="$(sha256sum "$out" | awk '{print $1}')"
        if [ "$actual_sha" = "$repo_sha" ]; then
            echo "=== using cached ${deb_name} ===" >&2
            printf '%s\n' "$out"
            return 0
        fi
        echo "=== removing stale ${deb_name} (sha256 mismatch) ===" >&2
        rm -f "$out"
    fi

    echo "=== downloading ${deb_name} ===" >&2
    # Handle the failure explicitly instead of relying on errexit, and do not
    # leave a partial file in the download cache.
    wget --show-progress -O "$out" "${REPO_BASE}/${deb_name}" || {
        echo "ERROR: download failed for ${deb_name}" >&2
        rm -f "$out"
        exit 1
    }

    actual_sha="$(sha256sum "$out" | awk '{print $1}')"
    if [ "$actual_sha" != "$repo_sha" ]; then
        echo "ERROR: sha256 mismatch for ${deb_name}" >&2
        echo "  expected: $repo_sha" >&2
        echo "  actual:   $actual_sha" >&2
        rm -f "$out"
        exit 1
    fi
    echo "sha256 OK: ${deb_name}" >&2
    printf '%s\n' "$out"
}
|
||||
|
||||
# extract_deb <deb-file> <destination-dir>
#
# Unpacks the .deb archive members into <destination-dir> with ar, then
# extracts the contained data.tar.* payload in the same directory.
# The work happens in a subshell so the caller's working directory is left
# untouched. Returns non-zero when any step fails or no data.tar.* member
# exists.
extract_deb() {
    deb="$1"
    dst="$2"

    # The extraction runs after "cd $dst", so a relative archive path has to
    # be anchored to the current directory first.
    case "$deb" in
        /*) ;;
        *) deb="$(pwd)/${deb}" ;;
    esac

    mkdir -p "$dst"
    (
        cd "$dst" || exit 1
        ar x "$deb" || exit 1

        # Pick the first data.tar.* member via globbing rather than parsing
        # ls output; an unmatched glob stays literal and fails the -f test.
        data_tar=""
        for candidate in data.tar.*; do
            [ -f "$candidate" ] || continue
            data_tar="$candidate"
            break
        done
        [ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb" >&2; exit 1; }

        tar xf "$data_tar"
    )
}
|
||||
|
||||
# copy_headers <extracted-package-root>
#
# Merges the usr/include tree of an extracted package into
# ${CACHE_DIR}/include. A package root without usr/include is skipped and
# counts as success.
copy_headers() {
    src_root="$1"
    [ -d "${src_root}/usr/include" ] || return 0
    cp -a "${src_root}/usr/include/." "${CACHE_DIR}/include/"
}
|
||||
|
||||
# copy_libs <extracted-package-root>
#
# Searches the extracted package tree for libcublas, libcublasLt and
# libcudart shared objects (regular files and symlinks alike) and copies
# each one into ${CACHE_DIR}/lib; cp -a keeps symlinks as symlinks.
copy_libs() {
    src_root="$1"
    find "$src_root" \
        \( -type f -o -type l \) \
        \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
        -exec cp -a {} "${CACHE_DIR}/lib/" \;
}
|
||||
|
||||
# make_links <library-base-name>
#
# Creates the conventional symlink chain inside ${CACHE_DIR}/lib:
#   <base>.so  ->  <base>.so.<major>  ->  <first versioned regular file>
# Does nothing when no versioned regular file for <base> exists. Link
# creation is best effort: failures of ln are ignored.
make_links() {
    base="$1"
    versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1)
    [ -n "$versioned" ] || return 0

    target=$(basename "$versioned")
    # Reduce "<base>.so.<major>.<minor>..." to "<base>.so.<major>".
    soname=$(printf '%s\n' "$target" | sed -E "s#^(${base}\.so\.[0-9]+).*#\\1#")

    # When the real file already carries the major-only name, linking it to
    # itself would replace the library with a dangling self-referencing
    # symlink, so that step is skipped.
    if [ "$soname" != "$target" ]; then
        ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
    fi
    ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true
}
|
||||
|
||||
# Scratch space for the unpacked .deb contents; removed on every exit path.
TMP_DIR=$(mktemp -d)
trap 'rm -rf "$TMP_DIR"' EXIT
# A trapped signal does not terminate the script by itself: after the trap
# action returns, execution would simply continue. Exit explicitly so the
# EXIT trap performs the cleanup and the build stops.
trap 'exit 130' INT
trap 'exit 143' TERM

# Fetch (or re-use) the four packages; each call yields the path of the
# checksum-verified .deb.
CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")

extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"

# Headers are taken from the -dev packages, shared objects from the runtime
# packages.
copy_headers "${TMP_DIR}/cublas-dev"
copy_headers "${TMP_DIR}/cudart-dev"
copy_libs "${TMP_DIR}/cublas-rt"
copy_libs "${TMP_DIR}/cudart-rt"

make_links "libcublas"
make_links "libcublasLt"
make_links "libcudart"

# Final verification: both headers and all three libraries must be present.
# Failures are diagnostics and therefore reported on stderr.
[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted" >&2; exit 1; }
[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted" >&2; exit 1; }
for required_lib in libcublasLt libcublas libcudart; do
    [ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${required_lib}.so*" | wc -l)" -gt 0 ] \
        || { echo "ERROR: ${required_lib} not extracted" >&2; exit 1; }
done

echo "=== cuBLAS extraction complete ==="
echo "cache:   $CACHE_DIR"
echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
echo "libs:    $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"
|
||||
@@ -7,6 +7,7 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||
BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
||||
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
AUTH_KEYS=""
|
||||
REBUILD_IMAGE=0
|
||||
@@ -40,6 +41,13 @@ if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PLATFORM_OS="${BUILDER_PLATFORM%/*}"
|
||||
PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
|
||||
if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
|
||||
echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
[ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
|
||||
AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
|
||||
@@ -56,17 +64,35 @@ mkdir -p \
|
||||
|
||||
IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
|
||||
|
||||
if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
# image_matches_platform
#
# Asks ${CONTAINER_TOOL} for the "<os>/<architecture>" of ${IMAGE_REF} and
# succeeds only when it equals ${BUILDER_PLATFORM}. A failing inspect call
# leaves the value empty, which counts as a mismatch. The inspected value is
# stored in the global actual_platform.
image_matches_platform() {
    actual_platform=$(
        "$CONTAINER_TOOL" image inspect \
            --format '{{.Os}}/{{.Architecture}}' \
            "${IMAGE_REF}" 2>/dev/null
    ) || true
    test "${BUILDER_PLATFORM}" = "$actual_platform"
}
|
||||
|
||||
NEED_BUILD_IMAGE=0
|
||||
if [ "$REBUILD_IMAGE" = "1" ]; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! image_matches_platform; then
|
||||
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
|
||||
echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
|
||||
NEED_BUILD_IMAGE=1
|
||||
fi
|
||||
|
||||
if [ "$NEED_BUILD_IMAGE" = "1" ]; then
|
||||
"$CONTAINER_TOOL" build \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
--build-arg GO_VERSION="${GO_VERSION}" \
|
||||
-t "${IMAGE_REF}" \
|
||||
"${BUILDER_DIR}"
|
||||
else
|
||||
echo "=== using existing builder image ${IMAGE_REF} ==="
|
||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||
fi
|
||||
|
||||
set -- \
|
||||
run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
@@ -80,6 +106,7 @@ set -- \
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
set -- run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
|
||||
@@ -159,6 +159,16 @@ else
|
||||
echo "=== bee binary up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
@@ -167,6 +177,7 @@ fi
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl
|
||||
@@ -283,6 +294,10 @@ NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- embed build metadata ---
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||
@@ -297,6 +312,8 @@ DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
EOF
|
||||
|
||||
# Patch motd with build info
|
||||
|
||||
@@ -20,6 +20,7 @@ openssh-server
|
||||
|
||||
# Filesystem support for USB export targets
|
||||
exfatprogs
|
||||
exfat-fuse
|
||||
ntfs-3g
|
||||
|
||||
# Utilities
|
||||
|
||||
Reference in New Issue
Block a user