Add NVIDIA stress loader selection and DCGM 4 support
This commit is contained in:
@@ -48,6 +48,7 @@ sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
||||
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
||||
- The NVIDIA variant installs DCGM 4 packages matched to the CUDA user-mode driver major version. For driver branch `590` / CUDA `13.x`, the package family is `datacenter-gpu-manager-4-cuda13` rather than legacy `datacenter-gpu-manager`.
|
||||
- Override the container platform only if you know why:
|
||||
|
||||
```sh
|
||||
|
||||
@@ -23,6 +23,16 @@ RUN apt-get update -qq && apt-get install -y \
|
||||
gcc \
|
||||
make \
|
||||
perl \
|
||||
pkg-config \
|
||||
yasm \
|
||||
libssl-dev \
|
||||
zlib1g-dev \
|
||||
libbz2-dev \
|
||||
libgmp-dev \
|
||||
libpcap-dev \
|
||||
libsqlite3-dev \
|
||||
libcurl4-openssl-dev \
|
||||
ocl-icd-opencl-dev \
|
||||
linux-headers-amd64 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
@@ -8,7 +8,8 @@ NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=3.3.9
|
||||
DCGM_VERSION=4.5.2-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
||||
|
||||
@@ -29,6 +29,7 @@ typedef void *CUfunction;
|
||||
typedef void *CUstream;
|
||||
|
||||
#define CU_SUCCESS 0
|
||||
#define CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT 16
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||
|
||||
@@ -97,6 +98,9 @@ typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
|
||||
CUstream,
|
||||
void **,
|
||||
void **);
|
||||
typedef CUresult (*cuMemGetInfo_fn)(size_t *, size_t *);
|
||||
typedef CUresult (*cuStreamCreate_fn)(CUstream *, unsigned int);
|
||||
typedef CUresult (*cuStreamDestroy_fn)(CUstream);
|
||||
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
||||
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
||||
|
||||
@@ -118,6 +122,9 @@ struct cuda_api {
|
||||
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
||||
cuModuleGetFunction_fn cuModuleGetFunction;
|
||||
cuLaunchKernel_fn cuLaunchKernel;
|
||||
cuMemGetInfo_fn cuMemGetInfo;
|
||||
cuStreamCreate_fn cuStreamCreate;
|
||||
cuStreamDestroy_fn cuStreamDestroy;
|
||||
cuGetErrorName_fn cuGetErrorName;
|
||||
cuGetErrorString_fn cuGetErrorString;
|
||||
};
|
||||
@@ -128,9 +135,10 @@ struct stress_report {
|
||||
int cc_major;
|
||||
int cc_minor;
|
||||
int buffer_mb;
|
||||
int stream_count;
|
||||
unsigned long iterations;
|
||||
uint64_t checksum;
|
||||
char details[1024];
|
||||
char details[16384];
|
||||
};
|
||||
|
||||
static int load_symbol(void *lib, const char *name, void **out) {
|
||||
@@ -144,7 +152,7 @@ static int load_cuda(struct cuda_api *api) {
|
||||
if (!api->lib) {
|
||||
return 0;
|
||||
}
|
||||
return
|
||||
if (!(
|
||||
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
||||
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
||||
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
||||
@@ -160,7 +168,17 @@ static int load_cuda(struct cuda_api *api) {
|
||||
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
||||
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
||||
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
||||
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
|
||||
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel))) {
|
||||
dlclose(api->lib);
|
||||
memset(api, 0, sizeof(*api));
|
||||
return 0;
|
||||
}
|
||||
load_symbol(api->lib, "cuMemGetInfo_v2", (void **)&api->cuMemGetInfo);
|
||||
load_symbol(api->lib, "cuStreamCreate", (void **)&api->cuStreamCreate);
|
||||
if (!load_symbol(api->lib, "cuStreamDestroy_v2", (void **)&api->cuStreamDestroy)) {
|
||||
load_symbol(api->lib, "cuStreamDestroy", (void **)&api->cuStreamDestroy);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
|
||||
@@ -220,6 +238,39 @@ static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *maj
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int query_multiprocessor_count(struct cuda_api *api, CUdevice dev, int *count) {
|
||||
int mp_count = 0;
|
||||
if (!check_rc(api,
|
||||
"cuDeviceGetAttribute(multiprocessors)",
|
||||
api->cuDeviceGetAttribute(&mp_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev))) {
|
||||
return 0;
|
||||
}
|
||||
*count = mp_count;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static size_t clamp_budget_to_free_memory(struct cuda_api *api, size_t requested_bytes) {
|
||||
size_t free_bytes = 0;
|
||||
size_t total_bytes = 0;
|
||||
size_t max_bytes = requested_bytes;
|
||||
|
||||
if (!api->cuMemGetInfo) {
|
||||
return requested_bytes;
|
||||
}
|
||||
if (api->cuMemGetInfo(&free_bytes, &total_bytes) != CU_SUCCESS || free_bytes == 0) {
|
||||
return requested_bytes;
|
||||
}
|
||||
|
||||
max_bytes = (free_bytes * 9u) / 10u;
|
||||
if (max_bytes < (size_t)4u * 1024u * 1024u) {
|
||||
max_bytes = (size_t)4u * 1024u * 1024u;
|
||||
}
|
||||
if (requested_bytes > max_bytes) {
|
||||
return max_bytes;
|
||||
}
|
||||
return requested_bytes;
|
||||
}
|
||||
|
||||
#if HAVE_CUBLASLT_HEADERS
|
||||
static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
|
||||
size_t len = strlen(buf);
|
||||
@@ -1095,13 +1146,16 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int main(int argc, char **argv) {
|
||||
int seconds = 5;
|
||||
int size_mb = 64;
|
||||
int device_index = 0;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||
seconds = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
||||
size_mb = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||
device_index = atoi(argv[++i]);
|
||||
} else {
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
@@ -1111,6 +1165,9 @@ int main(int argc, char **argv) {
|
||||
if (size_mb <= 0) {
|
||||
size_mb = 64;
|
||||
}
|
||||
if (device_index < 0) {
|
||||
device_index = 0;
|
||||
}
|
||||
|
||||
struct cuda_api cuda;
|
||||
if (!load_cuda(&cuda)) {
|
||||
@@ -1133,8 +1190,13 @@ int main(int argc, char **argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (device_index >= count) {
|
||||
fprintf(stderr, "device index %d out of range (found %d CUDA device(s))\n", device_index, count);
|
||||
return 1;
|
||||
}
|
||||
|
||||
CUdevice dev = 0;
|
||||
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) {
|
||||
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, device_index))) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1162,6 +1224,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
printf("device=%s\n", report.device);
|
||||
printf("device_index=%d\n", device_index);
|
||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
||||
printf("backend=%s\n", report.backend);
|
||||
printf("duration_s=%d\n", seconds);
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#!/bin/sh
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-burn worker.
|
||||
#
|
||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||
# - headers for compiling bee-gpu-stress against cuBLASLt
|
||||
# - headers for compiling bee-gpu-burn worker against cuBLASLt
|
||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||
|
||||
set -e
|
||||
|
||||
55
iso/builder/build-john.sh
Normal file
55
iso/builder/build-john.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/bin/sh
|
||||
# build-john.sh — build John the Ripper jumbo with OpenCL support for the LiveCD.
|
||||
#
|
||||
# Downloads a pinned source snapshot from the official openwall/john repository,
|
||||
# builds it inside the builder container, and caches the resulting run/ tree.
|
||||
|
||||
set -e
|
||||
|
||||
JOHN_COMMIT="$1"
|
||||
DIST_DIR="$2"
|
||||
|
||||
[ -n "$JOHN_COMMIT" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
|
||||
|
||||
echo "=== John the Ripper jumbo ${JOHN_COMMIT} ==="
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/john-${JOHN_COMMIT}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/john-downloads"
|
||||
SRC_TAR="${DOWNLOAD_CACHE_DIR}/john-${JOHN_COMMIT}.tar.gz"
|
||||
SRC_URL="https://github.com/openwall/john/archive/${JOHN_COMMIT}.tar.gz"
|
||||
|
||||
if [ -x "${CACHE_DIR}/run/john" ] && [ -f "${CACHE_DIR}/run/john.conf" ]; then
|
||||
echo "=== john cached, skipping build ==="
|
||||
echo "run dir: ${CACHE_DIR}/run"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "${DOWNLOAD_CACHE_DIR}"
|
||||
if [ ! -f "${SRC_TAR}" ]; then
|
||||
echo "=== downloading john source snapshot ==="
|
||||
wget --show-progress -O "${SRC_TAR}" "${SRC_URL}"
|
||||
fi
|
||||
|
||||
BUILD_TMP=$(mktemp -d)
|
||||
trap 'rm -rf "${BUILD_TMP}"' EXIT INT TERM
|
||||
|
||||
cd "${BUILD_TMP}"
|
||||
tar xf "${SRC_TAR}"
|
||||
SRC_DIR=$(find . -maxdepth 1 -type d -name 'john-*' | head -1)
|
||||
[ -n "${SRC_DIR}" ] || { echo "ERROR: john source directory not found"; exit 1; }
|
||||
|
||||
cd "${SRC_DIR}/src"
|
||||
echo "=== configuring john ==="
|
||||
./configure
|
||||
echo "=== building john ==="
|
||||
make clean >/dev/null 2>&1 || true
|
||||
make -j"$(nproc)"
|
||||
|
||||
mkdir -p "${CACHE_DIR}"
|
||||
cp -a "../run" "${CACHE_DIR}/run"
|
||||
chmod +x "${CACHE_DIR}/run/john"
|
||||
|
||||
echo "=== john build complete ==="
|
||||
echo "run dir: ${CACHE_DIR}/run"
|
||||
@@ -10,7 +10,7 @@
|
||||
# Output layout:
|
||||
# $CACHE_DIR/modules/ — nvidia*.ko files
|
||||
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so*, OpenCL-related libs
|
||||
|
||||
set -e
|
||||
|
||||
@@ -133,7 +133,14 @@ fi
|
||||
# Copy ALL userspace library files.
|
||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
count=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||
@@ -150,7 +157,14 @@ ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
|
||||
@@ -183,7 +183,7 @@ else
|
||||
fi
|
||||
|
||||
# --- NVIDIA-only build steps ---
|
||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
@@ -196,20 +196,20 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
echo "=== building bee-gpu-burn worker ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
-o "$GPU_BURN_WORKER_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl -lm
|
||||
echo "binary: $GPU_STRESS_BIN"
|
||||
echo "binary: $GPU_BURN_WORKER_BIN"
|
||||
else
|
||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -246,6 +246,9 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
|
||||
@@ -293,9 +296,13 @@ mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
|
||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee" "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${GPU_BURN_WORKER_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
|
||||
ln -sfn bee-gpu-burn "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
fi
|
||||
|
||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||
@@ -334,6 +341,8 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors"
|
||||
printf 'libnvidia-opencl.so.1\n' > "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors/nvidia.icd"
|
||||
|
||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||
@@ -353,7 +362,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by the bee-gpu-burn worker tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
@@ -372,6 +381,16 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
|
||||
echo ""
|
||||
echo "=== building john jumbo ${JOHN_JUMBO_COMMIT} ==="
|
||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
||||
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
||||
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/john"
|
||||
echo "=== john injected ==="
|
||||
fi
|
||||
|
||||
# --- embed build metadata ---
|
||||
@@ -385,7 +404,8 @@ NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||
|
||||
@@ -60,6 +60,8 @@ chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-john-gpu-stress 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Reload udev rules
|
||||
|
||||
@@ -1,2 +1,8 @@
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
|
||||
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||
ocl-icd-libopencl1
|
||||
clinfo
|
||||
|
||||
93
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file
93
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file
@@ -0,0 +1,93 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
SECONDS=5
|
||||
SIZE_MB=64
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
normalize_list() {
|
||||
echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
|
||||
}
|
||||
|
||||
contains_csv() {
|
||||
needle="$1"
|
||||
haystack="${2:-}"
|
||||
echo ",${haystack}," | grep -q ",${needle},"
|
||||
}
|
||||
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
*) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
[ -x "${WORKER}" ] || { echo "bee-gpu-burn worker not found: ${WORKER}" >&2; exit 1; }
|
||||
|
||||
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
|
||||
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
|
||||
|
||||
DEVICES=$(normalize_list "${DEVICES}")
|
||||
EXCLUDE=$(normalize_list "${EXCLUDE}")
|
||||
SELECTED="${DEVICES}"
|
||||
if [ -z "${SELECTED}" ]; then
|
||||
SELECTED="${ALL_DEVICES}"
|
||||
fi
|
||||
|
||||
FINAL=""
|
||||
for id in $(echo "${SELECTED}" | tr ',' ' '); do
|
||||
[ -n "${id}" ] || continue
|
||||
if contains_csv "${id}" "${EXCLUDE}"; then
|
||||
continue
|
||||
fi
|
||||
if [ -z "${FINAL}" ]; then
|
||||
FINAL="${id}"
|
||||
else
|
||||
FINAL="${FINAL},${id}"
|
||||
fi
|
||||
done
|
||||
|
||||
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||
|
||||
echo "loader=bee-gpu-burn"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||
|
||||
WORKERS=""
|
||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
log="${TMP_DIR}/gpu-${id}.log"
|
||||
echo "starting gpu ${id}"
|
||||
"${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${SIZE_MB}" >"${log}" 2>&1 &
|
||||
pid=$!
|
||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||
done
|
||||
|
||||
status=0
|
||||
for spec in ${WORKERS}; do
|
||||
pid=${spec%%:*}
|
||||
rest=${spec#*:}
|
||||
id=${rest%%:*}
|
||||
log=${rest#*:}
|
||||
if wait "${pid}"; then
|
||||
echo "gpu ${id} finished: OK"
|
||||
else
|
||||
rc=$?
|
||||
echo "gpu ${id} finished: FAILED rc=${rc}"
|
||||
status=1
|
||||
fi
|
||||
sed "s/^/[gpu ${id}] /" "${log}" || true
|
||||
done
|
||||
|
||||
exit "${status}"
|
||||
100
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file
100
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
SECONDS=300
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
FORMAT=""
|
||||
JOHN_DIR="/usr/local/lib/bee/john/run"
|
||||
JOHN_BIN="${JOHN_DIR}/john"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
normalize_list() {
|
||||
echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
|
||||
}
|
||||
|
||||
contains_csv() {
|
||||
needle="$1"
|
||||
haystack="${2:-}"
|
||||
echo ",${haystack}," | grep -q ",${needle},"
|
||||
}
|
||||
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||
*) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
[ -x "${JOHN_BIN}" ] || { echo "john binary not found: ${JOHN_BIN}" >&2; exit 1; }
|
||||
|
||||
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
|
||||
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
|
||||
|
||||
DEVICES=$(normalize_list "${DEVICES}")
|
||||
EXCLUDE=$(normalize_list "${EXCLUDE}")
|
||||
SELECTED="${DEVICES}"
|
||||
if [ -z "${SELECTED}" ]; then
|
||||
SELECTED="${ALL_DEVICES}"
|
||||
fi
|
||||
|
||||
FINAL=""
|
||||
for id in $(echo "${SELECTED}" | tr ',' ' '); do
|
||||
[ -n "${id}" ] || continue
|
||||
if contains_csv "${id}" "${EXCLUDE}"; then
|
||||
continue
|
||||
fi
|
||||
if [ -z "${FINAL}" ]; then
|
||||
FINAL="${id}"
|
||||
else
|
||||
FINAL="${FINAL},${id}"
|
||||
fi
|
||||
done
|
||||
|
||||
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||
|
||||
JOHN_DEVICES=""
|
||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
opencl_id=$((id + 1))
|
||||
if [ -z "${JOHN_DEVICES}" ]; then
|
||||
JOHN_DEVICES="${opencl_id}"
|
||||
else
|
||||
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
||||
fi
|
||||
done
|
||||
|
||||
echo "loader=john"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "john_devices=${JOHN_DEVICES}"
|
||||
|
||||
cd "${JOHN_DIR}"
|
||||
|
||||
choose_format() {
|
||||
if [ -n "${FORMAT}" ]; then
|
||||
echo "${FORMAT}"
|
||||
return 0
|
||||
fi
|
||||
for candidate in sha512crypt-opencl pbkdf2-hmac-sha512-opencl 7z-opencl sha256crypt-opencl md5crypt-opencl; do
|
||||
if ./john --test=1 --format="${candidate}" --devices="${JOHN_DEVICES}" >/dev/null 2>&1; then
|
||||
echo "${candidate}"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
CHOSEN_FORMAT=$(choose_format) || {
|
||||
echo "no suitable john OpenCL format found" >&2
|
||||
./john --list=opencl-devices >&2 || true
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo "format=${CHOSEN_FORMAT}"
|
||||
exec ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${JOHN_DEVICES}"
|
||||
@@ -114,4 +114,19 @@ fi
|
||||
ldconfig 2>/dev/null || true
|
||||
log "ldconfig refreshed"
|
||||
|
||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
|
||||
# "group is empty" even when GPUs and modules are present.
|
||||
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
|
||||
if command -v nv-hostengine >/dev/null 2>&1; then
|
||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
log "nv-hostengine already running — skipping"
|
||||
else
|
||||
nv-hostengine
|
||||
log "nv-hostengine started"
|
||||
fi
|
||||
else
|
||||
log "WARN: nv-hostengine not found — dcgmi diagnostics will not work"
|
||||
fi
|
||||
|
||||
log "done"
|
||||
|
||||
Reference in New Issue
Block a user