Add NVIDIA stress loader selection and DCGM 4 support
This commit is contained in:
@@ -23,6 +23,16 @@ RUN apt-get update -qq && apt-get install -y \
|
||||
gcc \
|
||||
make \
|
||||
perl \
|
||||
pkg-config \
|
||||
yasm \
|
||||
libssl-dev \
|
||||
zlib1g-dev \
|
||||
libbz2-dev \
|
||||
libgmp-dev \
|
||||
libpcap-dev \
|
||||
libsqlite3-dev \
|
||||
libcurl4-openssl-dev \
|
||||
ocl-icd-opencl-dev \
|
||||
linux-headers-amd64 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
@@ -8,7 +8,8 @@ NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=3.3.9
|
||||
DCGM_VERSION=4.5.2-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
||||
|
||||
@@ -29,6 +29,7 @@ typedef void *CUfunction;
|
||||
typedef void *CUstream;
|
||||
|
||||
#define CU_SUCCESS 0
|
||||
#define CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT 16
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||
|
||||
@@ -97,6 +98,9 @@ typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
|
||||
CUstream,
|
||||
void **,
|
||||
void **);
|
||||
typedef CUresult (*cuMemGetInfo_fn)(size_t *, size_t *);
|
||||
typedef CUresult (*cuStreamCreate_fn)(CUstream *, unsigned int);
|
||||
typedef CUresult (*cuStreamDestroy_fn)(CUstream);
|
||||
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
||||
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
||||
|
||||
@@ -118,6 +122,9 @@ struct cuda_api {
|
||||
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
||||
cuModuleGetFunction_fn cuModuleGetFunction;
|
||||
cuLaunchKernel_fn cuLaunchKernel;
|
||||
cuMemGetInfo_fn cuMemGetInfo;
|
||||
cuStreamCreate_fn cuStreamCreate;
|
||||
cuStreamDestroy_fn cuStreamDestroy;
|
||||
cuGetErrorName_fn cuGetErrorName;
|
||||
cuGetErrorString_fn cuGetErrorString;
|
||||
};
|
||||
@@ -128,9 +135,10 @@ struct stress_report {
|
||||
int cc_major;
|
||||
int cc_minor;
|
||||
int buffer_mb;
|
||||
int stream_count;
|
||||
unsigned long iterations;
|
||||
uint64_t checksum;
|
||||
char details[1024];
|
||||
char details[16384];
|
||||
};
|
||||
|
||||
static int load_symbol(void *lib, const char *name, void **out) {
|
||||
@@ -144,7 +152,7 @@ static int load_cuda(struct cuda_api *api) {
|
||||
if (!api->lib) {
|
||||
return 0;
|
||||
}
|
||||
return
|
||||
if (!(
|
||||
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
||||
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
||||
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
||||
@@ -160,7 +168,17 @@ static int load_cuda(struct cuda_api *api) {
|
||||
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
||||
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
||||
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
||||
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
|
||||
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel))) {
|
||||
dlclose(api->lib);
|
||||
memset(api, 0, sizeof(*api));
|
||||
return 0;
|
||||
}
|
||||
load_symbol(api->lib, "cuMemGetInfo_v2", (void **)&api->cuMemGetInfo);
|
||||
load_symbol(api->lib, "cuStreamCreate", (void **)&api->cuStreamCreate);
|
||||
if (!load_symbol(api->lib, "cuStreamDestroy_v2", (void **)&api->cuStreamDestroy)) {
|
||||
load_symbol(api->lib, "cuStreamDestroy", (void **)&api->cuStreamDestroy);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
|
||||
@@ -220,6 +238,39 @@ static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *maj
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Read the multiprocessor count attribute of `dev` through
 * cuDeviceGetAttribute.
 *
 * Returns 1 and stores the value in *count when check_rc accepts the driver
 * result; returns 0 and leaves *count untouched otherwise.
 */
static int query_multiprocessor_count(struct cuda_api *api, CUdevice dev, int *count) {
    int sm_total = 0;
    CUresult rc = api->cuDeviceGetAttribute(&sm_total, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);

    if (check_rc(api, "cuDeviceGetAttribute(multiprocessors)", rc)) {
        /* Only publish the value once the driver call has been validated. */
        *count = sm_total;
        return 1;
    }
    return 0;
}
|
||||
|
||||
static size_t clamp_budget_to_free_memory(struct cuda_api *api, size_t requested_bytes) {
|
||||
size_t free_bytes = 0;
|
||||
size_t total_bytes = 0;
|
||||
size_t max_bytes = requested_bytes;
|
||||
|
||||
if (!api->cuMemGetInfo) {
|
||||
return requested_bytes;
|
||||
}
|
||||
if (api->cuMemGetInfo(&free_bytes, &total_bytes) != CU_SUCCESS || free_bytes == 0) {
|
||||
return requested_bytes;
|
||||
}
|
||||
|
||||
max_bytes = (free_bytes * 9u) / 10u;
|
||||
if (max_bytes < (size_t)4u * 1024u * 1024u) {
|
||||
max_bytes = (size_t)4u * 1024u * 1024u;
|
||||
}
|
||||
if (requested_bytes > max_bytes) {
|
||||
return max_bytes;
|
||||
}
|
||||
return requested_bytes;
|
||||
}
|
||||
|
||||
#if HAVE_CUBLASLT_HEADERS
|
||||
static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
|
||||
size_t len = strlen(buf);
|
||||
@@ -1095,13 +1146,16 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int main(int argc, char **argv) {
|
||||
int seconds = 5;
|
||||
int size_mb = 64;
|
||||
int device_index = 0;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||
seconds = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
||||
size_mb = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||
device_index = atoi(argv[++i]);
|
||||
} else {
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
@@ -1111,6 +1165,9 @@ int main(int argc, char **argv) {
|
||||
if (size_mb <= 0) {
|
||||
size_mb = 64;
|
||||
}
|
||||
if (device_index < 0) {
|
||||
device_index = 0;
|
||||
}
|
||||
|
||||
struct cuda_api cuda;
|
||||
if (!load_cuda(&cuda)) {
|
||||
@@ -1133,8 +1190,13 @@ int main(int argc, char **argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (device_index >= count) {
|
||||
fprintf(stderr, "device index %d out of range (found %d CUDA device(s))\n", device_index, count);
|
||||
return 1;
|
||||
}
|
||||
|
||||
CUdevice dev = 0;
|
||||
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) {
|
||||
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, device_index))) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1162,6 +1224,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
printf("device=%s\n", report.device);
|
||||
printf("device_index=%d\n", device_index);
|
||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
||||
printf("backend=%s\n", report.backend);
|
||||
printf("duration_s=%d\n", seconds);
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#!/bin/sh
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-burn worker.
|
||||
#
|
||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||
# - headers for compiling bee-gpu-stress against cuBLASLt
|
||||
# - headers for compiling bee-gpu-burn worker against cuBLASLt
|
||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||
|
||||
set -e
|
||||
|
||||
55
iso/builder/build-john.sh
Normal file
55
iso/builder/build-john.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/bin/sh
# build-john.sh — build John the Ripper jumbo with OpenCL support for the LiveCD.
#
# Downloads a pinned source snapshot from the official openwall/john repository,
# builds it inside the builder container, and caches the resulting run/ tree.
#
# Usage: build-john.sh <john-commit> <dist-dir>
#   <john-commit>  commit id of the openwall/john snapshot to fetch
#   <dist-dir>     directory that receives john-<john-commit>/run
#
# Environment:
#   BEE_CACHE_DIR  optional root of the download cache (default: <dist-dir>/cache)

set -e

JOHN_COMMIT="$1"
DIST_DIR="$2"

usage() {
    echo "usage: $0 <john-commit> <dist-dir>" >&2
    exit 1
}

[ -n "$JOHN_COMMIT" ] || usage
[ -n "$DIST_DIR" ] || usage

echo "=== John the Ripper jumbo ${JOHN_COMMIT} ==="

CACHE_DIR="${DIST_DIR}/john-${JOHN_COMMIT}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/john-downloads"
SRC_TAR="${DOWNLOAD_CACHE_DIR}/john-${JOHN_COMMIT}.tar.gz"
SRC_URL="https://github.com/openwall/john/archive/${JOHN_COMMIT}.tar.gz"

# A previous successful build is reused as-is.
if [ -x "${CACHE_DIR}/run/john" ] && [ -f "${CACHE_DIR}/run/john.conf" ]; then
    echo "=== john cached, skipping build ==="
    echo "run dir: ${CACHE_DIR}/run"
    exit 0
fi

mkdir -p "${DOWNLOAD_CACHE_DIR}"
# -s rather than -f: an empty file is never a usable snapshot.
if [ ! -s "${SRC_TAR}" ]; then
    echo "=== downloading john source snapshot ==="
    # Download under a temporary name and rename only on success, so a failed
    # or interrupted transfer cannot leave a truncated tarball that later runs
    # would treat as a valid cached snapshot.
    SRC_TMP="${SRC_TAR}.part"
    rm -f "${SRC_TMP}"
    if ! wget --show-progress -O "${SRC_TMP}" "${SRC_URL}"; then
        rm -f "${SRC_TMP}"
        echo "ERROR: failed to download ${SRC_URL}" >&2
        exit 1
    fi
    mv "${SRC_TMP}" "${SRC_TAR}"
fi

BUILD_TMP=$(mktemp -d)
# The EXIT trap does the cleanup; the signal traps only turn the signal into
# an exit so the script stops instead of carrying on without its build dir.
trap 'rm -rf "${BUILD_TMP}"' EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

cd "${BUILD_TMP}"
if ! tar xf "${SRC_TAR}"; then
    # Drop the unreadable tarball so the next run downloads a fresh copy.
    rm -f "${SRC_TAR}"
    echo "ERROR: cannot extract ${SRC_TAR}; removed it from the download cache" >&2
    exit 1
fi
SRC_DIR=$(find . -maxdepth 1 -type d -name 'john-*' | head -1)
[ -n "${SRC_DIR}" ] || { echo "ERROR: john source directory not found" >&2; exit 1; }

cd "${SRC_DIR}/src"
echo "=== configuring john ==="
./configure
echo "=== building john ==="
make clean >/dev/null 2>&1 || true
make -j"$(nproc)"

[ -f "../run/john" ] || { echo "ERROR: build did not produce run/john" >&2; exit 1; }

# Stage the run/ tree beside its final location and rename it into place.
# Clearing any stale copy first keeps cp from nesting the new tree as run/run
# when a partial run/ directory was left behind by an earlier attempt.
mkdir -p "${CACHE_DIR}"
rm -rf "${CACHE_DIR}/run" "${CACHE_DIR}/run.tmp"
cp -a "../run" "${CACHE_DIR}/run.tmp"
chmod +x "${CACHE_DIR}/run.tmp/john"
mv "${CACHE_DIR}/run.tmp" "${CACHE_DIR}/run"

echo "=== john build complete ==="
echo "run dir: ${CACHE_DIR}/run"
|
||||
@@ -10,7 +10,7 @@
|
||||
# Output layout:
|
||||
# $CACHE_DIR/modules/ — nvidia*.ko files
|
||||
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so*, OpenCL-related libs
|
||||
|
||||
set -e
|
||||
|
||||
@@ -133,7 +133,14 @@ fi
|
||||
# Copy ALL userspace library files.
|
||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
count=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||
@@ -150,7 +157,14 @@ ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
|
||||
@@ -183,7 +183,7 @@ else
|
||||
fi
|
||||
|
||||
# --- NVIDIA-only build steps ---
|
||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
@@ -196,20 +196,20 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
echo "=== building bee-gpu-burn worker ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
-o "$GPU_BURN_WORKER_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl -lm
|
||||
echo "binary: $GPU_STRESS_BIN"
|
||||
echo "binary: $GPU_BURN_WORKER_BIN"
|
||||
else
|
||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -246,6 +246,9 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
|
||||
@@ -293,9 +296,13 @@ mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
|
||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee" "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${GPU_BURN_WORKER_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
|
||||
ln -sfn bee-gpu-burn "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
fi
|
||||
|
||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||
@@ -334,6 +341,8 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors"
|
||||
printf 'libnvidia-opencl.so.1\n' > "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors/nvidia.icd"
|
||||
|
||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||
@@ -353,7 +362,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by the bee-gpu-burn worker tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
@@ -372,6 +381,16 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
|
||||
echo ""
|
||||
echo "=== building john jumbo ${JOHN_JUMBO_COMMIT} ==="
|
||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
||||
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
||||
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/john"
|
||||
echo "=== john injected ==="
|
||||
fi
|
||||
|
||||
# --- embed build metadata ---
|
||||
@@ -385,7 +404,8 @@ NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||
|
||||
@@ -60,6 +60,8 @@ chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-john-gpu-stress 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Reload udev rules
|
||||
|
||||
@@ -1,2 +1,8 @@
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
|
||||
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||
ocl-icd-libopencl1
|
||||
clinfo
|
||||
|
||||
Reference in New Issue
Block a user