Add staged NVIDIA burn ramp-up mode

This commit is contained in:
Mikhail Chusavitin
2026-04-09 15:21:14 +03:00
parent a78fdadd88
commit 9481ca2805
11 changed files with 226 additions and 47 deletions
+14 -3
View File
@@ -2,13 +2,14 @@
# Strict mode: abort on unhandled command failure and on use of unset variables.
set -e -u

# Defaults for the command-line options; the argument parser overrides them.
# NOTE(review): if this script is ever executed by bash, ksh or zsh, SECONDS is
# a shell-maintained timer that keeps counting up after assignment, so later
# reads would no longer equal the requested duration — confirm the interpreter
# or consider renaming the variable.
SECONDS="5"            # --seconds / -t
STAGGER_SECONDS="0"    # --stagger-seconds (0 means no stagger sleep between launches)
SIZE_MB="0"            # --size-mb / -m
DEVICES=               # --devices, comma-separated list
EXCLUDE=               # --exclude, comma-separated list

# Path of the per-GPU worker binary launched by this script.
WORKER=/usr/local/lib/bee/bee-gpu-burn-worker
# usage: print the command-line synopsis to stderr and terminate the script.
# Arguments: none.
# Outputs:   exactly one synopsis line on stderr, listing every supported option.
# Exits:     status 2 (does not return to the caller).
usage() {
    # Only the current synopsis is printed; the older line without
    # --stagger-seconds was stale and produced a second, misleading usage line.
    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}
@@ -25,6 +26,7 @@ contains_csv() {
while [ "$#" -gt 0 ]; do
case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
@@ -61,14 +63,18 @@ done
# Report the run configuration as key=value lines on stdout.
# FINAL (comma-separated GPU ids) is computed outside this section.
echo "loader=bee-gpu-burn"
echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"
# Exported so every worker inherits it. Per NVIDIA's documentation PCI_BUS_ID
# makes CUDA enumerate devices in PCI bus order — presumably so the ids used
# here match what nvidia-smi reports; confirm.
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
# Scratch directory that holds the per-GPU worker log files.
TMP_DIR=$(mktemp -d)
# Remove the scratch directory when the script exits or is signalled.
# NOTE(review): the INT/TERM action only deletes the directory; it neither
# exits nor stops workers already started — confirm that is intended.
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
# Number of non-empty comma-separated entries in FINAL (awk 'NF' drops blank
# lines; the trailing tr strips the padding some wc implementations emit).
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
# 1-based position of the GPU being launched; incremented inside the loop.
gpu_pos=0
# Space-separated "pid:id:log" records, one appended per started worker.
WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
gpu_pos=$((gpu_pos + 1))
log="${TMP_DIR}/gpu-${id}.log"
gpu_size_mb="${SIZE_MB}"
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
gpu_size_mb=512
fi
fi
echo "starting gpu ${id} size=${gpu_size_mb}MB"
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
gpu_seconds=$(( SECONDS + extra_sec ))
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
CUDA_VISIBLE_DEVICES="${id}" \
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
pid=$!
WORKERS="${WORKERS} ${pid}:${id}:${log}"
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
sleep "${STAGGER_SECONDS}"
fi
done
status=0