94 lines
2.4 KiB
Bash
94 lines
2.4 KiB
Bash
#!/bin/sh
# Repeatedly runs all_reduce_perf on a set of NVIDIA GPUs until a wall-clock
# time budget is used up.

# Abort on the first failing command (-e) and on any use of an unset
# variable (-u).
set -e
set -u

# Default time budget in seconds; replaced by --seconds / -t.
# NOTE(review): bash, ksh and zsh treat SECONDS as a built-in elapsed-time
# counter, so under those shells its value keeps increasing after assignment.
SECONDS=300

# Comma-separated GPU indices to use (--devices); empty selects every GPU
# reported by nvidia-smi.
DEVICES=

# Comma-separated GPU indices to drop from the selection (--exclude).
EXCLUDE=

# Values handed to all_reduce_perf as -b, -e, -f and --iters respectively.
MIN_BYTES=512M
MAX_BYTES=4G
FACTOR=2
ITERS=20

# Location of the benchmark executable.
ALL_REDUCE_BIN=/usr/local/bin/all_reduce_perf
|
|
|
|
# usage
# Writes a one-line synopsis to stderr and terminates the script with
# exit status 2. Never returns.
usage() {
  printf 'usage: %s [--seconds N] [--devices 0,1] [--exclude 2,3]\n' "$0" >&2
  exit 2
}
|
|
|
|
# normalize_list [LIST]
# Canonicalises a comma-separated list and prints it on stdout:
#   - all whitespace inside entries is removed,
#   - empty entries are dropped,
#   - entries are sorted numerically and de-duplicated,
#   - the result is re-joined with commas.
# A missing or empty argument yields an empty result.
normalize_list() {
  echo "${1:-}" |
    tr ',' '\n' |
    sed -e 's/[[:space:]]//g' -e '/^$/d' |
    sort -n |
    uniq |
    paste -sd, -
}
|
|
|
|
# contains_csv NEEDLE [HAYSTACK]
# Tests whether NEEDLE is one of the entries of the comma-separated list
# HAYSTACK.
#   $1 - entry to look for
#   $2 - comma-separated list (optional; treated as empty when omitted)
# Returns 0 when NEEDLE equals a whole entry, 1 otherwise. Prints nothing.
#
# Both strings are wrapped in commas so that only complete entries match
# ("2" is not found in "12,20"). The comparison is a literal one: the quoted
# part of the case pattern is never interpreted as a glob or a regular
# expression, so entries such as "." or "1*" only match themselves.
contains_csv() {
  needle="$1"
  haystack="${2:-}"
  case ",${haystack}," in
    *",${needle},"*) return 0 ;;
    *) return 1 ;;
  esac
}
|
|
|
|
# ---------------------------------------------------------------------------
# Script body: parse options, choose the GPUs, then run all_reduce_perf in a
# loop until the time budget is spent.
# ---------------------------------------------------------------------------

# Copy the default budget out of SECONDS immediately and work with a plain
# variable from here on. bash, ksh and zsh treat SECONDS as an elapsed-time
# counter whose value keeps growing after assignment, which would silently
# stretch the budget by however long the GPU discovery below takes.
duration_sec="${SECONDS}"

# Every option takes exactly one value; anything else prints the usage text
# and exits (usage never returns).
while [ "$#" -gt 0 ]; do
  case "$1" in
    --seconds|-t) [ "$#" -ge 2 ] || usage; duration_sec="$2"; shift 2 ;;
    --devices)    [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
    --exclude)    [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
    *) usage ;;
  esac
done

# The budget is used in shell arithmetic below, so accept decimal digits only.
case "${duration_sec}" in
  ''|*[!0-9]*)
    echo "invalid --seconds value: ${duration_sec}" >&2
    usage
    ;;
esac

# Strip leading zeros; otherwise $(( )) would read e.g. "010" as octal and
# reject "08" outright.
while :; do
  case "${duration_sec}" in
    0?*) duration_sec="${duration_sec#0}" ;;
    *) break ;;
  esac
done

[ -x "${ALL_REDUCE_BIN}" ] || { echo "all_reduce_perf not found: ${ALL_REDUCE_BIN}" >&2; exit 1; }

# GPU indices known to the driver, as a comma-separated list. Errors from
# nvidia-smi are discarded on purpose: a missing or failing nvidia-smi leaves
# the list empty and is reported by the check that follows.
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

# Start from the requested devices, or from every GPU when none were
# requested.
DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
  SELECTED="${ALL_DEVICES}"
fi

# Drop excluded entries, building the final list and its length in one pass.
FINAL=""
GPU_COUNT=0
for id in $(echo "${SELECTED}" | tr ',' ' '); do
  [ -n "${id}" ] || continue
  if contains_csv "${id}" "${EXCLUDE}"; then
    continue
  fi
  if [ -z "${FINAL}" ]; then
    FINAL="${id}"
  else
    FINAL="${FINAL},${id}"
  fi
  GPU_COUNT=$((GPU_COUNT + 1))
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
[ "${GPU_COUNT}" -gt 0 ] || { echo "selected GPU count is zero" >&2; exit 1; }

# Run summary on stdout, one key=value pair per line.
echo "loader=nccl"
echo "selected_gpus=${FINAL}"
echo "gpu_count=${GPU_COUNT}"
echo "range=${MIN_BYTES}..${MAX_BYTES}"
echo "iters=${ITERS}"

# Have CUDA enumerate devices by PCI bus id.
# NOTE(review): presumably so that the indices in CUDA_VISIBLE_DEVICES line up
# with the ones nvidia-smi reported above - confirm.
export CUDA_DEVICE_ORDER="PCI_BUS_ID"

deadline=$(( $(date +%s) + duration_sec ))
round=0

# The deadline is checked only between rounds, so a round that is already
# running is allowed to finish. Because the script runs under "set -e", a
# failing benchmark run ends the script with that failure.
while :; do
  now=$(date +%s)
  if [ "${now}" -ge "${deadline}" ]; then
    break
  fi
  round=$((round + 1))
  remaining=$((deadline - now))
  echo "round=${round} remaining_sec=${remaining}"
  CUDA_VISIBLE_DEVICES="${FINAL}" \
    "${ALL_REDUCE_BIN}" \
    -b "${MIN_BYTES}" \
    -e "${MAX_BYTES}" \
    -f "${FACTOR}" \
    -g "${GPU_COUNT}" \
    --iters "${ITERS}"
done
|