#!/bin/sh
# Abort on the first unhandled command failure and treat expansion of an
# unset variable as an error.
set -e -u

# Option defaults; the command-line parser further down may overwrite each one.
#
# NOTE(review): bash and ksh treat SECONDS as a special variable that keeps
# counting up after it is assigned. That matters whenever /bin/sh is one of
# those shells -- confirm how this value is read later in the script.
DEVICES=''            # --devices: comma-separated GPU indices; empty = all detected GPUs
EXCLUDE=''            # --exclude: comma-separated GPU indices to leave out
SECONDS=300           # --seconds / -t: base load duration per GPU, in seconds
STAGGER_SECONDS=180   # --stagger-seconds: pause between consecutive worker launches

# usage
# Write the one-line command synopsis to stderr and terminate the script with
# exit status 2. Never returns to the caller.
usage() {
    printf '%s\n' "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}

# normalize_list [CSV]
# Print CSV as a canonical comma-separated list on stdout: all whitespace is
# removed from each item, empty items are dropped, and the remaining items are
# sorted numerically with adjacent duplicates collapsed. A missing or empty
# argument yields an empty list.
normalize_list() {
    echo "${1:-}" \
        | tr ',' '\n' \
        | sed -e 's/[[:space:]]//g' -e '/^$/d' \
        | sort -n \
        | uniq \
        | paste -sd, -
}

# contains_csv NEEDLE [HAYSTACK]
# Return 0 when NEEDLE is exactly one of the comma-separated items of
# HAYSTACK, and 1 otherwise. A missing HAYSTACK is treated as an empty list.
# Prints nothing.
contains_csv() {
    needle="$1"
    haystack="${2:-}"
    # Wrapping both sides in commas restricts the match to whole items
    # ("1" must not match inside "10"). The quoted part of a case pattern is
    # compared literally, so characters such as '.', '*' or '[' in NEEDLE
    # cannot act as wildcards the way they did when NEEDLE was used as a
    # grep regular expression.
    case ",${haystack}," in
        *",${needle},"*) return 0 ;;
        *) return 1 ;;
    esac
}

# resolve_dcgmproftester
# Print how the shell resolves the first available dcgmproftester command,
# trying the unversioned name first and then versions 13, 12 and 11.
# Returns 0 after printing a match, or 1 (printing nothing) when none of the
# names can be resolved.
resolve_dcgmproftester() {
    for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
        # On success `command -v` writes the resolved name itself, so a single
        # call both tests for the candidate and produces the output.
        command -v "${candidate}" 2>/dev/null && return 0
    done
    return 1
}

# Parse the command line. Every supported option takes exactly one value; an
# unknown option, a missing value, or a malformed number prints the synopsis
# via usage and exits with status 2.
while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t|--stagger-seconds)
            [ "$#" -ge 2 ] || usage
            # Both values are used in shell arithmetic and numeric tests later
            # on, so accept plain non-negative decimal integers only. Leading
            # zeros are rejected because $(( )) would read them as octal.
            case "$2" in
                ''|*[!0-9]*|0?*)
                    echo "$1 requires a non-negative integer without leading zeros, got '$2'" >&2
                    usage
                    ;;
            esac
            if [ "$1" = "--stagger-seconds" ]; then
                STAGGER_SECONDS="$2"
            else
                SECONDS="$2"
            fi
            ;;
        --devices)
            [ "$#" -ge 2 ] || usage
            DEVICES="$2"
            ;;
        --exclude)
            [ "$#" -ge 2 ] || usage
            EXCLUDE="$2"
            ;;
        *)
            usage
            ;;
    esac
    # Drop the option together with its value.
    shift 2
done

# ---------------------------------------------------------------------------
# Main: resolve the load generator, choose the GPUs, start one worker per GPU
# with a pause between launches, wait for all of them, and replay each
# worker's captured output prefixed with "[gpu <id>]".
# Exit status: 0 when every worker succeeded, 1 when any worker failed or a
# precondition was not met, 130/143 when interrupted by INT/TERM.
# ---------------------------------------------------------------------------

# Snapshot the requested base duration before anything slow happens. When
# /bin/sh is bash or ksh, SECONDS is a special variable that keeps counting up
# after it has been assigned, so reading it inside the launch loop (after the
# stagger sleeps) would silently lengthen every later worker's run.
BASE_SECONDS="${SECONDS}"

PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }

# nvidia-smi prints one GPU index per line; join them into a comma-separated
# list. Its stderr is discarded, so a missing or failing nvidia-smi shows up
# as an empty list and is reported by the check below.
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
# An empty --devices list means "every GPU that nvidia-smi reported".
SELECTED="${DEVICES:-${ALL_DEVICES}}"

# FINAL is SELECTED minus EXCLUDE with the order preserved; GPU_COUNT is the
# number of entries in FINAL.
FINAL=""
GPU_COUNT=0
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    FINAL="${FINAL:+${FINAL},}${id}"
    GPU_COUNT=$((GPU_COUNT + 1))
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

echo "loader=dcgmproftester-staggered"
echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"

TMP_DIR=$(mktemp -d)
# PIDs of workers that were started but not yet reaped with `wait`, each one
# followed by a single space.
PENDING_PIDS=""

# Stop any worker that is still outstanding and remove the log directory.
# Runs on every exit path through the EXIT trap.
cleanup() {
    for pending_pid in ${PENDING_PIDS}; do
        kill "${pending_pid}" 2>/dev/null || true
    done
    rm -rf -- "${TMP_DIR}"
}
trap cleanup EXIT
# A signal handler that merely returns would let the script carry on after
# the signal; exiting explicitly stops it and triggers the EXIT trap above.
trap 'exit 130' INT
trap 'exit 143' TERM

gpu_pos=0
WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    # Workers launched earlier get one extra stagger interval for every worker
    # that starts after them, so all of them end at about the same time.
    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( BASE_SECONDS + extra_sec ))
    echo "starting gpu ${id} seconds=${gpu_seconds}"
    # NOTE(review): per the DCGM documentation -t selects the profiling field
    # to exercise and -d the duration in seconds -- confirm for the installed
    # dcgmproftester version.
    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
    pid=$!
    PENDING_PIDS="${PENDING_PIDS}${pid} "
    WORKERS="${WORKERS} ${pid}:${id}"
    # No pause after the last launch.
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
        sleep "${STAGGER_SECONDS}"
    fi
done

status=0
for spec in ${WORKERS}; do
    pid=${spec%%:*}
    id=${spec#*:}
    # Rebuild the log path from the id instead of carrying it inside the
    # space-separated spec list, so an unusual TMPDIR cannot break the split.
    log="${TMP_DIR}/gpu-${id}.log"
    rc=0
    wait "${pid}" || rc=$?
    # Workers are reaped in launch order, so this PID heads the pending list;
    # removing it keeps cleanup from signalling a PID that is no longer ours.
    PENDING_PIDS=${PENDING_PIDS#"${pid} "}
    if [ "${rc}" -eq 0 ]; then
        echo "gpu ${id} finished: OK"
    else
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    # Replaying the log is best effort; an unreadable log must not change the
    # overall result.
    sed "s/^/[gpu ${id}] /" "${log}" || true
done

exit "${status}"
