Add staged NVIDIA burn ramp-up mode

This commit is contained in:
Mikhail Chusavitin
2026-04-09 15:21:14 +03:00
parent a78fdadd88
commit 9481ca2805
11 changed files with 226 additions and 47 deletions

View File

@@ -117,7 +117,7 @@ type satRunner interface {
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
@@ -566,11 +566,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
} }
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir baseDir = DefaultSATBaseDir
} }
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc) return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
} }
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {

View File

@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
return f.runNvidiaFn(baseDir) return f.runNvidiaFn(baseDir)
} }
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
if f.runNvidiaComputeFn != nil { if f.runNvidiaComputeFn != nil {
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices) return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
} }

View File

@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
"--seconds", strconv.Itoa(opts.DurationSec), "--seconds", strconv.Itoa(opts.DurationSec),
"--size-mb", strconv.Itoa(opts.SizeMB), "--size-mb", strconv.Itoa(opts.SizeMB),
} }
if opts.StaggerSeconds > 0 && len(selected) > 1 {
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
}
if len(selected) > 0 { if len(selected) > 0 {
cmd = append(cmd, "--devices", joinIndexList(selected)) cmd = append(cmd, "--devices", joinIndexList(selected))
} }
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
"bee-john-gpu-stress", "bee-john-gpu-stress",
"--seconds", strconv.Itoa(opts.DurationSec), "--seconds", strconv.Itoa(opts.DurationSec),
} }
if opts.StaggerSeconds > 0 && len(selected) > 1 {
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
}
if len(selected) > 0 { if len(selected) > 0 {
cmd = append(cmd, "--devices", joinIndexList(selected)) cmd = append(cmd, "--devices", joinIndexList(selected))
} }

View File

@@ -384,22 +384,36 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
), logFunc) ), logFunc)
} }
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices) selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil { if err != nil {
return "", err return "", err
} }
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec))) var (
profCmd []string
profEnv []string
)
if staggerSec > 0 && len(selected) > 1 {
profCmd = []string{
"bee-dcgmproftester-staggered",
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
"--stagger-seconds", strconv.Itoa(staggerSec),
"--devices", joinIndexList(selected),
}
} else {
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
if err != nil { if err != nil {
return "", err return "", err
} }
profEnv = nvidiaVisibleDevicesEnv(selected)
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode( return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}}, satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
satJob{ satJob{
name: "03-dcgmproftester.log", name: "03-dcgmproftester.log",
cmd: profCmd, cmd: profCmd,
env: nvidiaVisibleDevicesEnv(selected), env: profEnv,
collectGPU: true, collectGPU: true,
gpuIndices: selected, gpuIndices: selected,
}, },

View File

@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
Loader string Loader string
GPUIndices []int GPUIndices []int
ExcludeGPUIndices []int ExcludeGPUIndices []int
StaggerSeconds int
} }
func New() *System { func New() *System {

View File

@@ -487,6 +487,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
StressMode bool `json:"stress_mode"` StressMode bool `json:"stress_mode"`
GPUIndices []int `json:"gpu_indices"` GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"` ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
StaggerGPUStart bool `json:"stagger_gpu_start"`
Loader string `json:"loader"` Loader string `json:"loader"`
Profile string `json:"profile"` Profile string `json:"profile"`
DisplayName string `json:"display_name"` DisplayName string `json:"display_name"`
@@ -508,6 +509,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
StressMode: body.StressMode, StressMode: body.StressMode,
GPUIndices: body.GPUIndices, GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices, ExcludeGPUIndices: body.ExcludeGPUIndices,
StaggerGPUStart: body.StaggerGPUStart,
Loader: body.Loader, Loader: body.Loader,
BurnProfile: body.Profile, BurnProfile: body.Profile,
DisplayName: body.DisplayName, DisplayName: body.DisplayName,

View File

@@ -2121,8 +2121,12 @@ func renderBurn() string {
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p> <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div> </div>
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p> <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
<label class="cb-row" style="margin-top:10px">
<input type="checkbox" id="burn-stagger-nvidia">
<span>Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.</span>
</label>
</div>
</div> </div>
</div>
<div class="burn-section">Core Burn Paths</div> <div class="burn-section">Core Burn Paths</div>
<div class="grid2 burn-grid" style="margin-bottom:16px"> <div class="grid2 burn-grid" style="margin-bottom:16px">
@@ -2196,6 +2200,11 @@ function burnSelectedGPUIndices() {
.sort(function(a, b) { return a - b; }); .sort(function(a, b) { return a - b; });
} }
// Reports whether the "burn-stagger-nvidia" ramp-up checkbox is present on the
// page and currently ticked. A missing checkbox counts as "not requested".
function burnUseNvidiaRampUp() {
  const checkbox = document.getElementById('burn-stagger-nvidia');
  if (!checkbox) {
    return false;
  }
  return Boolean(checkbox.checked);
}
function burnUpdateSelectionNote() { function burnUpdateSelectionNote() {
const note = document.getElementById('burn-selection-note'); const note = document.getElementById('burn-selection-note');
const selected = burnSelectedGPUIndices(); const selected = burnSelectedGPUIndices();
@@ -2255,6 +2264,9 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
return Promise.reject(new Error('Select at least one NVIDIA GPU.')); return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
} }
body.gpu_indices = selected; body.gpu_indices = selected;
if (burnUseNvidiaRampUp() && selected.length > 1) {
body.stagger_gpu_start = true;
}
} }
return fetch('/api/sat/' + target + '/run', { return fetch('/api/sat/' + target + '/run', {
method: 'POST', method: 'POST',

View File

@@ -118,6 +118,7 @@ type taskParams struct {
StressMode bool `json:"stress_mode,omitempty"` StressMode bool `json:"stress_mode,omitempty"`
GPUIndices []int `json:"gpu_indices,omitempty"` GPUIndices []int `json:"gpu_indices,omitempty"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"` ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
SizeMB int `json:"size_mb,omitempty"` SizeMB int `json:"size_mb,omitempty"`
Passes int `json:"passes,omitempty"` Passes int `json:"passes,omitempty"`
Loader string `json:"loader,omitempty"` Loader string `json:"loader,omitempty"`
@@ -162,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
} }
} }
// boolToNvidiaStaggerSeconds turns the ramp-up flag into a per-GPU stagger
// interval in seconds. It returns 180 only when ramp-up is enabled and more
// than one GPU index is explicitly listed in selected; in every other case,
// including an empty selection, it returns 0 (no staggering).
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
	const perGPUSeconds = 180
	if !enabled || len(selected) <= 1 {
		return 0
	}
	return perGPUSeconds
}
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions { func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
acceptanceCycles := []platform.PlatformStressCycle{ acceptanceCycles := []platform.PlatformStressCycle{
{LoadSec: 85, IdleSec: 5}, {LoadSec: 85, IdleSec: 5},
@@ -601,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
if t.params.BurnProfile != "" && dur <= 0 { if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
} }
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append) staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
if staggerSec > 0 {
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
case "nvidia-targeted-power": case "nvidia-targeted-power":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
@@ -656,6 +668,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
Loader: t.params.Loader, Loader: t.params.Loader,
GPUIndices: t.params.GPUIndices, GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices, ExcludeGPUIndices: t.params.ExcludeGPUIndices,
StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
}, j.append) }, j.append)
case "memory": case "memory":
if a == nil { if a == nil {

View File

@@ -0,0 +1,110 @@
#!/bin/sh
# Staggered dcgmproftester burn: starts one dcgmproftester worker per selected
# NVIDIA GPU, pausing --stagger-seconds between launches. Earlier workers get a
# proportionally longer duration so that every GPU finishes at the same moment,
# i.e. all GPUs are held under load together for --seconds after the last one
# has started.
#
# Options:
#   --seconds N, -t N     full-load hold time in seconds (default 300)
#   --stagger-seconds N   delay between GPU launches in seconds (default 180)
#   --devices 0,1         GPU indices to use (default: all reported by nvidia-smi)
#   --exclude 2,3         GPU indices to drop from the selection
#
# Exit status: 0 if every worker succeeded, 1 if any worker failed or no GPU /
# dcgmproftester binary was found, 2 on usage errors, 130/143 on INT/TERM.
set -eu

# Deliberately not called SECONDS: bash, ksh and zsh treat SECONDS as a
# self-incrementing timer, which would silently inflate the duration handed to
# every worker launched after a sleep when /bin/sh is one of those shells.
DURATION_SEC=300
STAGGER_SECONDS=180
DEVICES=""
EXCLUDE=""
TMP_DIR=""
WORKERS=""
SLEEP_PID=""

usage() {
    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}

# is_uint VALUE: succeeds only for a plain non-negative decimal integer.
# Leading zeros are rejected because shell arithmetic would read them as octal.
is_uint() {
    case "${1:-}" in
        ''|*[!0-9]*|0?*) return 1 ;;
    esac
    return 0
}

# normalize_list CSV: strips whitespace and empty items, sorts numerically,
# removes duplicates and prints the result as a comma-separated list.
normalize_list() {
    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}

# contains_csv NEEDLE CSV: succeeds when NEEDLE is an exact item of CSV.
# Uses a literal pattern match so the needle is never interpreted as a regex.
contains_csv() {
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
    esac
    return 1
}

# resolve_dcgmproftester: prints the path of the first dcgmproftester variant
# found in PATH; fails when none is installed.
resolve_dcgmproftester() {
    for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
        if command -v "${candidate}" >/dev/null 2>&1; then
            command -v "${candidate}"
            return 0
        fi
    done
    return 1
}

# cleanup: EXIT handler. Stops a pending stagger sleep and any workers still
# listed in WORKERS, then removes the temp directory. Signals on already
# finished PIDs are ignored.
cleanup() {
    trap - EXIT INT TERM
    [ -z "${SLEEP_PID}" ] || kill "${SLEEP_PID}" 2>/dev/null || true
    for spec in ${WORKERS}; do
        kill "${spec%%:*}" 2>/dev/null || true
    done
    [ -z "${TMP_DIR}" ] || rm -rf "${TMP_DIR}"
}

# stagger_pause N: sleeps N seconds in the background and waits on it, so a
# trapped INT/TERM interrupts the wait immediately instead of being deferred
# until a foreground sleep returns.
stagger_pause() {
    sleep "$1" &
    SLEEP_PID=$!
    wait "${SLEEP_PID}" || true
    SLEEP_PID=""
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

# Both values feed shell arithmetic below; reject anything that is not a
# non-negative integer before any worker is started.
is_uint "${DURATION_SEC}" || { echo "--seconds must be a non-negative integer" >&2; exit 2; }
is_uint "${STAGGER_SECONDS}" || { echo "--stagger-seconds must be a non-negative integer" >&2; exit 2; }

PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }

ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")

# No explicit --devices means "every GPU nvidia-smi reports".
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

echo "loader=dcgmproftester-staggered"
echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"

# Enumerate CUDA devices in PCI bus order so that the nvidia-smi indices used
# in CUDA_VISIBLE_DEVICES below address the same physical GPUs.
export CUDA_DEVICE_ORDER="PCI_BUS_ID"

TMP_DIR=$(mktemp -d)
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
gpu_pos=0
for id in $(echo "${FINAL}" | tr ',' ' '); do
    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    # Earlier GPUs run longer by one stagger interval per GPU still to be
    # started, so all workers end together.
    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( DURATION_SEC + extra_sec ))
    echo "starting gpu ${id} seconds=${gpu_seconds}"
    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
        stagger_pause "${STAGGER_SECONDS}"
    fi
done

status=0
for spec in ${WORKERS}; do
    # Each entry is pid:gpu-id:log-path.
    pid=${spec%%:*}
    rest=${spec#*:}
    id=${rest%%:*}
    log=${rest#*:}
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    sed "s/^/[gpu ${id}] /" "${log}" || true
done

# Every worker has been reaped; make sure the EXIT handler signals nothing.
WORKERS=""
exit "${status}"

17
iso/overlay/usr/local/bin/bee-gpu-burn Normal file → Executable file
View File

@@ -2,13 +2,14 @@
set -eu set -eu
SECONDS=5 SECONDS=5
STAGGER_SECONDS=0
SIZE_MB=0 SIZE_MB=0
DEVICES="" DEVICES=""
EXCLUDE="" EXCLUDE=""
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker" WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
usage() { usage() {
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2 echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
exit 2 exit 2
} }
@@ -25,6 +26,7 @@ contains_csv() {
while [ "$#" -gt 0 ]; do while [ "$#" -gt 0 ]; do
case "$1" in case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;; --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;; --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;; --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;; --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
@@ -61,14 +63,18 @@ done
echo "loader=bee-gpu-burn" echo "loader=bee-gpu-burn"
echo "selected_gpus=${FINAL}" echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"
export CUDA_DEVICE_ORDER="PCI_BUS_ID" export CUDA_DEVICE_ORDER="PCI_BUS_ID"
TMP_DIR=$(mktemp -d) TMP_DIR=$(mktemp -d)
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
gpu_pos=0
WORKERS="" WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do for id in $(echo "${FINAL}" | tr ',' ' '); do
gpu_pos=$((gpu_pos + 1))
log="${TMP_DIR}/gpu-${id}.log" log="${TMP_DIR}/gpu-${id}.log"
gpu_size_mb="${SIZE_MB}" gpu_size_mb="${SIZE_MB}"
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
gpu_size_mb=512 gpu_size_mb=512
fi fi
fi fi
echo "starting gpu ${id} size=${gpu_size_mb}MB" extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
gpu_seconds=$(( SECONDS + extra_sec ))
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
CUDA_VISIBLE_DEVICES="${id}" \ CUDA_VISIBLE_DEVICES="${id}" \
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 & "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
pid=$! pid=$!
WORKERS="${WORKERS} ${pid}:${id}:${log}" WORKERS="${WORKERS} ${pid}:${id}:${log}"
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
sleep "${STAGGER_SECONDS}"
fi
done done
status=0 status=0

16
iso/overlay/usr/local/bin/bee-john-gpu-stress Normal file → Executable file
View File

@@ -2,6 +2,7 @@
set -eu set -eu
DURATION_SEC=300 DURATION_SEC=300
STAGGER_SECONDS=0
DEVICES="" DEVICES=""
EXCLUDE="" EXCLUDE=""
FORMAT="" FORMAT=""
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
usage() { usage() {
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2 echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
exit 2 exit 2
} }
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
while [ "$#" -gt 0 ]; do while [ "$#" -gt 0 ]; do
case "$1" in case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;; --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;; --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;; --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;; --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -170,6 +172,7 @@ done
echo "loader=john" echo "loader=john"
echo "selected_gpus=${FINAL}" echo "selected_gpus=${FINAL}"
echo "john_devices=${JOHN_DEVICES}" echo "john_devices=${JOHN_DEVICES}"
echo "stagger_seconds=${STAGGER_SECONDS}"
cd "${JOHN_DIR}" cd "${JOHN_DIR}"
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
echo "format=${CHOSEN_FORMAT}" echo "format=${CHOSEN_FORMAT}"
echo "target_seconds=${DURATION_SEC}" echo "target_seconds=${DURATION_SEC}"
echo "slice_seconds=${TEST_SLICE_SECONDS}" echo "slice_seconds=${TEST_SLICE_SECONDS}"
DEADLINE=$(( $(date +%s) + DURATION_SEC )) TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
_first=1 _first=1
pos=0
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
pos=$((pos + 1))
[ "${_first}" = "1" ] || sleep 3 [ "${_first}" = "1" ] || sleep 3
_first=0 _first=0
run_john_loop "${opencl_id}" "${DEADLINE}" & extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
run_john_loop "${opencl_id}" "${deadline}" &
pid=$! pid=$!
PIDS="${PIDS} ${pid}" PIDS="${PIDS} ${pid}"
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
sleep "${STAGGER_SECONDS}"
fi
done done
FAIL=0 FAIL=0
for pid in ${PIDS}; do for pid in ${PIDS}; do