Add staged NVIDIA burn ramp-up mode
This commit is contained in:
@@ -117,7 +117,7 @@ type satRunner interface {
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
@@ -566,11 +566,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
|
||||
@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaComputeFn != nil {
|
||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
|
||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
|
||||
@@ -384,25 +384,39 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
var (
|
||||
profCmd []string
|
||||
profEnv []string
|
||||
)
|
||||
if staggerSec > 0 && len(selected) > 1 {
|
||||
profCmd = []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||
"--devices", joinIndexList(selected),
|
||||
}
|
||||
} else {
|
||||
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: nvidiaVisibleDevicesEnv(selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: profEnv,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
|
||||
Loader string
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
StaggerSeconds int
|
||||
}
|
||||
|
||||
func New() *System {
|
||||
|
||||
@@ -482,12 +482,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
Loader string `json:"loader"`
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
PlatformComponents []string `json:"platform_components"`
|
||||
@@ -503,12 +504,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
StaggerGPUStart: body.StaggerGPUStart,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
|
||||
@@ -2117,12 +2117,16 @@ func renderBurn() string {
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
|
||||
</div>
|
||||
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||
</div>
|
||||
</div>
|
||||
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||
<label class="cb-row" style="margin-top:10px">
|
||||
<input type="checkbox" id="burn-stagger-nvidia">
|
||||
<span>Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="burn-section">Core Burn Paths</div>
|
||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||
@@ -2196,6 +2200,11 @@ function burnSelectedGPUIndices() {
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
|
||||
function burnUseNvidiaRampUp() {
|
||||
const el = document.getElementById('burn-stagger-nvidia');
|
||||
return !!(el && el.checked);
|
||||
}
|
||||
|
||||
function burnUpdateSelectionNote() {
|
||||
const note = document.getElementById('burn-selection-note');
|
||||
const selected = burnSelectedGPUIndices();
|
||||
@@ -2255,6 +2264,9 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
|
||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||
}
|
||||
body.gpu_indices = selected;
|
||||
if (burnUseNvidiaRampUp() && selected.length > 1) {
|
||||
body.stagger_gpu_start = true;
|
||||
}
|
||||
}
|
||||
return fetch('/api/sat/' + target + '/run', {
|
||||
method: 'POST',
|
||||
|
||||
@@ -118,6 +118,7 @@ type taskParams struct {
|
||||
StressMode bool `json:"stress_mode,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
|
||||
SizeMB int `json:"size_mb,omitempty"`
|
||||
Passes int `json:"passes,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
@@ -162,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
|
||||
}
|
||||
}
|
||||
|
||||
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
|
||||
if enabled && len(selected) > 1 {
|
||||
return 180
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||
acceptanceCycles := []platform.PlatformStressCycle{
|
||||
{LoadSec: 85, IdleSec: 5},
|
||||
@@ -592,7 +600,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
}, j.append)
|
||||
case "nvidia-compute":
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
@@ -601,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if staggerSec > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
|
||||
case "nvidia-targeted-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -651,12 +663,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
}, j.append)
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
|
||||
}, j.append)
|
||||
case "memory":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
|
||||
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
@@ -0,0 +1,110 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
SECONDS=300
|
||||
STAGGER_SECONDS=180
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
normalize_list() {
|
||||
echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
|
||||
}
|
||||
|
||||
contains_csv() {
|
||||
needle="$1"
|
||||
haystack="${2:-}"
|
||||
echo ",${haystack}," | grep -q ",${needle},"
|
||||
}
|
||||
|
||||
resolve_dcgmproftester() {
|
||||
for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
|
||||
if command -v "${candidate}" >/dev/null 2>&1; then
|
||||
command -v "${candidate}"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
*) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
|
||||
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
|
||||
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
|
||||
|
||||
DEVICES=$(normalize_list "${DEVICES}")
|
||||
EXCLUDE=$(normalize_list "${EXCLUDE}")
|
||||
SELECTED="${DEVICES}"
|
||||
if [ -z "${SELECTED}" ]; then
|
||||
SELECTED="${ALL_DEVICES}"
|
||||
fi
|
||||
|
||||
FINAL=""
|
||||
for id in $(echo "${SELECTED}" | tr ',' ' '); do
|
||||
[ -n "${id}" ] || continue
|
||||
if contains_csv "${id}" "${EXCLUDE}"; then
|
||||
continue
|
||||
fi
|
||||
if [ -z "${FINAL}" ]; then
|
||||
FINAL="${id}"
|
||||
else
|
||||
FINAL="${FINAL},${id}"
|
||||
fi
|
||||
done
|
||||
|
||||
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||
|
||||
echo "loader=dcgmproftester-staggered"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||
|
||||
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||
gpu_pos=0
|
||||
WORKERS=""
|
||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_pos=$((gpu_pos + 1))
|
||||
log="${TMP_DIR}/gpu-${id}.log"
|
||||
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||
echo "starting gpu ${id} seconds=${gpu_seconds}"
|
||||
CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
|
||||
pid=$!
|
||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||
sleep "${STAGGER_SECONDS}"
|
||||
fi
|
||||
done
|
||||
|
||||
status=0
|
||||
for spec in ${WORKERS}; do
|
||||
pid=${spec%%:*}
|
||||
rest=${spec#*:}
|
||||
id=${rest%%:*}
|
||||
log=${rest#*:}
|
||||
if wait "${pid}"; then
|
||||
echo "gpu ${id} finished: OK"
|
||||
else
|
||||
rc=$?
|
||||
echo "gpu ${id} finished: FAILED rc=${rc}"
|
||||
status=1
|
||||
fi
|
||||
sed "s/^/[gpu ${id}] /" "${log}" || true
|
||||
done
|
||||
|
||||
exit "${status}"
|
||||
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
@@ -2,13 +2,14 @@
|
||||
set -eu
|
||||
|
||||
SECONDS=5
|
||||
STAGGER_SECONDS=0
|
||||
SIZE_MB=0
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
@@ -25,6 +26,7 @@ contains_csv() {
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
@@ -61,14 +63,18 @@ done
|
||||
|
||||
echo "loader=bee-gpu-burn"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||
|
||||
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||
gpu_pos=0
|
||||
WORKERS=""
|
||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_pos=$((gpu_pos + 1))
|
||||
log="${TMP_DIR}/gpu-${id}.log"
|
||||
gpu_size_mb="${SIZE_MB}"
|
||||
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
||||
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_size_mb=512
|
||||
fi
|
||||
fi
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
||||
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||
CUDA_VISIBLE_DEVICES="${id}" \
|
||||
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||
pid=$!
|
||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||
sleep "${STAGGER_SECONDS}"
|
||||
fi
|
||||
done
|
||||
|
||||
status=0
|
||||
|
||||
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
@@ -2,6 +2,7 @@
|
||||
set -eu
|
||||
|
||||
DURATION_SEC=300
|
||||
STAGGER_SECONDS=0
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
FORMAT=""
|
||||
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||
@@ -170,6 +172,7 @@ done
|
||||
echo "loader=john"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "john_devices=${JOHN_DEVICES}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
cd "${JOHN_DIR}"
|
||||
|
||||
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
|
||||
echo "format=${CHOSEN_FORMAT}"
|
||||
echo "target_seconds=${DURATION_SEC}"
|
||||
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
||||
TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||
_first=1
|
||||
pos=0
|
||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||
pos=$((pos + 1))
|
||||
[ "${_first}" = "1" ] || sleep 3
|
||||
_first=0
|
||||
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
||||
extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
|
||||
deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
|
||||
run_john_loop "${opencl_id}" "${deadline}" &
|
||||
pid=$!
|
||||
PIDS="${PIDS} ${pid}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
|
||||
sleep "${STAGGER_SECONDS}"
|
||||
fi
|
||||
done
|
||||
FAIL=0
|
||||
for pid in ${PIDS}; do
|
||||
|
||||
Reference in New Issue
Block a user