Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | cd9e2cbe13 | | |
| | 0317dc58fd | | |
| | 1c5cb45698 | | |
@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
}
|
}
|
||||||
|
|
||||||
if rampUp && len(body.GPUIndices) > 1 {
|
if rampUp && len(body.GPUIndices) > 1 {
|
||||||
// Ramp-up mode: resolve GPU list, then create one task per prefix
|
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
|
||||||
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
|
// in Phase 2 (one additional GPU per step). A single task with all
|
||||||
|
// selected GPUs is sufficient — spawning N tasks with growing subsets
|
||||||
|
// would repeat all earlier steps redundantly.
|
||||||
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusBadRequest, err.Error())
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
} else {
|
} else {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||||
var allTasks []*Task
|
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
|
||||||
for step := 1; step <= len(resolved); step++ {
|
t := &Task{
|
||||||
subset := resolved[:step]
|
ID: newJobID("bee-bench-nvidia"),
|
||||||
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
|
Name: taskName,
|
||||||
t := &Task{
|
Target: target,
|
||||||
ID: newJobID("bee-bench-nvidia"),
|
Priority: defaultTaskPriority(target, taskParams{}),
|
||||||
Name: stepName,
|
Status: TaskPending,
|
||||||
Target: target,
|
CreatedAt: now,
|
||||||
Priority: defaultTaskPriority(target, taskParams{}),
|
params: taskParams{
|
||||||
Status: TaskPending,
|
GPUIndices: append([]int(nil), resolved...),
|
||||||
CreatedAt: now,
|
SizeMB: body.SizeMB,
|
||||||
params: taskParams{
|
BenchmarkProfile: body.Profile,
|
||||||
GPUIndices: append([]int(nil), subset...),
|
RunNCCL: runNCCL,
|
||||||
SizeMB: body.SizeMB,
|
ParallelGPUs: true,
|
||||||
BenchmarkProfile: body.Profile,
|
RampTotal: len(resolved),
|
||||||
RunNCCL: runNCCL && step == len(resolved),
|
RampRunID: rampRunID,
|
||||||
ParallelGPUs: true,
|
DisplayName: taskName,
|
||||||
RampStep: step,
|
},
|
||||||
RampTotal: len(resolved),
|
|
||||||
RampRunID: rampRunID,
|
|
||||||
DisplayName: stepName,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
allTasks = append(allTasks, t)
|
|
||||||
}
|
}
|
||||||
for _, t := range allTasks {
|
globalQueue.enqueue(t)
|
||||||
globalQueue.enqueue(t)
|
writeTaskRunResponse(w, []*Task{t})
|
||||||
}
|
|
||||||
writeTaskRunResponse(w, allTasks)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,6 +26,14 @@ fail_or_warn() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
|
||||||
|
# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
|
||||||
|
# The template already has memtest entries hardcoded, so a missing config file
|
||||||
|
# here is not an error; validate_iso_memtest() checks the final ISO instead.
|
||||||
|
warn_only() {
|
||||||
|
log "WARNING: $1"
|
||||||
|
}
|
||||||
|
|
||||||
copy_memtest_file() {
|
copy_memtest_file() {
|
||||||
src="$1"
|
src="$1"
|
||||||
dst_name="${2:-$(basename "$src")}"
|
dst_name="${2:-$(basename "$src")}"
|
||||||
@@ -61,15 +69,17 @@ extract_memtest_from_deb() {
|
|||||||
|
|
||||||
download_and_extract_memtest() {
|
download_and_extract_memtest() {
|
||||||
tmpdl="$(mktemp -d)"
|
tmpdl="$(mktemp -d)"
|
||||||
ver_arg=""
|
|
||||||
if [ -n "${MEMTEST_VERSION:-}" ]; then
|
if [ -n "${MEMTEST_VERSION:-}" ]; then
|
||||||
ver_arg="=memtest86+=${MEMTEST_VERSION}"
|
pkg_spec="memtest86+=${MEMTEST_VERSION}"
|
||||||
log "downloading memtest86+=${MEMTEST_VERSION} from apt"
|
|
||||||
else
|
else
|
||||||
log "downloading memtest86+ from apt (no version pinned)"
|
pkg_spec="memtest86+"
|
||||||
|
fi
|
||||||
|
log "downloading ${pkg_spec} from apt"
|
||||||
|
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
|
||||||
|
log "apt download failed, retrying after apt-get update"
|
||||||
|
apt-get update -qq >/dev/null 2>&1 || true
|
||||||
|
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
|
||||||
fi
|
fi
|
||||||
# shellcheck disable=SC2086
|
|
||||||
( cd "$tmpdl" && apt-get download "memtest86+${ver_arg}" ) 2>/dev/null || true
|
|
||||||
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||||
if [ -n "$deb" ]; then
|
if [ -n "$deb" ]; then
|
||||||
extract_memtest_from_deb "$deb"
|
extract_memtest_from_deb "$deb"
|
||||||
@@ -133,7 +143,7 @@ ensure_memtest_binaries() {
|
|||||||
|
|
||||||
ensure_grub_entry() {
|
ensure_grub_entry() {
|
||||||
[ -f "$GRUB_CFG" ] || {
|
[ -f "$GRUB_CFG" ] || {
|
||||||
fail_or_warn "missing ${GRUB_CFG}"
|
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -159,7 +169,7 @@ EOF
|
|||||||
|
|
||||||
ensure_isolinux_entry() {
|
ensure_isolinux_entry() {
|
||||||
[ -f "$ISOLINUX_CFG" ] || {
|
[ -f "$ISOLINUX_CFG" ] || {
|
||||||
fail_or_warn "missing ${ISOLINUX_CFG}"
|
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user