Compare commits

..

3 Commits
v8.8.1 ... v8.9

Author SHA1 Message Date
Mikhail Chusavitin
cd9e2cbe13 Fix ramp-up power bench: one task instead of N redundant tasks
RunNvidiaPowerBench already performs a full internal ramp from 1 to N
GPUs in Phase 2. Spawning N tasks with growing GPU subsets meant that task K
re-ran steps 1..K-1, which tasks 1..K-1 had already covered — O(N²) work
instead of O(N). Replace them with a single task that uses all selected GPUs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 12:29:11 +03:00
Mikhail Chusavitin
0317dc58fd Fix memtest hook: grub.cfg/live.cfg missing during binary hooks is expected
lb binary_grub-efi and lb binary_syslinux create these files from templates
that already have memtest entries hardcoded. The hook should not fail when
the files don't exist yet — validate_iso_memtest() checks the final ISO.
Only the binary files (x64.bin, x64.efi) are required here.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 10:33:22 +03:00
Mikhail Chusavitin
1c5cb45698 Fix memtest hook: bad ver_arg format in apt-get download
ver_arg was set to "=memtest86+=VERSION" making the command
"apt-get download memtest86+=memtest86+=VERSION" (invalid).
Fixed to build pkg_spec directly as "memtest86+=VERSION".
Also add apt-get update retry if initial download fails.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 10:15:01 +03:00
2 changed files with 42 additions and 38 deletions

View File

@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
} }
if rampUp && len(body.GPUIndices) > 1 { if rampUp && len(body.GPUIndices) > 1 {
// Ramp-up mode: resolve GPU list, then create one task per prefix // Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel. // in Phase 2 (one additional GPU per step). A single task with all
// selected GPUs is sufficient — spawning N tasks with growing subsets
// would repeat all earlier steps redundantly.
gpus, err := apiListNvidiaGPUs(h.opts.App) gpus, err := apiListNvidiaGPUs(h.opts.App)
if err != nil { if err != nil {
writeError(w, http.StatusBadRequest, err.Error()) writeError(w, http.StatusBadRequest, err.Error())
@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
} else { } else {
now := time.Now() now := time.Now()
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405")) rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
var allTasks []*Task taskName := fmt.Sprintf("%s · ramp 1%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
for step := 1; step <= len(resolved); step++ { t := &Task{
subset := resolved[:step] ID: newJobID("bee-bench-nvidia"),
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset)) Name: taskName,
t := &Task{ Target: target,
ID: newJobID("bee-bench-nvidia"), Priority: defaultTaskPriority(target, taskParams{}),
Name: stepName, Status: TaskPending,
Target: target, CreatedAt: now,
Priority: defaultTaskPriority(target, taskParams{}), params: taskParams{
Status: TaskPending, GPUIndices: append([]int(nil), resolved...),
CreatedAt: now, SizeMB: body.SizeMB,
params: taskParams{ BenchmarkProfile: body.Profile,
GPUIndices: append([]int(nil), subset...), RunNCCL: runNCCL,
SizeMB: body.SizeMB, ParallelGPUs: true,
BenchmarkProfile: body.Profile, RampTotal: len(resolved),
RunNCCL: runNCCL && step == len(resolved), RampRunID: rampRunID,
ParallelGPUs: true, DisplayName: taskName,
RampStep: step, },
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
} }
for _, t := range allTasks { globalQueue.enqueue(t)
globalQueue.enqueue(t) writeTaskRunResponse(w, []*Task{t})
}
writeTaskRunResponse(w, allTasks)
return return
} }
} }

View File

@@ -26,6 +26,14 @@ fail_or_warn() {
return 0 return 0
} }
# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
# The template already has memtest entries hardcoded, so a missing config file
# here is not an error; validate_iso_memtest() checks the final ISO instead.
warn_only() {
# Pass the message in $1 to log with a "WARNING: " prefix. This function has
# no failure branch of its own; its exit status is whatever log returns.
log "WARNING: $1"
}
copy_memtest_file() { copy_memtest_file() {
src="$1" src="$1"
dst_name="${2:-$(basename "$src")}" dst_name="${2:-$(basename "$src")}"
@@ -61,15 +69,17 @@ extract_memtest_from_deb() {
download_and_extract_memtest() { download_and_extract_memtest() {
tmpdl="$(mktemp -d)" tmpdl="$(mktemp -d)"
ver_arg=""
if [ -n "${MEMTEST_VERSION:-}" ]; then if [ -n "${MEMTEST_VERSION:-}" ]; then
ver_arg="=memtest86+=${MEMTEST_VERSION}" pkg_spec="memtest86+=${MEMTEST_VERSION}"
log "downloading memtest86+=${MEMTEST_VERSION} from apt"
else else
log "downloading memtest86+ from apt (no version pinned)" pkg_spec="memtest86+"
fi
log "downloading ${pkg_spec} from apt"
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
log "apt download failed, retrying after apt-get update"
apt-get update -qq >/dev/null 2>&1 || true
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
fi fi
# shellcheck disable=SC2086
( cd "$tmpdl" && apt-get download "memtest86+${ver_arg}" ) 2>/dev/null || true
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)" deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
if [ -n "$deb" ]; then if [ -n "$deb" ]; then
extract_memtest_from_deb "$deb" extract_memtest_from_deb "$deb"
@@ -133,7 +143,7 @@ ensure_memtest_binaries() {
ensure_grub_entry() { ensure_grub_entry() {
[ -f "$GRUB_CFG" ] || { [ -f "$GRUB_CFG" ] || {
fail_or_warn "missing ${GRUB_CFG}" warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
return 0 return 0
} }
@@ -159,7 +169,7 @@ EOF
ensure_isolinux_entry() { ensure_isolinux_entry() {
[ -f "$ISOLINUX_CFG" ] || { [ -f "$ISOLINUX_CFG" ] || {
fail_or_warn "missing ${ISOLINUX_CFG}" warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
return 0 return 0
} }