Compare commits

...

4 Commits
v8.8 ... v8.9

Author SHA1 Message Date
Mikhail Chusavitin
cd9e2cbe13 Fix ramp-up power bench: one task instead of N redundant tasks
RunNvidiaPowerBench already performs a full internal ramp from 1 to N
GPUs in Phase 2. Spawning N tasks with growing GPU subsets meant task K
repeated all steps 1..K-1 already done by tasks 1..K-1 — O(N²) work
instead of O(N). Replace with a single task using all selected GPUs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 12:29:11 +03:00
Mikhail Chusavitin
0317dc58fd Fix memtest hook: grub.cfg/live.cfg missing during binary hooks is expected
lb binary_grub-efi and lb binary_syslinux create these files from templates
that already have memtest entries hardcoded. The hook should not fail when
the files don't exist yet — validate_iso_memtest() checks the final ISO.
Only the binary files (x64.bin, x64.efi) are required here.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 10:33:22 +03:00
Mikhail Chusavitin
1c5cb45698 Fix memtest hook: bad ver_arg format in apt-get download
ver_arg was set to "=memtest86+=VERSION" making the command
"apt-get download memtest86+=memtest86+=VERSION" (invalid).
Fixed to build pkg_spec directly as "memtest86+=VERSION".
Also retry the download after running apt-get update if the initial attempt fails.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 10:15:01 +03:00
Mikhail Chusavitin
090b92ca73 Re-enable security repo: kernel 6.1.0-44 is in bookworm-security only
Disabling --security broke the build because linux-image-6.1.0-44-amd64
is a security update not present in the base bookworm repo.
Main packages already come from mirror.mephi.ru.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 10:02:52 +03:00
3 changed files with 43 additions and 39 deletions

View File

@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
}
if rampUp && len(body.GPUIndices) > 1 {
// Ramp-up mode: resolve GPU list, then create one task per prefix
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
// in Phase 2 (one additional GPU per step). A single task with all
// selected GPUs is sufficient — spawning N tasks with growing subsets
// would repeat all earlier steps redundantly.
gpus, err := apiListNvidiaGPUs(h.opts.App)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
} else {
now := time.Now()
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
var allTasks []*Task
for step := 1; step <= len(resolved); step++ {
subset := resolved[:step]
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
t := &Task{
ID: newJobID("bee-bench-nvidia"),
Name: stepName,
Target: target,
Priority: defaultTaskPriority(target, taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), subset...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL && step == len(resolved),
ParallelGPUs: true,
RampStep: step,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
taskName := fmt.Sprintf("%s · ramp 1%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
t := &Task{
ID: newJobID("bee-bench-nvidia"),
Name: taskName,
Target: target,
Priority: defaultTaskPriority(target, taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), resolved...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: true,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: taskName,
},
}
for _, t := range allTasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, allTasks)
globalQueue.enqueue(t)
writeTaskRunResponse(w, []*Task{t})
return
}
}

View File

@@ -26,7 +26,7 @@ lb config noauto \
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
--mirror-chroot "http://mirror.mephi.ru/debian/" \
--mirror-binary "http://mirror.mephi.ru/debian/" \
--security false \
--security true \
--linux-flavours "amd64" \
--linux-packages "${LB_LINUX_PACKAGES}" \
--memtest memtest86+ \

View File

@@ -26,6 +26,14 @@ fail_or_warn() {
return 0
}
# warn_only MESSAGE
#
# Log MESSAGE with a "WARNING: " prefix through log.
#
# Used for grub.cfg / live.cfg, which may not exist yet while the binary hooks
# run: live-build generates them after this hook (lb binary_grub-efi /
# lb binary_syslinux) from a template that already has the memtest entries
# hardcoded. A missing config file at this point is therefore not an error;
# validate_iso_memtest() checks the final ISO instead.
warn_only() {
    log "WARNING: ${1}"
}
copy_memtest_file() {
src="$1"
dst_name="${2:-$(basename "$src")}"
@@ -61,15 +69,17 @@ extract_memtest_from_deb() {
download_and_extract_memtest() {
tmpdl="$(mktemp -d)"
ver_arg=""
if [ -n "${MEMTEST_VERSION:-}" ]; then
ver_arg="=memtest86+=${MEMTEST_VERSION}"
log "downloading memtest86+=${MEMTEST_VERSION} from apt"
pkg_spec="memtest86+=${MEMTEST_VERSION}"
else
log "downloading memtest86+ from apt (no version pinned)"
pkg_spec="memtest86+"
fi
log "downloading ${pkg_spec} from apt"
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
log "apt download failed, retrying after apt-get update"
apt-get update -qq >/dev/null 2>&1 || true
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
fi
# shellcheck disable=SC2086
( cd "$tmpdl" && apt-get download "memtest86+${ver_arg}" ) 2>/dev/null || true
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
if [ -n "$deb" ]; then
extract_memtest_from_deb "$deb"
@@ -133,7 +143,7 @@ ensure_memtest_binaries() {
ensure_grub_entry() {
[ -f "$GRUB_CFG" ] || {
fail_or_warn "missing ${GRUB_CFG}"
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
return 0
}
@@ -159,7 +169,7 @@ EOF
ensure_isolinux_entry() {
[ -f "$ISOLINUX_CFG" ] || {
fail_or_warn "missing ${ISOLINUX_CFG}"
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
return 0
}