Compare commits

..

8 Commits
v8.5 ... v8.8.1

Author SHA1 Message Date
Mikhail Chusavitin
090b92ca73 Re-enable security repo: kernel 6.1.0-44 is in bookworm-security only
Disabling --security broke the build because linux-image-6.1.0-44-amd64
is a security update not present in the base bookworm repo.
Main packages already come from mirror.mephi.ru.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 10:02:52 +03:00
Mikhail Chusavitin
2dccbc010c Use MEPHI mirror, disable security repo, fix memtest in ISO build
- Switch all lb mirrors to mirror.mephi.ru/debian/ for faster, more reliable downloads
- Disable security repo (--security false) — not needed for LiveCD
- Pin MEMTEST_VERSION=6.10-4 in VERSIONS, export to hook environment
- Set BEE_REQUIRE_MEMTEST=1 in build-in-container.sh — missing memtest is now fatal
- Fix 9100-memtest.hook.binary: add apt-get download fallback when lb
  binary_memtest has already purged the package cache; handle both 5.x
  (memtest86+x64.bin) and 6.x (memtest86+.bin) BIOS binary naming

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 09:57:29 +03:00
e84c69d360 Fix optional step log dir missing after memtest recovery
mkdir -p LOG_DIR before writing the optional step log so that a race
with cleanup_build_log (EXIT trap archiving the log dir) does not cause
a "Directory nonexistent" error during lb binary_checksums / lb binary_iso.

Also downgrade apt-get update failure to a warning so a transient mirror
outage does not block kernel ABI auto-detection when the apt cache is warm.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 07:28:36 +03:00
c80a39e7ac Add power results table, fix benchmark results refresh, bound memtester
- Benchmark page now shows two result sections: Performance (scores) and
  Power / Thermal Fit (slot table). After any benchmark task completes
  the results section auto-refreshes via GET /api/benchmark/results
  without a full page reload.
- Power results table shows each GPU slot with nominal TDP, achieved
  stable power limit, and P95 observed power. Rows with derated cards
  are highlighted amber so under-performing slots stand out at a glance.
  Older runs are collapsed under a <details>/<summary> toggle.
- memtester is now wrapped with timeout(1) so a stuck memory controller
  cannot cause Validate Memory to hang indefinitely. Wall-clock limit is
  ~2.5 min per 100 MB per pass plus a 2-minute buffer.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 07:16:18 +03:00
a5e0261ff2 Refactor power ramp to use true single-card baselines
Phase 1 now calibrates each GPU individually (sequentially) so that
PowerRealizationPct reflects real degradation from neighbour thermals and
shared power rails. Previously the baseline came from an all-GPU-together
run, making realization always ≈100% at the final ramp step.

Ramp step 1 reuses single-card calibration results (no extra run); steps
2..N run targeted_power on the growing GPU subset with derating active.

Remove OccupiedSlots/OccupiedSlotsNote fields and occupiedSlots() helper —
they were compensation for the old all-GPU calibration approach.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 23:47:57 +03:00
ee422ede3c Revert "Add raster Easy Bee branding assets"
This reverts commit d560b2fead.
2026-04-14 23:00:15 +03:00
d560b2fead Add raster Easy Bee branding assets 2026-04-14 22:39:25 +03:00
3cf2e9c9dc Run power calibration for all GPUs simultaneously
Previously each GPU was calibrated sequentially (one card fully done
before the next started), producing the staircase temperature pattern
seen on the graph.

Now all GPUs run together in a single dcgmi diag -r targeted_power
session per attempt. This means:
- All cards are under realistic thermal load at the same time.
- A single DCGM session handles the run — no resource-busy contention
  from concurrent dcgmi processes.
- Binary search state (lo/hi) is tracked independently per GPU; each
  card converges to its own highest stable power limit.
- Throttle counter polling covers all active GPUs in the shared ticker.
- Resource-busy exponential back-off is shared (one DCGM session).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:25:05 +03:00
11 changed files with 478 additions and 217 deletions

View File

@@ -2498,8 +2498,25 @@ func runBenchmarkPowerCalibration(
err error err error
} }
// gpuCalibState holds per-GPU binary search state during parallel calibration.
type gpuCalibState struct {
idx int
info benchmarkGPUInfo
originalLimitW int
appliedLimitW int
minLimitW int
lo int // highest verified-stable limit (assumed: minLimitW)
hi int // lowest verified-unstable limit (exclusive sentinel above start)
calib benchmarkPowerCalibrationResult
converged bool
}
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices)) results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction var restore []benchmarkRestoreAction
// Initialise per-GPU state.
states := make([]*gpuCalibState, 0, len(gpuIndices))
for _, idx := range gpuIndices { for _, idx := range gpuIndices {
info := infoByIndex[idx] info := infoByIndex[idx]
originalLimitW := int(math.Round(info.PowerLimitW)) originalLimitW := int(math.Round(info.PowerLimitW))
@@ -2528,17 +2545,17 @@ func runBenchmarkPowerCalibration(
if minLimitW < calibSearchTolerance { if minLimitW < calibSearchTolerance {
minLimitW = calibSearchTolerance minLimitW = calibSearchTolerance
} }
s := &gpuCalibState{
calib := benchmarkPowerCalibrationResult{ idx: idx,
AppliedPowerLimitW: float64(appliedLimitW), info: info,
originalLimitW: originalLimitW,
appliedLimitW: appliedLimitW,
minLimitW: minLimitW,
lo: minLimitW,
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
} }
// Binary search bounds for finding the highest stable power limit. states = append(states, s)
// lo = highest verified-stable level (assumed: minLimitW).
// hi = lowest verified-unstable level (assumed: above the starting limit).
lo := minLimitW
hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
if canDerate && originalLimitW > 0 { if canDerate && originalLimitW > 0 {
idxCopy := idx idxCopy := idx
orig := originalLimitW orig := originalLimitW
@@ -2549,200 +2566,243 @@ func runBenchmarkPowerCalibration(
}, },
}) })
} }
}
calibLoop: // Shared DCGM resource-busy back-off state (single diagnostic session).
busyRetries := 0
busyDelaySec := 1
sharedAttempt := 0
type sharedAttemptResult struct {
out []byte
rows []GPUMetricRow
err error
}
calibDone:
for {
// Collect non-converged GPUs.
var active []*gpuCalibState
for _, s := range states {
if !s.converged {
active = append(active, s)
}
}
if len(active) == 0 || ctx.Err() != nil {
break
}
sharedAttempt++
for _, s := range active {
s.calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
}
// Snapshot throttle counters for all active GPUs before the run.
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
for _, s := range active {
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
}
// Run targeted_power for ALL gpuIndices simultaneously so every card
// is under load during calibration — this reflects real server thermals.
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
}()
ticker := time.NewTicker(time.Second)
throttleReasons := make(map[int]string, len(active))
var ar sharedAttemptResult
attemptLoop:
for { for {
calib.Attempts++ select {
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec)) case ar = <-doneCh:
break attemptLoop
beforeThrottle, _ := queryThrottleCounters(idx) case <-ticker.C:
attemptCtx, cancel := context.WithCancel(ctx) // Poll throttle counters for each active GPU independently.
doneCh := make(chan calibrationAttemptResult, 1) for _, s := range active {
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts) if throttleReasons[s.idx] != "" {
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx}) continue // already detected for this GPU
go func() { }
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc) after, err := queryThrottleCounters(s.idx)
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
}()
ticker := time.NewTicker(time.Second)
var (
attempt calibrationAttemptResult
throttleReason string
)
attemptLoop:
for {
select {
case attempt = <-doneCh:
break attemptLoop
case <-ticker.C:
afterThrottle, err := queryThrottleCounters(idx)
if err != nil { if err != nil {
continue continue
} }
// Record the throttle reason but do NOT cancel the dcgmi // Record throttle but do NOT cancel — let dcgmi finish so
// process. Killing it mid-run leaves nv-hostengine holding // nv-hostengine releases the slot cleanly before the next attempt.
// the diagnostic slot, which causes DCGM_ST_IN_USE on every if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
// subsequent attempt. Let targeted_power run to its natural throttleReasons[s.idx] = reason
// end so the daemon releases the slot cleanly before we logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
// reduce power and retry.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
throttleReason = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
} }
case <-ctx.Done():
cancel()
attempt = <-doneCh
break attemptLoop
} }
case <-ctx.Done():
cancelAttempt()
ar = <-doneCh
break attemptLoop
} }
ticker.Stop() }
cancel() ticker.Stop()
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644) cancelAttempt()
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
perGPU := filterRowsByGPU(attempt.rows, idx) // Resource busy: retry with exponential back-off (shared — one DCGM session).
if ar.err != nil && isDCGMResourceBusy(ar.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
for _, s := range active {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
s.converged = true
}
logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
break calibDone
}
busyRetries++
// Undo attempt counter: busy retries don't count as real attempts.
for _, s := range active {
s.calib.Attempts--
}
logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
select {
case <-ctx.Done():
break calibDone
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1
}
busyDelaySec = next
sharedAttempt-- // retry same logical attempt number
continue
}
busyRetries = 0
busyDelaySec = 1
// Per-GPU analysis and binary search update.
for _, s := range active {
perGPU := filterRowsByGPU(ar.rows, s.idx)
summary := summarizeBenchmarkTelemetry(perGPU) summary := summarizeBenchmarkTelemetry(perGPU)
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 { throttle := throttleReasons[s.idx]
// Stable at appliedLimitW: record it and binary-search upward.
calib.Summary = summary // Cooling warning: thermal throttle with fans not at maximum.
calib.Completed = true if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
calib.AppliedPowerLimitW = float64(appliedLimitW) clocks := make([]float64, 0, len(perGPU))
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples)) var fanDutyValues []float64
lo = appliedLimitW fanDutyAvail := false
// If there is still headroom to search, try a higher level. for _, r := range perGPU {
if canDerate && hi-lo > calibSearchTolerance { if r.ClockMHz > 0 {
nextLimitW := roundTo5W((lo + hi) / 2) clocks = append(clocks, r.ClockMHz)
if nextLimitW > lo && nextLimitW < hi { }
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil { if r.FanDutyCycleAvailable {
appliedLimitW = nextLimitW fanDutyAvail = true
calib.AppliedPowerLimitW = float64(appliedLimitW) fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi)) }
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW)) }
continue calibLoop dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
s.calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttle, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
}
}
if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
// Stable at current limit — update lo and binary-search upward.
s.calib.Summary = summary
s.calib.Completed = true
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
s.lo = s.appliedLimitW
if canDerate && s.hi-s.lo > calibSearchTolerance {
next := roundTo5W((s.lo + s.hi) / 2)
if next > s.lo && next < s.hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Completed = false // keep searching
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
continue // next GPU in active list
} }
} }
} }
break s.converged = true
continue
} }
// If DCGM reports the resource is in use, nv-hostengine has not yet // Failed or throttled — log and binary-search downward.
// released the diagnostic slot from the previous attempt. Do not
// derate: wait with exponential back-off and retry at the same
// power limit. Once the back-off delay would exceed
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
// held by something else.
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
break
}
busyRetries++
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
select {
case <-ctx.Done():
break calibLoop
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
}
busyDelaySec = next
continue calibLoop
}
busyRetries = 0 // reset on any non-busy outcome
busyDelaySec = 1 // reset back-off
switch { switch {
case throttleReason != "": case throttle != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW)) logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
// Check whether the thermal throttle coincided with fans below case ar.err != nil:
// maximum: that combination suggests cooling misconfiguration s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
// rather than a fundamental power-delivery limit. logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64
fanDutyAvail := false
for _, r := range perGPU {
if r.ClockMHz > 0 {
clocks = append(clocks, r.ClockMHz)
}
if r.FanDutyCycleAvailable {
fanDutyAvail = true
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
}
}
dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttleReason, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
}
}
case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
default: default:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW)) logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
} }
if !canDerate || appliedLimitW <= 0 { if !canDerate || s.appliedLimitW <= 0 {
break s.converged = true
continue
} }
// Binary-search for the highest stable power limit. s.hi = s.appliedLimitW
// This attempt failed or throttled, so update the upper bound.
hi = appliedLimitW
if hi-lo <= calibSearchTolerance { if s.hi-s.lo <= calibSearchTolerance {
// Search range exhausted: lo is the highest verified-stable level. if s.lo > s.minLimitW {
if lo > minLimitW { s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi)) if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil { s.appliedLimitW = s.lo
appliedLimitW = lo s.calib.AppliedPowerLimitW = float64(s.lo)
calib.AppliedPowerLimitW = float64(lo) s.calib.Derated = s.lo < s.originalLimitW
calib.Derated = lo < originalLimitW
} }
} else { } else {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
} }
break s.converged = true
continue
} }
// Binary midpoint within the remaining search range. next := roundTo5W((s.lo + s.hi) / 2)
nextLimitW := roundTo5W((lo + hi) / 2) if next <= s.lo {
// Ensure the candidate is strictly inside the search range. next = s.lo + calibSearchTolerance
if nextLimitW <= lo {
nextLimitW = lo + calibSearchTolerance
} }
if nextLimitW >= hi { if next >= s.hi {
nextLimitW = (lo + hi) / 2 next = (s.lo + s.hi) / 2
} }
if nextLimitW < minLimitW { if next < s.minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
break s.converged = true
continue
} }
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil { if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error()) s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err)) logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
break s.converged = true
continue
} }
appliedLimitW = nextLimitW s.appliedLimitW = next
calib.AppliedPowerLimitW = float64(appliedLimitW) s.calib.AppliedPowerLimitW = float64(next)
calib.Derated = appliedLimitW < originalLimitW s.calib.Derated = next < s.originalLimitW
info.PowerLimitW = float64(appliedLimitW) s.info.PowerLimitW = float64(next)
infoByIndex[idx] = info infoByIndex[s.idx] = s.info
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi)) logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
} }
}
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 { for _, s := range states {
results[idx] = calib if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
results[s.idx] = s.calib
} }
} }
return results, restore return results, restore
@@ -2771,15 +2831,6 @@ func powerBenchDurationSec(profile string) int {
} }
} }
func occupiedSlots(indices []int, current int) []int {
out := make([]int, 0, len(indices))
for _, idx := range indices {
if idx != current {
out = append(out, idx)
}
}
return out
}
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo { func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
out := make(map[int]benchmarkGPUInfo, len(src)) out := make(map[int]benchmarkGPUInfo, len(src))
@@ -2827,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
b.WriteString("\n") b.WriteString("\n")
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name) fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
if gpu.OccupiedSlotsNote != "" {
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
}
for _, note := range gpu.Notes { for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note) fmt.Fprintf(&b, "- %s\n", note)
} }
@@ -2895,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
} }
durationSec := powerBenchDurationSec(opts.Profile) durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec _ = durationSec
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc) // Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
var allRestoreActions []benchmarkRestoreAction
for _, idx := range selected {
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
calibByIndex[idx] = r
}
}
defer func() { defer func() {
for i := len(restoreActions) - 1; i >= 0; i-- { for i := len(allRestoreActions) - 1; i >= 0; i-- {
restoreActions[i].fn() allRestoreActions[i].fn()
} }
}() }()
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected)) gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
@@ -2915,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.OverallStatus = "PARTIAL" result.OverallStatus = "PARTIAL"
} }
} }
occupied := occupiedSlots(selected, idx)
note := ""
if len(occupied) > 0 {
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
}
gpus = append(gpus, NvidiaPowerBenchGPU{ gpus = append(gpus, NvidiaPowerBenchGPU{
Index: idx, Index: idx,
Name: info.Name, Name: info.Name,
@@ -2931,8 +2989,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
CalibrationAttempts: calib.Attempts, CalibrationAttempts: calib.Attempts,
Derated: calib.Derated, Derated: calib.Derated,
Status: status, Status: status,
OccupiedSlots: occupied,
OccupiedSlotsNote: note,
Notes: append([]string(nil), calib.Notes...), Notes: append([]string(nil), calib.Notes...),
CoolingWarning: calib.CoolingWarning, CoolingWarning: calib.CoolingWarning,
}) })
@@ -2972,14 +3028,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
for _, gpu := range gpus { for _, gpu := range gpus {
singleByIndex[gpu.Index] = gpu singleByIndex[gpu.Index] = gpu
} }
// Phase 2: ramp — add one GPU per step and calibrate the growing subset
// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
// targeted_power with derating if degradation is detected.
for step := 1; step <= len(result.RecommendedSlotOrder); step++ { for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...) subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step)) stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
_ = os.MkdirAll(stepDir, 0755) _ = os.MkdirAll(stepDir, 0755)
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) var stepCalib map[int]benchmarkPowerCalibrationResult
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc) if step == 1 {
for i := len(stepRestore) - 1; i >= 0; i-- { // Single-GPU step — already measured in phase 1; reuse directly.
stepRestore[i].fn() stepCalib = calibByIndex
logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
} else {
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
var stepRestore []benchmarkRestoreAction
stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
}
} }
ramp := NvidiaPowerBenchStep{ ramp := NvidiaPowerBenchStep{
StepIndex: step, StepIndex: step,

View File

@@ -280,8 +280,6 @@ type NvidiaPowerBenchGPU struct {
CalibrationAttempts int `json:"calibration_attempts,omitempty"` CalibrationAttempts int `json:"calibration_attempts,omitempty"`
Derated bool `json:"derated,omitempty"` Derated bool `json:"derated,omitempty"`
Status string `json:"status"` Status string `json:"status"`
OccupiedSlots []int `json:"occupied_slots,omitempty"`
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow. // CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
CoolingWarning string `json:"cooling_warning,omitempty"` CoolingWarning string `json:"cooling_warning,omitempty"`

View File

@@ -552,9 +552,13 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
if passes <= 0 { if passes <= 0 {
passes = 1 passes = 1
} }
// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
// controller can cause memtester to spin forever on a single subtest.
timeoutSec := sizeMB*passes*150/100 + 120
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}}, {name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}}, {name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "03-free-after.log", cmd: []string{"free", "-h"}}, {name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc) }, logFunc)
} }

View File

@@ -1529,6 +1529,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
writeJSON(w, map[string]string{"status": "rolled back"}) writeJSON(w, map[string]string{"status": "rolled back"})
} }
func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
}
func (h *handler) rollbackPendingNetworkChange() error { func (h *handler) rollbackPendingNetworkChange() error {
h.pendingNetMu.Lock() h.pendingNetMu.Lock()
pnc := h.pendingNet pnc := h.pendingNet

View File

@@ -2002,7 +2002,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div> </div>
</div> </div>
` + renderBenchmarkResultsCard(opts.ExportDir) + ` `+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
<div id="benchmark-output" style="display:none;margin-top:16px" class="card"> <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div> <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2188,7 +2188,9 @@ function runNvidiaBenchmark(kind) {
if (e.data) failures += 1; if (e.data) failures += 1;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n'; term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight; term.scrollTop = term.scrollHeight;
const isLast = (idx + 1 >= taskIds.length);
streamNext(idx + 1, failures); streamNext(idx + 1, failures);
if (isLast) { benchmarkRefreshResults(); }
}); });
benchmarkES.onerror = function() { benchmarkES.onerror = function() {
if (benchmarkES) { if (benchmarkES) {
@@ -2208,18 +2210,30 @@ function runNvidiaBenchmark(kind) {
} }
benchmarkLoadGPUs(); benchmarkLoadGPUs();
// Re-fetch the rendered benchmark results and swap them into the
// #benchmark-results-section container without reloading the page.
// Best-effort: on any failure the currently displayed results stay as they are.
function benchmarkRefreshResults() {
  fetch('/api/benchmark/results')
    .then(function(r) {
      // A 4xx/5xx response carries an error body, not results markup;
      // reject so it is never injected into the page.
      if (!r.ok) throw new Error('HTTP ' + r.status);
      return r.text();
    })
    .then(function(html) {
      const el = document.getElementById('benchmark-results-section');
      if (el) el.innerHTML = html;
    })
    .catch(function() {});
}
</script>` </script>`
} }
func renderBenchmarkResultsCard(exportDir string) string { func renderBenchmarkResultsCard(exportDir string) string {
maxIdx, runs := loadBenchmarkHistory(exportDir) maxIdx, runs := loadBenchmarkHistory(exportDir)
return renderBenchmarkResultsCardFromRuns( perf := renderBenchmarkResultsCardFromRuns(
"Perf Results", "Performance Results",
"Composite score by saved benchmark run and GPU.", "Composite score by saved benchmark run and GPU.",
"No saved benchmark runs yet.", "No saved performance benchmark runs yet.",
maxIdx, maxIdx,
runs, runs,
) )
power := renderPowerBenchmarkResultsCard(exportDir)
return perf + "\n" + power
} }
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string { func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
@@ -2299,6 +2313,126 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
return maxGPUIndex, runs return maxGPUIndex, runs
} }
// renderPowerBenchmarkResultsCard renders the "Power / Thermal Fit Results"
// card as an HTML fragment.
//
// It loads every power-*/result.json found under <exportDir>/bee-bench/power
// (or under app.DefaultBeeBenchPowerDir when exportDir is blank), skipping
// files that cannot be read or decoded. The newest run by GeneratedAt is shown
// as a per-GPU table; when more than one run was loaded, a collapsible table
// lists all of them, newest first. If no usable run is found, a placeholder
// card is returned instead.
func renderPowerBenchmarkResultsCard(exportDir string) string {
	const emptyCard = `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`

	baseDir := app.DefaultBeeBenchPowerDir
	if strings.TrimSpace(exportDir) != "" {
		baseDir = filepath.Join(exportDir, "bee-bench", "power")
	}
	paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
	if err != nil || len(paths) == 0 {
		return emptyCard
	}
	sort.Strings(paths)

	type powerRun struct {
		generatedAt time.Time
		displayTime string
		result      platform.NvidiaPowerBenchResult
	}
	runs := make([]powerRun, 0, len(paths))
	for _, path := range paths {
		raw, err := os.ReadFile(path)
		if err != nil {
			continue
		}
		var r platform.NvidiaPowerBenchResult
		if err := json.Unmarshal(raw, &r); err != nil {
			continue
		}
		runs = append(runs, powerRun{
			generatedAt: r.GeneratedAt,
			displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
			result:      r,
		})
	}
	// Result files may exist while none of them is readable or valid JSON;
	// without this guard runs[0] below would panic.
	if len(runs) == 0 {
		return emptyCard
	}
	// Newest first. The stable sort keeps the path order established above for
	// runs that share a timestamp, so the output is deterministic.
	sort.SliceStable(runs, func(i, j int) bool {
		return runs[i].generatedAt.After(runs[j].generatedAt)
	})

	// Show only the most recent run's GPU slot table, plus a run history summary.
	var b strings.Builder
	b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)

	latest := runs[0].result
	b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
	if latest.Hostname != "" {
		// Separate the hostname from the timestamp so they do not run together.
		b.WriteString(` — ` + html.EscapeString(latest.Hostname))
	}
	if latest.OverallStatus != "" {
		statusColor := "var(--ok)"
		if latest.OverallStatus != "OK" {
			statusColor = "var(--warn)"
		}
		b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
	}
	b.WriteString(`</p>`)

	if len(latest.GPUs) > 0 {
		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
		b.WriteString(`</tr></thead><tbody>`)
		for _, gpu := range latest.GPUs {
			// A GPU is highlighted as derated when it is flagged as such, or
			// when its applied power limit is more than 1 W below its default.
			derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
			rowStyle := ""
			achievedStyle := ""
			if derated {
				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
				achievedStyle = ` style="color:#e6a000;font-weight:600"`
			}

			// An empty per-GPU status is displayed as OK.
			statusLabel := gpu.Status
			if statusLabel == "" {
				statusLabel = "OK"
			}
			statusColor := "var(--ok)"
			if statusLabel != "OK" {
				statusColor = "var(--warn)"
			}

			// Non-positive wattages are shown as "-" rather than "0".
			nominalStr := "-"
			if gpu.DefaultPowerLimitW > 0 {
				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
			}
			achievedStr := "-"
			if gpu.AppliedPowerLimitW > 0 {
				achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
			}
			// NOTE(review): the column header says "P95" but the value comes
			// from MaxObservedPowerW — confirm which one is intended.
			p95Str := "-"
			if gpu.MaxObservedPowerW > 0 {
				p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
			}

			b.WriteString(`<tr` + rowStyle + `>`)
			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
			b.WriteString(`<td>` + nominalStr + `</td>`)
			b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
			b.WriteString(`<td>` + p95Str + `</td>`)
			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
			b.WriteString(`</tr>`)
		}
		b.WriteString(`</tbody></table></div>`)
	}

	if len(runs) > 1 {
		b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
		b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
		for i, run := range runs {
			statusColor := "var(--ok)"
			if run.result.OverallStatus != "OK" {
				statusColor = "var(--warn)"
			}
			b.WriteString(`<tr>`)
			b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
			b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
			b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
			b.WriteString(`</tr>`)
		}
		b.WriteString(`</tbody></table></div></details>`)
	}

	b.WriteString(`</div></div>`)
	return b.String()
}
// ── Burn ────────────────────────────────────────────────────────────────────── // ── Burn ──────────────────────────────────────────────────────────────────────
func renderBurn() string { func renderBurn() string {

View File

@@ -263,6 +263,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort) mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf")) mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power")) mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
// Tasks // Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList) mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)

View File

@@ -21,3 +21,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
COMGR_VERSION=2.8.0.60304-76~22.04 COMGR_VERSION=2.8.0.60304-76~22.04
GO_VERSION=1.24.0 GO_VERSION=1.24.0
AUDIT_VERSION=1.0.0 AUDIT_VERSION=1.0.0
MEMTEST_VERSION=6.10-4

View File

@@ -23,9 +23,9 @@ lb config noauto \
--bootloaders "grub-efi,syslinux" \ --bootloaders "grub-efi,syslinux" \
--debian-installer none \ --debian-installer none \
--archive-areas "main contrib non-free non-free-firmware" \ --archive-areas "main contrib non-free non-free-firmware" \
--mirror-bootstrap "https://deb.debian.org/debian" \ --mirror-bootstrap "http://mirror.mephi.ru/debian/" \
--mirror-chroot "https://deb.debian.org/debian" \ --mirror-chroot "http://mirror.mephi.ru/debian/" \
--mirror-binary "https://deb.debian.org/debian" \ --mirror-binary "http://mirror.mephi.ru/debian/" \
--security true \ --security true \
--linux-flavours "amd64" \ --linux-flavours "amd64" \
--linux-packages "${LB_LINUX_PACKAGES}" \ --linux-packages "${LB_LINUX_PACKAGES}" \

View File

@@ -161,6 +161,7 @@ run_variant() {
-e GOMODCACHE=/cache/go-mod \ -e GOMODCACHE=/cache/go-mod \
-e TMPDIR=/cache/tmp \ -e TMPDIR=/cache/tmp \
-e BEE_CACHE_DIR=/cache/bee \ -e BEE_CACHE_DIR=/cache/bee \
-e BEE_REQUIRE_MEMTEST=1 \
-w /work \ -w /work \
"${IMAGE_REF}" \ "${IMAGE_REF}" \
sh /work/iso/builder/build.sh --variant "${_v}" \ sh /work/iso/builder/build.sh --variant "${_v}" \
@@ -175,6 +176,7 @@ run_variant() {
-e GOMODCACHE=/cache/go-mod \ -e GOMODCACHE=/cache/go-mod \
-e TMPDIR=/cache/tmp \ -e TMPDIR=/cache/tmp \
-e BEE_CACHE_DIR=/cache/bee \ -e BEE_CACHE_DIR=/cache/bee \
-e BEE_REQUIRE_MEMTEST=1 \
-w /work \ -w /work \
"${IMAGE_REF}" \ "${IMAGE_REF}" \
sh /work/iso/builder/build.sh --variant "${_v}" sh /work/iso/builder/build.sh --variant "${_v}"

View File

@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
. "${BUILDER_DIR}/VERSIONS" . "${BUILDER_DIR}/VERSIONS"
export MEMTEST_VERSION
export PATH="$PATH:/usr/local/go/bin" export PATH="$PATH:/usr/local/go/bin"
: "${BEE_REQUIRE_MEMTEST:=0}" : "${BEE_REQUIRE_MEMTEST:=0}"
@@ -775,6 +776,7 @@ run_optional_step_sh() {
return 0 return 0
fi fi
mkdir -p "${LOG_DIR}" 2>/dev/null || true
step_log="${LOG_DIR}/${step_slug}.log" step_log="${LOG_DIR}/${step_slug}.log"
echo "" echo ""
echo "=== optional step: ${step_name} ===" echo "=== optional step: ${step_name} ==="
@@ -798,13 +800,14 @@ start_build_log
# install them on the fly so NVIDIA modules and ISO kernel always match. # install them on the fly so NVIDIA modules and ISO kernel always match.
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
echo "=== refreshing apt index to detect current kernel ABI ===" echo "=== refreshing apt index to detect current kernel ABI ==="
apt-get update -qq apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \ DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \ | awk '/Depends:.*linux-image-[0-9]/{print $2}' \
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \ | grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
| head -1) | head -1)
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2 echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
exit 1 exit 1
fi fi
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ===" echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="

View File

@@ -5,6 +5,8 @@ set -e
: "${BEE_REQUIRE_MEMTEST:=0}" : "${BEE_REQUIRE_MEMTEST:=0}"
# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi" MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
BINARY_BOOT_DIR="binary/boot" BINARY_BOOT_DIR="binary/boot"
GRUB_CFG="binary/boot/grub/grub.cfg" GRUB_CFG="binary/boot/grub/grub.cfg"
@@ -26,13 +28,13 @@ fail_or_warn() {
copy_memtest_file() { copy_memtest_file() {
src="$1" src="$1"
base="$(basename "$src")" dst_name="${2:-$(basename "$src")}"
dst="${BINARY_BOOT_DIR}/${base}" dst="${BINARY_BOOT_DIR}/${dst_name}"
[ -f "$src" ] || return 1 [ -f "$src" ] || return 1
mkdir -p "${BINARY_BOOT_DIR}" mkdir -p "${BINARY_BOOT_DIR}"
cp "$src" "$dst" cp "$src" "$dst"
log "copied ${base} from ${src}" log "copied ${dst_name} from ${src}"
} }
extract_memtest_from_deb() { extract_memtest_from_deb() {
@@ -41,14 +43,42 @@ extract_memtest_from_deb() {
log "extracting memtest payload from ${deb}" log "extracting memtest payload from ${deb}"
dpkg-deb -x "$deb" "$tmpdir" dpkg-deb -x "$deb" "$tmpdir"
for f in ${MEMTEST_FILES}; do
if [ -f "${tmpdir}/boot/${f}" ]; then # EFI binary: both 5.x and 6.x use memtest86+x64.efi
copy_memtest_file "${tmpdir}/boot/${f}" if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
fi copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
done fi
# BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
fi
rm -rf "$tmpdir" rm -rf "$tmpdir"
} }
# download_and_extract_memtest: fetch the memtest86+ .deb with apt-get into a
# scratch directory and pass it to extract_memtest_from_deb.
# When MEMTEST_VERSION is set, exactly that version is requested; otherwise apt
# chooses the version. A failed download is only logged, never fatal here.
download_and_extract_memtest() {
    tmpdl="$(mktemp -d)"
    ver_arg=""
    if [ -n "${MEMTEST_VERSION:-}" ]; then
        # apt's pin syntax is "<package>=<version>". The package name is
        # prepended in the apt-get call below, so only "=<version>" goes here;
        # repeating the name would produce an invalid spec and the download
        # would always fail.
        ver_arg="=${MEMTEST_VERSION}"
        log "downloading memtest86+=${MEMTEST_VERSION} from apt"
    else
        log "downloading memtest86+ from apt (no version pinned)"
    fi
    # Run in a subshell so the cd does not leak into the rest of the hook.
    ( cd "$tmpdl" && apt-get download "memtest86+${ver_arg}" ) 2>/dev/null || true
    deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
    if [ -n "$deb" ]; then
        extract_memtest_from_deb "$deb"
    else
        log "apt download of memtest86+ failed"
    fi
    rm -rf "$tmpdl"
}
ensure_memtest_binaries() { ensure_memtest_binaries() {
missing=0 missing=0
for f in ${MEMTEST_FILES}; do for f in ${MEMTEST_FILES}; do
@@ -56,10 +86,15 @@ ensure_memtest_binaries() {
done done
[ "$missing" -eq 1 ] || return 0 [ "$missing" -eq 1 ] || return 0
# 1. Try files already placed by lb binary_memtest or chroot
for root in chroot/boot /boot; do for root in chroot/boot /boot; do
for f in ${MEMTEST_FILES}; do for f in ${MEMTEST_FILES}; do
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true [ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
done done
# 6.x BIOS binary may lack x64 in name — copy with normalised name
if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
fi
done done
missing=0 missing=0
@@ -68,6 +103,7 @@ ensure_memtest_binaries() {
done done
[ "$missing" -eq 1 ] || return 0 [ "$missing" -eq 1 ] || return 0
# 2. Try apt package cache (may be empty if lb binary_memtest already purged)
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
[ -d "$root" ] || continue [ -d "$root" ] || continue
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)" deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
@@ -76,6 +112,15 @@ ensure_memtest_binaries() {
break break
done done
missing=0
for f in ${MEMTEST_FILES}; do
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
done
[ "$missing" -eq 1 ] || return 0
# 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
download_and_extract_memtest
missing=0 missing=0
for f in ${MEMTEST_FILES}; do for f in ${MEMTEST_FILES}; do
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then