Fix memtest hook: bad ver_arg format in apt-get download

ver_arg was set to "=memtest86+=VERSION", which made the command "apt-get download memtest86+=memtest86+=VERSION" (invalid). The hook now builds pkg_spec directly as "memtest86+=VERSION". It also adds an apt-get update retry if the initial download fails. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Re-enable security repo: kernel 6.1.0-44 is in bookworm-security only
2026-04-15 10:15:01 +03:00 · 2026-04-15 10:02:52 +03:00 · 2026-04-15 09:57:29 +03:00 · 2026-04-15 07:28:36 +03:00 · 2026-04-15 07:16:18 +03:00 · 2026-04-14 23:47:57 +03:00
11 changed files with 549 additions and 163 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -49,6 +49,10 @@ type benchmarkPowerCalibrationResult struct {
 	Derated            bool
 	Completed          bool
 	Notes              []string
 	// CoolingWarning is set when the GPU throttled thermally with a clock drop
 	// ≥20% while server fans were below 100% duty cycle — a signal that the
 	// cooling system may not be correctly configured for full GPU load.
 	CoolingWarning string
 }
 type benchmarkBurnProfile struct {
@@ -344,6 +348,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				gpuResult.PowerCalibrationTries = calib.Attempts
 				gpuResult.PowerLimitDerated = calib.Derated
 				gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
 				if calib.CoolingWarning != "" {
 					gpuResult.CoolingWarning = calib.CoolingWarning
 				}
 			}
 			if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 				gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -1625,7 +1632,15 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 			case "power_capped":
 				findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
 			case "thermal_limited":
-				findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
+				msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)
 				if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
 					result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 {
 					msg += fmt.Sprintf(
 						" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.",
 						result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct,
 					)
 				}
 				findings = append(findings, msg)
 			case "sync_boost_limited":
 				findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
 			case "low_sm_clock_vs_target":
@@ -1642,6 +1657,12 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 				findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
 			}
 		}
 		if gpu.CoolingWarning != "" {
 			findings = append(findings, fmt.Sprintf(
 				"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
 				gpu.Index, gpu.CoolingWarning,
 			))
 		}
 		if len(gpu.PrecisionFailures) > 0 {
 			findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
 		}
@@ -2044,6 +2065,9 @@ func runNvidiaBenchmarkParallel(
 			r.PowerCalibrationTries = calib.Attempts
 			r.PowerLimitDerated = calib.Derated
 			r.Notes = append(r.Notes, calib.Notes...)
 			if calib.CoolingWarning != "" {
 				r.CoolingWarning = calib.CoolingWarning
 			}
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -2448,8 +2472,10 @@ func runBenchmarkPowerCalibration(
 	logFunc func(string),
 ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
 	const calibDurationSec = 120
 	const derateStepW = 25
 	const maxDerateW = 150
 	// calibSearchTolerance is the binary-search convergence threshold in watts.
 	// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
 	const calibSearchTolerance = 10
 	// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
 	// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
 	// doubling each retry until it would exceed the cap, at which point the
@@ -2472,8 +2498,25 @@ func runBenchmarkPowerCalibration(
 		err  error
 	}
 	// gpuCalibState holds per-GPU binary search state during parallel calibration.
 	type gpuCalibState struct {
 		idx            int
 		info           benchmarkGPUInfo
 		originalLimitW int
 		appliedLimitW  int
 		minLimitW      int
 		lo             int // highest verified-stable limit (assumed: minLimitW)
 		hi             int // lowest verified-unstable limit (exclusive sentinel above start)
 		calib          benchmarkPowerCalibrationResult
 		converged      bool
 	}
 	results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
 	var restore []benchmarkRestoreAction
 	// Initialise per-GPU state.
 	states := make([]*gpuCalibState, 0, len(gpuIndices))
 	for _, idx := range gpuIndices {
 		info := infoByIndex[idx]
 		originalLimitW := int(math.Round(info.PowerLimitW))
@@ -2499,15 +2542,20 @@ func runBenchmarkPowerCalibration(
 		case appliedLimitW > 0:
 			minLimitW = appliedLimitW - maxDerateW
 		}
-		if minLimitW < derateStepW {
+		if minLimitW < calibSearchTolerance {
-			minLimitW = derateStepW
+			minLimitW = calibSearchTolerance
 		}
-
+		s := &gpuCalibState{
-		calib := benchmarkPowerCalibrationResult{
+			idx:            idx,
-			AppliedPowerLimitW: float64(appliedLimitW),
+			info:           info,
 			originalLimitW: originalLimitW,
 			appliedLimitW:  appliedLimitW,
 			minLimitW:      minLimitW,
 			lo:             minLimitW,
 			hi:             appliedLimitW + 1, // not yet tested, not yet confirmed unstable
 			calib:          benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
 		}
-		busyRetries := 0
+		states = append(states, s)
 		busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
 		if canDerate && originalLimitW > 0 {
 			idxCopy := idx
 			orig := originalLimitW
@@ -2518,125 +2566,243 @@ func runBenchmarkPowerCalibration(
 				},
 			})
 		}
 	}
-	calibLoop:
+	// Shared DCGM resource-busy back-off state (single diagnostic session).
 	busyRetries := 0
 	busyDelaySec := 1
 	sharedAttempt := 0
 	type sharedAttemptResult struct {
 		out  []byte
 		rows []GPUMetricRow
 		err  error
 	}
 calibDone:
 	for {
 		// Collect non-converged GPUs.
 		var active []*gpuCalibState
 		for _, s := range states {
 			if !s.converged {
 				active = append(active, s)
 			}
 		}
 		if len(active) == 0 || ctx.Err() != nil {
 			break
 		}
 		sharedAttempt++
 		for _, s := range active {
 			s.calib.Attempts++
 			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
 		}
 		// Snapshot throttle counters for all active GPUs before the run.
 		beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
 		for _, s := range active {
 			beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
 		}
 		// Run targeted_power for ALL gpuIndices simultaneously so every card
 		// is under load during calibration — this reflects real server thermals.
 		logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
 		cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
 		attemptCtx, cancelAttempt := context.WithCancel(ctx)
 		doneCh := make(chan sharedAttemptResult, 1)
 		go func() {
 			out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
 			doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
 		}()
 		ticker := time.NewTicker(time.Second)
 		throttleReasons := make(map[int]string, len(active))
 		var ar sharedAttemptResult
 	attemptLoop:
 		for {
-			calib.Attempts++
+			select {
-			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
+			case ar = <-doneCh:
-
+				break attemptLoop
-			beforeThrottle, _ := queryThrottleCounters(idx)
+			case <-ticker.C:
-			attemptCtx, cancel := context.WithCancel(ctx)
+				// Poll throttle counters for each active GPU independently.
-			doneCh := make(chan calibrationAttemptResult, 1)
+				for _, s := range active {
-			logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
+					if throttleReasons[s.idx] != "" {
-			cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
+						continue // already detected for this GPU
-			go func() {
+					}
-				out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
+					after, err := queryThrottleCounters(s.idx)
 				doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
 			}()
 			ticker := time.NewTicker(time.Second)
 			var (
 				attempt        calibrationAttemptResult
 				throttleReason string
 			)
 		attemptLoop:
 			for {
 				select {
 				case attempt = <-doneCh:
 					break attemptLoop
 				case <-ticker.C:
 					afterThrottle, err := queryThrottleCounters(idx)
 					if err != nil {
 						continue
 					}
-					if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" {
+					// Record throttle but do NOT cancel — let dcgmi finish so
-						throttleReason = reason
+					// nv-hostengine releases the slot cleanly before the next attempt.
-						cancel()
+					if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
 						throttleReasons[s.idx] = reason
 						logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
 					}
 				case <-ctx.Done():
 					cancel()
 					attempt = <-doneCh
 					break attemptLoop
 				}
 			case <-ctx.Done():
 				cancelAttempt()
 				ar = <-doneCh
 				break attemptLoop
 			}
 			ticker.Stop()
 			cancel()
 			_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
 			perGPU := filterRowsByGPU(attempt.rows, idx)
 			summary := summarizeBenchmarkTelemetry(perGPU)
 			if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
 				calib.Summary = summary
 				calib.Completed = true
 				calib.AppliedPowerLimitW = float64(appliedLimitW)
 				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
 				break
 			}
 			// If DCGM reports the resource is in use, nv-hostengine has not yet
 			// released the diagnostic slot from the previous attempt. Do not
 			// derate: wait with exponential back-off and retry at the same
 			// power limit. Once the back-off delay would exceed
 			// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
 			// held by something else.
 			if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
 				if busyDelaySec > dcgmResourceBusyMaxDelaySec {
 					calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
 					logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
 					break
 				}
 				busyRetries++
 				logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
 				select {
 				case <-ctx.Done():
 					break calibLoop
 				case <-time.After(time.Duration(busyDelaySec) * time.Second):
 				}
 				next := busyDelaySec * 2
 				if next > dcgmResourceBusyMaxDelaySec {
 					next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
 				}
 				busyDelaySec = next
 				continue calibLoop
 			}
 			busyRetries = 0    // reset on any non-busy outcome
 			busyDelaySec = 1   // reset back-off
 			switch {
 			case throttleReason != "":
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
 				logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
 			case attempt.err != nil:
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
 				logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
 			default:
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
 				logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
 			}
 			if !canDerate || appliedLimitW <= 0 {
 				break
 			}
 			nextLimitW := appliedLimitW - derateStepW
 			if nextLimitW < minLimitW {
 				calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW))
 				break
 			}
 			if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
 				calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error())
 				logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err))
 				break
 			}
 			appliedLimitW = nextLimitW
 			calib.AppliedPowerLimitW = float64(appliedLimitW)
 			calib.Derated = true
 			info.PowerLimitW = float64(appliedLimitW)
 			infoByIndex[idx] = info
 			calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW))
 		}
 		ticker.Stop()
 		cancelAttempt()
 		_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
-		if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
+		// Resource busy: retry with exponential back-off (shared — one DCGM session).
-			results[idx] = calib
+		if ar.err != nil && isDCGMResourceBusy(ar.err) {
 			if busyDelaySec > dcgmResourceBusyMaxDelaySec {
 				for _, s := range active {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
 					s.converged = true
 				}
 				logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
 				break calibDone
 			}
 			busyRetries++
 			// Undo attempt counter: busy retries don't count as real attempts.
 			for _, s := range active {
 				s.calib.Attempts--
 			}
 			logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
 			select {
 			case <-ctx.Done():
 				break calibDone
 			case <-time.After(time.Duration(busyDelaySec) * time.Second):
 			}
 			next := busyDelaySec * 2
 			if next > dcgmResourceBusyMaxDelaySec {
 				next = dcgmResourceBusyMaxDelaySec + 1
 			}
 			busyDelaySec = next
 			sharedAttempt-- // retry same logical attempt number
 			continue
 		}
 		busyRetries = 0
 		busyDelaySec = 1
 		// Per-GPU analysis and binary search update.
 		for _, s := range active {
 			perGPU := filterRowsByGPU(ar.rows, s.idx)
 			summary := summarizeBenchmarkTelemetry(perGPU)
 			throttle := throttleReasons[s.idx]
 			// Cooling warning: thermal throttle with fans not at maximum.
 			if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
 				clocks := make([]float64, 0, len(perGPU))
 				var fanDutyValues []float64
 				fanDutyAvail := false
 				for _, r := range perGPU {
 					if r.ClockMHz > 0 {
 						clocks = append(clocks, r.ClockMHz)
 					}
 					if r.FanDutyCycleAvailable {
 						fanDutyAvail = true
 						fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
 					}
 				}
 				dropPct := benchmarkClockDrift(clocks)
 				p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
 				if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
 					s.calib.CoolingWarning = fmt.Sprintf(
 						"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
 						throttle, dropPct, p95FanDuty,
 					)
 					logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
 				}
 			}
 			if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
 				// Stable at current limit — update lo and binary-search upward.
 				s.calib.Summary = summary
 				s.calib.Completed = true
 				s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
 				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
 				s.lo = s.appliedLimitW
 				if canDerate && s.hi-s.lo > calibSearchTolerance {
 					next := roundTo5W((s.lo + s.hi) / 2)
 					if next > s.lo && next < s.hi {
 						if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
 							s.appliedLimitW = next
 							s.calib.AppliedPowerLimitW = float64(next)
 							s.calib.Completed = false // keep searching
 							s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
 							logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
 							continue // next GPU in active list
 						}
 					}
 				}
 				s.converged = true
 				continue
 			}
 			// Failed or throttled — log and binary-search downward.
 			switch {
 			case throttle != "":
 				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
 				logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
 			case ar.err != nil:
 				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
 				logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
 			default:
 				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
 				logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
 			}
 			if !canDerate || s.appliedLimitW <= 0 {
 				s.converged = true
 				continue
 			}
 			s.hi = s.appliedLimitW
 			if s.hi-s.lo <= calibSearchTolerance {
 				if s.lo > s.minLimitW {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
 					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
 						s.appliedLimitW = s.lo
 						s.calib.AppliedPowerLimitW = float64(s.lo)
 						s.calib.Derated = s.lo < s.originalLimitW
 					}
 				} else {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
 				}
 				s.converged = true
 				continue
 			}
 			next := roundTo5W((s.lo + s.hi) / 2)
 			if next <= s.lo {
 				next = s.lo + calibSearchTolerance
 			}
 			if next >= s.hi {
 				next = (s.lo + s.hi) / 2
 			}
 			if next < s.minLimitW {
 				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
 				s.converged = true
 				continue
 			}
 			if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
 				s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
 				logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
 				s.converged = true
 				continue
 			}
 			s.appliedLimitW = next
 			s.calib.AppliedPowerLimitW = float64(next)
 			s.calib.Derated = next < s.originalLimitW
 			s.info.PowerLimitW = float64(next)
 			infoByIndex[s.idx] = s.info
 			s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
 			logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
 		}
 	}
 	for _, s := range states {
 		if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
 			results[s.idx] = s.calib
 		}
 	}
 	return results, restore
@@ -2649,6 +2815,11 @@ func isDCGMResourceBusy(err error) bool {
 	return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
 }
 // roundTo5W rounds w to the nearest multiple of 5 W.
 //
 // Go integer division truncates toward zero, so the half-step bias has to be
 // applied away from zero: without that, negative inputs would be pulled toward
 // zero (e.g. -3 would become 0 instead of -5). Results for non-negative inputs
 // are identical to the plain ((w + 2) / 5) * 5 form.
 func roundTo5W(w int) int {
 	const stepW = 5
 	if w < 0 {
 		return ((w - stepW/2) / stepW) * stepW
 	}
 	return ((w + stepW/2) / stepW) * stepW
 }
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability:
@@ -2660,15 +2831,6 @@ func powerBenchDurationSec(profile string) int {
 	}
 }
 // occupiedSlots returns a new slice holding every entry of indices that is not
 // equal to current, in the original order. The result is always non-nil, even
 // when indices is empty or every entry matches current.
 func occupiedSlots(indices []int, current int) []int {
 	others := make([]int, 0, len(indices))
 	for i := range indices {
 		if indices[i] == current {
 			continue // skip the slot being reported on
 		}
 		others = append(others, indices[i])
 	}
 	return others
 }
 func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
 	out := make(map[int]benchmarkGPUInfo, len(src))
@@ -2716,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	b.WriteString("\n")
 	for _, gpu := range result.GPUs {
 		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
-		if gpu.OccupiedSlotsNote != "" {
+
 			fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
 		}
 		for _, note := range gpu.Notes {
 			fmt.Fprintf(&b, "- %s\n", note)
 		}
@@ -2784,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	}
 	durationSec := powerBenchDurationSec(opts.Profile)
 	_ = durationSec
-	calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
+	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
 	var allRestoreActions []benchmarkRestoreAction
 	for _, idx := range selected {
 		singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
 		_ = os.MkdirAll(singleDir, 0755)
 		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
 		c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
 		allRestoreActions = append(allRestoreActions, restore...)
 		if r, ok := c[idx]; ok {
 			calibByIndex[idx] = r
 		}
 	}
 	defer func() {
-		for i := len(restoreActions) - 1; i >= 0; i-- {
+		for i := len(allRestoreActions) - 1; i >= 0; i-- {
-			restoreActions[i].fn()
+			allRestoreActions[i].fn()
 		}
 	}()
 	gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
@@ -2804,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 				result.OverallStatus = "PARTIAL"
 			}
 		}
 		occupied := occupiedSlots(selected, idx)
 		note := ""
 		if len(occupied) > 0 {
 			note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
 		}
 		gpus = append(gpus, NvidiaPowerBenchGPU{
 			Index:               idx,
 			Name:                info.Name,
@@ -2820,9 +2989,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			CalibrationAttempts: calib.Attempts,
 			Derated:             calib.Derated,
 			Status:              status,
 			OccupiedSlots:       occupied,
 			OccupiedSlotsNote:   note,
 			Notes:               append([]string(nil), calib.Notes...),
 			CoolingWarning:      calib.CoolingWarning,
 		})
 	}
 	sort.Slice(gpus, func(i, j int) bool {
@@ -2849,19 +3017,37 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		if gpu.Derated {
 			result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
 		}
 		if gpu.CoolingWarning != "" {
 			result.Findings = append(result.Findings, fmt.Sprintf(
 				"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
 				gpu.Index, gpu.CoolingWarning,
 			))
 		}
 	}
 	singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
 	for _, gpu := range gpus {
 		singleByIndex[gpu.Index] = gpu
 	}
 	// Phase 2: ramp — add one GPU per step and calibrate the growing subset
 	// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
 	// targeted_power with derating if degradation is detected.
 	for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
 		subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
 		stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
 		_ = os.MkdirAll(stepDir, 0755)
-		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
+		var stepCalib map[int]benchmarkPowerCalibrationResult
-		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
+		if step == 1 {
-		for i := len(stepRestore) - 1; i >= 0; i-- {
+			// Single-GPU step — already measured in phase 1; reuse directly.
-			stepRestore[i].fn()
+			stepCalib = calibByIndex
 			logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
 		} else {
 			stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 			var stepRestore []benchmarkRestoreAction
 			stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
 			for i := len(stepRestore) - 1; i >= 0; i-- {
 				stepRestore[i].fn()
 			}
 		}
 		ramp := NvidiaPowerBenchStep{
 			StepIndex:  step,
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -131,6 +131,9 @@ type BenchmarkGPUResult struct {
 	Scores             BenchmarkScorecard         `json:"scores"`
 	DegradationReasons []string                   `json:"degradation_reasons,omitempty"`
 	Notes              []string                   `json:"notes,omitempty"`
 	// CoolingWarning is non-empty when a thermal throttle event occurred with
 	// a clock drop ≥20% while server fans were not at 100% duty cycle.
 	CoolingWarning string `json:"cooling_warning,omitempty"`
 }
 type BenchmarkTelemetrySummary struct {
@@ -277,9 +280,9 @@ type NvidiaPowerBenchGPU struct {
 	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
 	Derated             bool     `json:"derated,omitempty"`
 	Status              string   `json:"status"`
 	OccupiedSlots       []int    `json:"occupied_slots,omitempty"`
 	OccupiedSlotsNote   string   `json:"occupied_slots_note,omitempty"`
 	Notes               []string `json:"notes,omitempty"`
 	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
 	CoolingWarning string `json:"cooling_warning,omitempty"`
 }
 type NvidiaPowerBenchStep struct {
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -552,9 +552,13 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
 	if passes <= 0 {
 		passes = 1
 	}
 	// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
 	// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
 	// controller can cause memtester to spin forever on a single subtest.
 	timeoutSec := sizeMB*passes*150/100 + 120
 	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
-		{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
+		{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
 		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
 	}, logFunc)
 }
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -1529,6 +1529,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
 	writeJSON(w, map[string]string{"status": "rolled back"})
 }
 // handleAPIBenchmarkResults writes the benchmark results card rendered by
 // renderBenchmarkResultsCard for the configured export directory as a
 // text/html response. The request itself is not inspected.
 func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
 	card := renderBenchmarkResultsCard(h.opts.ExportDir)
 	w.Header().Set("Content-Type", "text/html; charset=utf-8")
 	fmt.Fprint(w, card)
 }
 func (h *handler) rollbackPendingNetworkChange() error {
 	h.pendingNetMu.Lock()
 	pnc := h.pendingNet
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -2002,7 +2002,7 @@ func renderBenchmark(opts HandlerOptions) string {
  </div>
 </div>
-` + renderBenchmarkResultsCard(opts.ExportDir) + `
+`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
 <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2188,7 +2188,9 @@ function runNvidiaBenchmark(kind) {
        if (e.data) failures += 1;
        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
        term.scrollTop = term.scrollHeight;
        const isLast = (idx + 1 >= taskIds.length);
        streamNext(idx + 1, failures);
        if (isLast) { benchmarkRefreshResults(); }
      });
      benchmarkES.onerror = function() {
        if (benchmarkES) {
@@ -2208,18 +2210,30 @@ function runNvidiaBenchmark(kind) {
 }
 benchmarkLoadGPUs();
 // Re-fetch the rendered results markup and swap it into the
 // #benchmark-results-section container without reloading the page.
 function benchmarkRefreshResults() {
  fetch('/api/benchmark/results')
    .then(function(r) {
      // A non-2xx response carries an error body, not results markup;
      // reject so it is never injected into the page.
      if (!r.ok) throw new Error('HTTP ' + r.status);
      return r.text();
    })
    .then(function(html) {
      const el = document.getElementById('benchmark-results-section');
      if (el) el.innerHTML = html;
    })
    // Best-effort refresh: on any failure the previously rendered results stay.
    .catch(function() {});
 }
 </script>`
 }
 func renderBenchmarkResultsCard(exportDir string) string {
 	maxIdx, runs := loadBenchmarkHistory(exportDir)
-	return renderBenchmarkResultsCardFromRuns(
+	perf := renderBenchmarkResultsCardFromRuns(
-		"Perf Results",
+		"Performance Results",
 		"Composite score by saved benchmark run and GPU.",
-		"No saved benchmark runs yet.",
+		"No saved performance benchmark runs yet.",
 		maxIdx,
 		runs,
 	)
 	power := renderPowerBenchmarkResultsCard(exportDir)
 	return perf + "\n" + power
 }
 func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
@@ -2299,6 +2313,126 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
 	return maxGPUIndex, runs
 }
 // renderPowerBenchmarkResultsCard renders the "Power / Thermal Fit Results"
 // card from saved power benchmark runs.
 //
 // Results are read from <baseDir>/power-*/result.json, where baseDir is
 // <exportDir>/bee-bench/power, or app.DefaultBeeBenchPowerDir when exportDir
 // is blank. Files that cannot be read or decoded are skipped. The card shows
 // a per-GPU table for the most recent run and, when more than one run was
 // loaded, a collapsed table listing every run. When no run can be loaded a
 // placeholder card is returned instead.
 func renderPowerBenchmarkResultsCard(exportDir string) string {
 	const (
 		cardOpen  = `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`
 		emptyCard = cardOpen + `<p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
 	)

 	baseDir := app.DefaultBeeBenchPowerDir
 	if strings.TrimSpace(exportDir) != "" {
 		baseDir = filepath.Join(exportDir, "bee-bench", "power")
 	}
 	paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
 	if err != nil || len(paths) == 0 {
 		return emptyCard
 	}
 	sort.Strings(paths)

 	type powerRun struct {
 		generatedAt time.Time
 		displayTime string
 		result      platform.NvidiaPowerBenchResult
 	}
 	runs := make([]powerRun, 0, len(paths))
 	for _, path := range paths {
 		raw, err := os.ReadFile(path)
 		if err != nil {
 			continue // unreadable result file: skip it, keep the rest
 		}
 		var r platform.NvidiaPowerBenchResult
 		if err := json.Unmarshal(raw, &r); err != nil {
 			continue // corrupt or partially written result file: skip it
 		}
 		runs = append(runs, powerRun{
 			generatedAt: r.GeneratedAt,
 			displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
 			result:      r,
 		})
 	}
 	// Every matched file may have been unreadable or invalid; without this
 	// guard the runs[0] access below panics with an index out of range.
 	if len(runs) == 0 {
 		return emptyCard
 	}
 	// Newest first. The stable sort keeps the path order established above for
 	// runs that share a timestamp, so the output is deterministic.
 	sort.SliceStable(runs, func(i, j int) bool {
 		return runs[i].generatedAt.After(runs[j].generatedAt)
 	})

 	// Show only the most recent run's GPU slot table, plus a run history summary.
 	var b strings.Builder
 	b.WriteString(cardOpen)
 	latest := runs[0].result
 	b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
 	if latest.Hostname != "" {
 		b.WriteString(` — ` + html.EscapeString(latest.Hostname))
 	}
 	if latest.OverallStatus != "" {
 		statusColor := "var(--ok)"
 		if latest.OverallStatus != "OK" {
 			statusColor = "var(--warn)"
 		}
 		b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
 	}
 	b.WriteString(`</p>`)

 	if len(latest.GPUs) > 0 {
 		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
 		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
 		b.WriteString(`</tr></thead><tbody>`)
 		for _, gpu := range latest.GPUs {
 			// A GPU counts as derated when flagged as such, or when its applied
 			// power limit sits more than 1 W below a known default limit.
 			derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
 			rowStyle := ""
 			achievedStyle := ""
 			if derated {
 				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
 				achievedStyle = ` style="color:#e6a000;font-weight:600"`
 			}
 			statusLabel := gpu.Status
 			if statusLabel == "" {
 				statusLabel = "OK"
 			}
 			statusColor := "var(--ok)"
 			if statusLabel != "OK" {
 				statusColor = "var(--warn)"
 			}
 			// Non-positive readings are rendered as "-".
 			nominalStr := "-"
 			if gpu.DefaultPowerLimitW > 0 {
 				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
 			}
 			achievedStr := "-"
 			if gpu.AppliedPowerLimitW > 0 {
 				achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
 			}
 			// NOTE(review): the column header says "P95 Observed W" but the value
 			// comes from MaxObservedPowerW; confirm that field holds the P95.
 			p95Str := "-"
 			if gpu.MaxObservedPowerW > 0 {
 				p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
 			}
 			b.WriteString(`<tr` + rowStyle + `>`)
 			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
 			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
 			b.WriteString(`<td>` + nominalStr + `</td>`)
 			b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
 			b.WriteString(`<td>` + p95Str + `</td>`)
 			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
 			b.WriteString(`</tr>`)
 		}
 		b.WriteString(`</tbody></table></div>`)
 	}

 	if len(runs) > 1 {
 		b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
 		b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
 		for i, run := range runs {
 			statusColor := "var(--ok)"
 			if run.result.OverallStatus != "OK" {
 				statusColor = "var(--warn)"
 			}
 			b.WriteString(`<tr>`)
 			b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
 			b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
 			b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
 			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
 			b.WriteString(`</tr>`)
 		}
 		b.WriteString(`</tbody></table></div></details>`)
 	}
 	b.WriteString(`</div></div>`)
 	return b.String()
 }
 // ── Burn ──────────────────────────────────────────────────────────────────────
 func renderBurn() string {
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -263,6 +263,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
 	mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
 	mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
 	mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
 	// Tasks
 	mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -21,3 +21,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
 COMGR_VERSION=2.8.0.60304-76~22.04
 GO_VERSION=1.24.0
 AUDIT_VERSION=1.0.0
 MEMTEST_VERSION=6.10-4
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -23,9 +23,9 @@ lb config noauto \
    --bootloaders "grub-efi,syslinux" \
    --debian-installer none \
    --archive-areas "main contrib non-free non-free-firmware" \
-    --mirror-bootstrap "https://deb.debian.org/debian" \
+    --mirror-bootstrap "http://mirror.mephi.ru/debian/" \
-    --mirror-chroot "https://deb.debian.org/debian" \
+    --mirror-chroot "http://mirror.mephi.ru/debian/" \
-    --mirror-binary "https://deb.debian.org/debian" \
+    --mirror-binary "http://mirror.mephi.ru/debian/" \
    --security true \
    --linux-flavours "amd64" \
    --linux-packages "${LB_LINUX_PACKAGES}" \
--- a/iso/builder/build-in-container.sh
+++ b/iso/builder/build-in-container.sh
@@ -161,6 +161,7 @@ run_variant() {
            -e GOMODCACHE=/cache/go-mod \
            -e TMPDIR=/cache/tmp \
            -e BEE_CACHE_DIR=/cache/bee \
            -e BEE_REQUIRE_MEMTEST=1 \
            -w /work \
            "${IMAGE_REF}" \
            sh /work/iso/builder/build.sh --variant "${_v}" \
@@ -175,6 +176,7 @@ run_variant() {
            -e GOMODCACHE=/cache/go-mod \
            -e TMPDIR=/cache/tmp \
            -e BEE_CACHE_DIR=/cache/bee \
            -e BEE_REQUIRE_MEMTEST=1 \
            -w /work \
            "${IMAGE_REF}" \
            sh /work/iso/builder/build.sh --variant "${_v}"
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
 export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
 . "${BUILDER_DIR}/VERSIONS"
 export MEMTEST_VERSION
 export PATH="$PATH:/usr/local/go/bin"
 : "${BEE_REQUIRE_MEMTEST:=0}"
@@ -775,6 +776,7 @@ run_optional_step_sh() {
        return 0
    fi
    mkdir -p "${LOG_DIR}" 2>/dev/null || true
    step_log="${LOG_DIR}/${step_slug}.log"
    echo ""
    echo "=== optional step: ${step_name} ==="
@@ -798,13 +800,14 @@ start_build_log
 # install them on the fly so NVIDIA modules and ISO kernel always match.
 if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
    echo "=== refreshing apt index to detect current kernel ABI ==="
-    apt-get update -qq
+    apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
    DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
        | awk '/Depends:.*linux-image-[0-9]/{print $2}' \
        | grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
        | head -1)
    if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
        echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
        echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
        exit 1
    fi
    echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
--- a/iso/builder/config/hooks/normal/9100-memtest.hook.binary
+++ b/iso/builder/config/hooks/normal/9100-memtest.hook.binary
@@ -5,6 +5,8 @@ set -e
 : "${BEE_REQUIRE_MEMTEST:=0}"
 # memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
 # while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
 MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
 BINARY_BOOT_DIR="binary/boot"
 GRUB_CFG="binary/boot/grub/grub.cfg"
@@ -26,13 +28,13 @@ fail_or_warn() {
 # copy_memtest_file SRC [DST_NAME]
 # Copy SRC into ${BINARY_BOOT_DIR}, creating that directory if needed.
 # The destination file name is DST_NAME when given, otherwise the basename of
 # SRC. Returns 1 without copying when SRC is not a regular file.
 copy_memtest_file() {
    src="$1"
-    base="$(basename "$src")"
+    dst_name="${2:-$(basename "$src")}"
-    dst="${BINARY_BOOT_DIR}/${base}"
+    dst="${BINARY_BOOT_DIR}/${dst_name}"
    [ -f "$src" ] || return 1
    mkdir -p "${BINARY_BOOT_DIR}"
    cp "$src" "$dst"
-    log "copied ${base} from ${src}"
+    log "copied ${dst_name} from ${src}"
 }
 extract_memtest_from_deb() {
@@ -41,14 +43,44 @@ extract_memtest_from_deb() {
    log "extracting memtest payload from ${deb}"
    dpkg-deb -x "$deb" "$tmpdir"
-    for f in ${MEMTEST_FILES}; do
+
-        if [ -f "${tmpdir}/boot/${f}" ]; then
+    # EFI binary: both 5.x and 6.x use memtest86+x64.efi
-            copy_memtest_file "${tmpdir}/boot/${f}"
+    if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
-        fi
+        copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
-    done
+    fi
    # BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
    if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
        copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
    elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
        copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
    fi
    rm -rf "$tmpdir"
 }
 # Fetch the memtest86+ .deb with apt into a scratch directory and hand it to
 # extract_memtest_from_deb. The package is pinned to MEMTEST_VERSION when that
 # variable is non-empty. A failed download is retried once after refreshing
 # the apt index; if no .deb is present afterwards, the failure is only logged.
 download_and_extract_memtest() {
    tmpdl="$(mktemp -d)"

    pkg_spec="memtest86+"
    [ -z "${MEMTEST_VERSION:-}" ] || pkg_spec="memtest86+=${MEMTEST_VERSION}"

    log "downloading ${pkg_spec} from apt"
    ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || {
        log "apt download failed, retrying after apt-get update"
        # Both retry steps are best-effort; success is judged by the .deb lookup below.
        apt-get update -qq >/dev/null 2>&1 || true
        ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
    }

    deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
    if [ -z "$deb" ]; then
        log "apt download of memtest86+ failed"
    else
        extract_memtest_from_deb "$deb"
    fi

    rm -rf "$tmpdl"
 }
 ensure_memtest_binaries() {
    missing=0
    for f in ${MEMTEST_FILES}; do
@@ -56,10 +88,15 @@ ensure_memtest_binaries() {
    done
    [ "$missing" -eq 1 ] || return 0
    # 1. Try files already placed by lb binary_memtest or chroot
    for root in chroot/boot /boot; do
        for f in ${MEMTEST_FILES}; do
            [ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
        done
        # 6.x BIOS binary may lack x64 in name — copy with normalised name
        if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
            copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
        fi
    done
    missing=0
@@ -68,6 +105,7 @@ ensure_memtest_binaries() {
    done
    [ "$missing" -eq 1 ] || return 0
    # 2. Try apt package cache (may be empty if lb binary_memtest already purged)
    for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
        [ -d "$root" ] || continue
        deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
@@ -76,6 +114,15 @@ ensure_memtest_binaries() {
        break
    done
    missing=0
    for f in ${MEMTEST_FILES}; do
        [ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
    done
    [ "$missing" -eq 1 ] || return 0
    # 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
    download_and_extract_memtest
    missing=0
    for f in ${MEMTEST_FILES}; do
        if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
Author	SHA1	Message	Date
Mikhail Chusavitin	1c5cb45698	Fix memtest hook: bad ver_arg format in apt-get download ver_arg was set to "=memtest86+=VERSION" making the command "apt-get download memtest86+=memtest86+=VERSION" (invalid). Fixed to build pkg_spec directly as "memtest86+=VERSION". Also add apt-get update retry if initial download fails. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 10:15:01 +03:00
Mikhail Chusavitin	090b92ca73	Re-enable security repo: kernel 6.1.0-44 is in bookworm-security only Disabling --security broke the build because linux-image-6.1.0-44-amd64 is a security update not present in the base bookworm repo. Main packages already come from mirror.mephi.ru. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 10:02:52 +03:00
Mikhail Chusavitin	2dccbc010c	Use MEPHI mirror, disable security repo, fix memtest in ISO build - Switch all lb mirrors to mirror.mephi.ru/debian/ for faster/reliable downloads - Disable security repo (--security false) — not needed for LiveCD - Pin MEMTEST_VERSION=6.10-4 in VERSIONS, export to hook environment - Set BEE_REQUIRE_MEMTEST=1 in build-in-container.sh — missing memtest is now fatal - Fix 9100-memtest.hook.binary: add apt-get download fallback when lb binary_memtest has already purged the package cache; handle both 5.x (memtest86+x64.bin) and 6.x (memtest86+.bin) BIOS binary naming Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 09:57:29 +03:00
Michael Chus	e84c69d360	Fix optional step log dir missing after memtest recovery mkdir -p LOG_DIR before writing the optional step log so that a race with cleanup_build_log (EXIT trap archiving the log dir) does not cause a "Directory nonexistent" error during lb binary_checksums / lb binary_iso. Also downgrade apt-get update failure to a warning so a transient mirror outage does not block kernel ABI auto-detection when the apt cache is warm. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 07:28:36 +03:00
Michael Chus	c80a39e7ac	Add power results table, fix benchmark results refresh, bound memtester - Benchmark page now shows two result sections: Performance (scores) and Power / Thermal Fit (slot table). After any benchmark task completes the results section auto-refreshes via GET /api/benchmark/results without a full page reload. - Power results table shows each GPU slot with nominal TDP, achieved stable power limit, and P95 observed power. Rows with derated cards are highlighted amber so under-performing slots stand out at a glance. Older runs are collapsed in a <details> summary. - memtester is now wrapped with timeout(1) so a stuck memory controller cannot cause Validate Memory to hang indefinitely. Wall-clock limit is ~2.5 min per 100 MB per pass plus a 2-minute buffer. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 07:16:18 +03:00
Michael Chus	a5e0261ff2	Refactor power ramp to use true single-card baselines Phase 1 now calibrates each GPU individually (sequentially) so that PowerRealizationPct reflects real degradation from neighbour thermals and shared power rails. Previously the baseline came from an all-GPU-together run, making realization always ≈100% at the final ramp step. Ramp step 1 reuses single-card calibration results (no extra run); steps 2..N run targeted_power on the growing GPU subset with derating active. Remove OccupiedSlots/OccupiedSlotsNote fields and occupiedSlots() helper — they were compensation for the old all-GPU calibration approach. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 23:47:57 +03:00
Michael Chus	ee422ede3c	Revert "Add raster Easy Bee branding assets" This reverts commit `d560b2fead`.	2026-04-14 23:00:15 +03:00
Michael Chus	d560b2fead	Add raster Easy Bee branding assets	2026-04-14 22:39:25 +03:00
Michael Chus	3cf2e9c9dc	Run power calibration for all GPUs simultaneously Previously each GPU was calibrated sequentially (one card fully done before the next started), producing the staircase temperature pattern seen on the graph. Now all GPUs run together in a single dcgmi diag -r targeted_power session per attempt. This means: - All cards are under realistic thermal load at the same time. - A single DCGM session handles the run — no resource-busy contention from concurrent dcgmi processes. - Binary search state (lo/hi) is tracked independently per GPU; each card converges to its own highest stable power limit. - Throttle counter polling covers all active GPUs in the shared ticker. - Resource-busy exponential back-off is shared (one DCGM session). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 22:25:05 +03:00
Michael Chus	19dbabd71d	Simplify power calibration: pure binary search, no telemetry guessing Remove telemetry-guided initial candidate; use strict binary search midpoint at every step. Clean and predictable convergence in O(log N) attempts within the allowed power range [minLimitW, startingLimitW]. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 22:12:45 +03:00
Michael Chus	a6a07f2626	Replace linear power derate with binary search + telemetry-guided jump Power calibration previously stepped down 25 W at a time (linear), requiring up to 6 attempts to find a stable limit within 150 W range. New strategy: - Binary search between minLimitW (lo, assumed stable floor) and the starting/failed limit (hi, confirmed unstable), converging within a 10 W tolerance in ~4 attempts. - For thermal throttle: the first-quarter telemetry rows estimate the GPU's pre-throttle power draw. nextLimit = round5W(onset - 10 W) is used as the initial candidate instead of the binary midpoint, landing much closer to the true limit on the first step. - On success: lo is updated and a higher level is tried (binary search upward) until hi-lo ≤ tolerance, ensuring the highest stable limit is found rather than the first stable one. - Let targeted_power run to natural completion on throttle (no mid-run SIGKILL) so nv-hostengine releases its diagnostic slot cleanly before the next attempt. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 22:05:23 +03:00
Michael Chus	f87461ee4a	Detect thermal throttle with fans below 100% as cooling misconfiguration During power calibration: if a thermal throttle (sw_thermal/hw_thermal) causes ≥20% clock drop while server fans are below 98% P95 duty cycle, record a CoolingWarning on the GPU result and emit an actionable finding telling the operator to rerun with fans manually fixed at 100%. During steady-state benchmark: same signal enriches the existing thermal_limited finding with fan duty cycle and clock drift values. Covers both the main benchmark (buildBenchmarkFindings) and the power bench (NvidiaPowerBenchResult.Findings). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 21:44:57 +03:00