Compare commits
16 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 04eb4b5a6d | |
| | 4110dbf8a6 | |
| | 7237e4d3e4 | |
| | ab3ad77cd6 | |
| | cd9e2cbe13 | |
| | 0317dc58fd | |
| | 1c5cb45698 | |
| | 090b92ca73 | |
| | 2dccbc010c | |
| | e84c69d360 | |
| | c80a39e7ac | |
| | a5e0261ff2 | |
| | ee422ede3c | |
| | d560b2fead | |
| | 3cf2e9c9dc | |
| | 19dbabd71d | |
@@ -5,22 +5,18 @@ go 1.25.0
replace reanimator/chart => ../internal/chart

require (
github.com/go-analyze/charts v0.5.26
modernc.org/sqlite v1.48.0
reanimator/chart v0.0.0-00010101000000-000000000000
)

require (
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/go-analyze/bulk v0.1.3 // indirect
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
golang.org/x/image v0.24.0 // indirect
golang.org/x/sys v0.42.0 // indirect
modernc.org/libc v1.70.0 // indirect
modernc.org/libc v1.72.0 // indirect
modernc.org/mathutil v1.7.1 // indirect
modernc.org/memory v1.11.0 // indirect
modernc.org/sqlite v1.48.0 // indirect
)
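For context on the hunk above: a local `replace` directive in Go modules must still be paired with a `require` entry, and because the local module has no published tags, Go records the all-zero placeholder pseudo-version. A minimal sketch of the pattern (the parent module name is hypothetical; the chart module names are as in the diff):

```go
// go.mod — local-path override for an unpublished module.
module example // hypothetical parent module name

go 1.25.0

// Point the import path at a directory on disk instead of a module proxy.
replace reanimator/chart => ../internal/chart

require (
	// Placeholder pseudo-version: the module has no real released version;
	// the replace directive above supplies the actual source.
	reanimator/chart v0.0.0-00010101000000-000000000000
)
```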
audit/go.sum
@@ -1,37 +1,51 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
@@ -22,6 +22,8 @@ var supportBundleServices = []string{
"bee-selfheal.service",
"bee-selfheal.timer",
"bee-sshsetup.service",
"nvidia-dcgm.service",
"nvidia-fabricmanager.service",
}

var supportBundleCommands = []struct {
@@ -48,6 +50,43 @@ else
fi
`}},
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
if command -v nvidia-smi >/dev/null 2>&1; then
nvidia-smi topo -m 2>&1 || true
else
echo "nvidia-smi not found"
fi
`}},
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
if ! command -v systemctl >/dev/null 2>&1; then
echo "systemctl not found"
exit 0
fi
echo "=== unit files ==="
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
echo
echo "=== active units ==="
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
echo
echo "=== failed units ==="
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
`}},
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
for candidate in \
/usr/bin/nvidia-fabricmanager \
/usr/bin/nv-fabricmanager \
/usr/bin/nvidia-fabricmanagerd \
/usr/bin/nvlsm; do
if [ -e "$candidate" ]; then
echo "=== $candidate ==="
ls -l "$candidate" 2>&1 || true
echo
fi
done
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
echo "no fabric manager binaries found"
fi
`}},
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
if ! command -v lspci >/dev/null 2>&1; then
echo "lspci not found"
@@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
}{
{name: "system/kern.log", src: "/var/log/kern.log"},
{name: "system/syslog.txt", src: "/var/log/syslog"},
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
}

const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
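A note on the bundle glob above: in Go's `filepath.Match` semantics, `?` matches exactly one non-separator character and `*` matches any run of them, so the pattern selects names with a date prefix plus a "(BEE-SP…)" tag. A minimal, runnable sketch (the file names are hypothetical, for illustration only):

```go
package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
	for _, name := range []string{
		"2025-01-15 (BEE-SP01) support.tar.gz", // date prefix + tag: matches
		"bundle.tar.gz",                        // no date prefix: no match
	} {
		ok, err := filepath.Match(supportBundleGlob, name)
		fmt.Println(name, ok, err)
	}
}
```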
@@ -2476,9 +2476,6 @@ func runBenchmarkPowerCalibration(
// calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10

// calibPreThrottleMarginW is subtracted from the telemetry-estimated
// pre-throttle power draw to produce a smarter initial search candidate.
const calibPreThrottleMarginW = 10

// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the
@@ -2501,8 +2498,25 @@ func runBenchmarkPowerCalibration(
err error
}

// gpuCalibState holds per-GPU binary search state during parallel calibration.
type gpuCalibState struct {
idx int
info benchmarkGPUInfo
originalLimitW int
appliedLimitW int
minLimitW int
lo int // highest verified-stable limit (assumed: minLimitW)
hi int // lowest verified-unstable limit (exclusive sentinel above start)
calib benchmarkPowerCalibrationResult
converged bool
}

results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction

// Initialise per-GPU state.
states := make([]*gpuCalibState, 0, len(gpuIndices))
for _, idx := range gpuIndices {
info := infoByIndex[idx]
originalLimitW := int(math.Round(info.PowerLimitW))

@@ -2531,17 +2545,17 @@ func runBenchmarkPowerCalibration(
if minLimitW < calibSearchTolerance {
minLimitW = calibSearchTolerance
}

calib := benchmarkPowerCalibrationResult{
AppliedPowerLimitW: float64(appliedLimitW),
s := &gpuCalibState{
idx: idx,
info: info,
originalLimitW: originalLimitW,
appliedLimitW: appliedLimitW,
minLimitW: minLimitW,
lo: minLimitW,
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
}
// Binary search bounds for finding the highest stable power limit.
// lo = highest verified-stable level (assumed: minLimitW).
// hi = lowest verified-unstable level (assumed: above the starting limit).
lo := minLimitW
hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
states = append(states, s)
if canDerate && originalLimitW > 0 {
idxCopy := idx
orig := originalLimitW

@@ -2552,212 +2566,243 @@ func runBenchmarkPowerCalibration(
},
})
}
}

calibLoop:
// Shared DCGM resource-busy back-off state (single diagnostic session).
busyRetries := 0
busyDelaySec := 1
sharedAttempt := 0

type sharedAttemptResult struct {
out []byte
rows []GPUMetricRow
err error
}

calibDone:
for {
// Collect non-converged GPUs.
var active []*gpuCalibState
for _, s := range states {
if !s.converged {
active = append(active, s)
}
}
if len(active) == 0 || ctx.Err() != nil {
break
}

sharedAttempt++
for _, s := range active {
s.calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
}

// Snapshot throttle counters for all active GPUs before the run.
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
for _, s := range active {
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
}

// Run targeted_power for ALL gpuIndices simultaneously so every card
// is under load during calibration — this reflects real server thermals.
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
}()

ticker := time.NewTicker(time.Second)
throttleReasons := make(map[int]string, len(active))
var ar sharedAttemptResult

attemptLoop:
for {
calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))

beforeThrottle, _ := queryThrottleCounters(idx)
attemptCtx, cancel := context.WithCancel(ctx)
doneCh := make(chan calibrationAttemptResult, 1)
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
}()

ticker := time.NewTicker(time.Second)
var (
attempt calibrationAttemptResult
throttleReason string
)
attemptLoop:
for {
select {
case attempt = <-doneCh:
break attemptLoop
case <-ticker.C:
afterThrottle, err := queryThrottleCounters(idx)
select {
case ar = <-doneCh:
break attemptLoop
case <-ticker.C:
// Poll throttle counters for each active GPU independently.
for _, s := range active {
if throttleReasons[s.idx] != "" {
continue // already detected for this GPU
}
after, err := queryThrottleCounters(s.idx)
if err != nil {
continue
}
// Record the throttle reason but do NOT cancel the dcgmi
// process. Killing it mid-run leaves nv-hostengine holding
// the diagnostic slot, which causes DCGM_ST_IN_USE on every
// subsequent attempt. Let targeted_power run to its natural
// end so the daemon releases the slot cleanly before we
// reduce power and retry.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
throttleReason = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
// Record throttle but do NOT cancel — let dcgmi finish so
// nv-hostengine releases the slot cleanly before the next attempt.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
throttleReasons[s.idx] = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
}
case <-ctx.Done():
cancel()
attempt = <-doneCh
break attemptLoop
}
case <-ctx.Done():
cancelAttempt()
ar = <-doneCh
break attemptLoop
}
ticker.Stop()
cancel()
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
}
ticker.Stop()
cancelAttempt()
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)

perGPU := filterRowsByGPU(attempt.rows, idx)
// Resource busy: retry with exponential back-off (shared — one DCGM session).
if ar.err != nil && isDCGMResourceBusy(ar.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
for _, s := range active {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
s.converged = true
}
logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
break calibDone
}
busyRetries++
// Undo attempt counter: busy retries don't count as real attempts.
for _, s := range active {
s.calib.Attempts--
}
logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
select {
case <-ctx.Done():
break calibDone
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1
}
busyDelaySec = next
sharedAttempt-- // retry same logical attempt number
continue
}
busyRetries = 0
busyDelaySec = 1

// Per-GPU analysis and binary search update.
for _, s := range active {
perGPU := filterRowsByGPU(ar.rows, s.idx)
summary := summarizeBenchmarkTelemetry(perGPU)
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
// Stable at appliedLimitW: record it and binary-search upward.
calib.Summary = summary
calib.Completed = true
calib.AppliedPowerLimitW = float64(appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
lo = appliedLimitW
// If there is still headroom to search, try a higher level.
if canDerate && hi-lo > calibSearchTolerance {
nextLimitW := roundTo5W((lo + hi) / 2)
if nextLimitW > lo && nextLimitW < hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil {
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW))
continue calibLoop
throttle := throttleReasons[s.idx]

// Cooling warning: thermal throttle with fans not at maximum.
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64
fanDutyAvail := false
for _, r := range perGPU {
if r.ClockMHz > 0 {
clocks = append(clocks, r.ClockMHz)
}
if r.FanDutyCycleAvailable {
fanDutyAvail = true
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
}
}
dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
s.calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttle, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
}
}

if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
// Stable at current limit — update lo and binary-search upward.
s.calib.Summary = summary
s.calib.Completed = true
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
s.lo = s.appliedLimitW
if canDerate && s.hi-s.lo > calibSearchTolerance {
next := roundTo5W((s.lo + s.hi) / 2)
if next > s.lo && next < s.hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Completed = false // keep searching
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
continue // next GPU in active list
}
}
}
break
s.converged = true
continue
}

// If DCGM reports the resource is in use, nv-hostengine has not yet
// released the diagnostic slot from the previous attempt. Do not
// derate: wait with exponential back-off and retry at the same
// power limit. Once the back-off delay would exceed
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
// held by something else.
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
break
}
busyRetries++
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
select {
case <-ctx.Done():
break calibLoop
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
}
busyDelaySec = next
continue calibLoop
}
busyRetries = 0 // reset on any non-busy outcome
busyDelaySec = 1 // reset back-off

// Failed or throttled — log and binary-search downward.
switch {
case throttleReason != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
// Check whether the thermal throttle coincided with fans below
// maximum: that combination suggests cooling misconfiguration
// rather than a fundamental power-delivery limit.
if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64
fanDutyAvail := false
for _, r := range perGPU {
if r.ClockMHz > 0 {
clocks = append(clocks, r.ClockMHz)
}
if r.FanDutyCycleAvailable {
fanDutyAvail = true
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
}
}
dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttleReason, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
}
}
case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
case throttle != "":
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
case ar.err != nil:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
default:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
}

if !canDerate || appliedLimitW <= 0 {
break
if !canDerate || s.appliedLimitW <= 0 {
s.converged = true
continue
}
// Binary-search for the highest stable power limit.
// This attempt failed or throttled, so update the upper bound.
hi = appliedLimitW
s.hi = s.appliedLimitW

if hi-lo <= calibSearchTolerance {
// Search range exhausted: lo is the highest verified-stable level.
if lo > minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil {
appliedLimitW = lo
calib.AppliedPowerLimitW = float64(lo)
calib.Derated = lo < originalLimitW
if s.hi-s.lo <= calibSearchTolerance {
if s.lo > s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
s.appliedLimitW = s.lo
s.calib.AppliedPowerLimitW = float64(s.lo)
s.calib.Derated = s.lo < s.originalLimitW
}
} else {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
}
break
s.converged = true
continue
}

// Compute the next candidate.
// For thermal throttle: use the pre-throttle power draw from telemetry
// as a smarter initial estimate instead of the binary midpoint — it
// lands much closer to the true limit on the first attempt.
nextLimitW := (lo + hi) / 2
if strings.Contains(throttleReason, "thermal") {
if onsetW := calibPreThrottlePowerW(perGPU); onsetW > 0 {
candidate := roundTo5W(int(math.Round(onsetW)) - calibPreThrottleMarginW)
if candidate > lo && candidate < hi {
nextLimitW = candidate
}
}
next := roundTo5W((s.lo + s.hi) / 2)
if next <= s.lo {
next = s.lo + calibSearchTolerance
}
nextLimitW = roundTo5W(nextLimitW)
// Ensure the candidate is strictly inside the search range.
if nextLimitW <= lo {
nextLimitW = lo + calibSearchTolerance
if next >= s.hi {
next = (s.lo + s.hi) / 2
}
if nextLimitW >= hi {
nextLimitW = (lo + hi) / 2
if next < s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.converged = true
continue
}
if nextLimitW < minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
break
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
s.converged = true
continue
}
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err))
break
}
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Derated = appliedLimitW < originalLimitW
info.PowerLimitW = float64(appliedLimitW)
infoByIndex[idx] = info
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Derated = next < s.originalLimitW
s.info.PowerLimitW = float64(next)
infoByIndex[s.idx] = s.info
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
}
}

if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
results[idx] = calib
for _, s := range states {
if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
results[s.idx] = s.calib
}
}
return results, restore
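The old and new hunk bodies above share one idea that is easier to see in isolation: lo only ever rises to a verified-stable limit, hi only ever falls to a verified-unstable one, and the search stops once the window is within calibSearchTolerance. A minimal sketch of that invariant, with a hypothetical stableAt probe standing in for a real targeted_power attempt:

```go
// searchStableLimit is a sketch, not the production code: stableAt is a
// hypothetical probe reporting whether a run at w watts stays unthrottled.
func searchStableLimit(minW, startW, tolW int, stableAt func(w int) bool) int {
	lo, hi := minW, startW+1 // hi is exclusive: startW itself is still untested
	w := startW
	for hi-lo > tolW {
		if stableAt(w) {
			lo = w // highest verified-stable limit so far
		} else {
			hi = w // lowest verified-unstable limit so far
		}
		w = (lo + hi) / 2 // bisect the remaining window
	}
	return lo // the highest limit actually verified stable
}
```

With minW=200, startW=400 and tolW=10, for example, an unstable first probe at 400 W narrows the window to [200, 400) and the next probe lands at 300 W; the search then needs at most about log2(200/10) ≈ 5 further probes.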
@@ -2770,28 +2815,6 @@ func isDCGMResourceBusy(err error) bool {
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}

// calibPreThrottlePowerW estimates the GPU power draw just before thermal
// throttle onset by averaging the first quarter of telemetry rows. The early
// samples capture the GPU at peak before clock/power reduction kicks in.
func calibPreThrottlePowerW(rows []GPUMetricRow) float64 {
if len(rows) < 4 {
return 0
}
n := len(rows) / 4
var sum float64
var cnt int
for _, r := range rows[:n] {
if r.PowerW > 0 {
sum += r.PowerW
cnt++
}
}
if cnt == 0 {
return 0
}
return sum / float64(cnt)
}

// roundTo5W rounds w to the nearest 5 W boundary.
func roundTo5W(w int) int {
return ((w + 2) / 5) * 5
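Worked example for roundTo5W: adding 2 before the integer division rounds to the nearest multiple of 5, so roundTo5W(312) = ((312+2)/5)*5 = 310, while roundTo5W(313) = ((313+2)/5)*5 = 315.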
@@ -2808,15 +2831,6 @@ func powerBenchDurationSec(profile string) int {
}
}

func occupiedSlots(indices []int, current int) []int {
out := make([]int, 0, len(indices))
for _, idx := range indices {
if idx != current {
out = append(out, idx)
}
}
return out
}

func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
out := make(map[int]benchmarkGPUInfo, len(src))

@@ -2864,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
b.WriteString("\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
if gpu.OccupiedSlotsNote != "" {
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
}

for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
@@ -2932,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
var allRestoreActions []benchmarkRestoreAction
for _, idx := range selected {
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
calibByIndex[idx] = r
}
}
defer func() {
for i := len(restoreActions) - 1; i >= 0; i-- {
restoreActions[i].fn()
for i := len(allRestoreActions) - 1; i >= 0; i-- {
allRestoreActions[i].fn()
}
}()
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))

@@ -2952,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.OverallStatus = "PARTIAL"
}
}
occupied := occupiedSlots(selected, idx)
note := ""
if len(occupied) > 0 {
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
}
gpus = append(gpus, NvidiaPowerBenchGPU{
Index: idx,
Name: info.Name,

@@ -2968,8 +2989,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
CalibrationAttempts: calib.Attempts,
Derated: calib.Derated,
Status: status,
OccupiedSlots: occupied,
OccupiedSlotsNote: note,
Notes: append([]string(nil), calib.Notes...),
CoolingWarning: calib.CoolingWarning,
})
@@ -3009,14 +3028,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
for _, gpu := range gpus {
singleByIndex[gpu.Index] = gpu
}

// Phase 2: ramp — add one GPU per step and calibrate the growing subset
// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
// targeted_power with derating if degradation is detected.
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
_ = os.MkdirAll(stepDir, 0755)
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
var stepCalib map[int]benchmarkPowerCalibrationResult
if step == 1 {
// Single-GPU step — already measured in phase 1; reuse directly.
stepCalib = calibByIndex
logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
} else {
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
var stepRestore []benchmarkRestoreAction
stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
}
}
ramp := NvidiaPowerBenchStep{
StepIndex: step,
@@ -280,8 +280,6 @@ type NvidiaPowerBenchGPU struct {
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
Derated bool `json:"derated,omitempty"`
Status string `json:"status"`
OccupiedSlots []int `json:"occupied_slots,omitempty"`
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
Notes []string `json:"notes,omitempty"`
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
CoolingWarning string `json:"cooling_warning,omitempty"`

@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
"bee-audit",
"bee-web",
"bee-sshsetup",
"nvidia-dcgm",
"nvidia-fabricmanager",
}

func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
@@ -552,9 +552,13 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
if passes <= 0 {
passes = 1
}
// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
// controller can cause memtester to spin forever on a single subtest.
timeoutSec := sizeMB*passes*150/100 + 120
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc)
}
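Worked example for the timeout bound: with sizeMB = 1024 and passes = 2, timeoutSec = 1024 × 2 × 150/100 + 120 = 3072 + 120 = 3192 s, i.e. roughly 53 minutes for two passes over 1 GiB.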
@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
}

if rampUp && len(body.GPUIndices) > 1 {
// Ramp-up mode: resolve GPU list, then create one task per prefix
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
// in Phase 2 (one additional GPU per step). A single task with all
// selected GPUs is sufficient — spawning N tasks with growing subsets
// would repeat all earlier steps redundantly.
gpus, err := apiListNvidiaGPUs(h.opts.App)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())

@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
} else {
now := time.Now()
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
var allTasks []*Task
for step := 1; step <= len(resolved); step++ {
subset := resolved[:step]
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
t := &Task{
ID: newJobID("bee-bench-nvidia"),
Name: stepName,
Target: target,
Priority: defaultTaskPriority(target, taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), subset...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL && step == len(resolved),
ParallelGPUs: true,
RampStep: step,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
t := &Task{
ID: newJobID("bee-bench-nvidia"),
Name: taskName,
Target: target,
Priority: defaultTaskPriority(target, taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), resolved...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: true,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: taskName,
},
}
for _, t := range allTasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, allTasks)
globalQueue.enqueue(t)
writeTaskRunResponse(w, []*Task{t})
return
}
}
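For example, a ramp-up request selecting GPUs 0 through 3 now enqueues a single task named along the lines of "… · ramp 1–4 · GPU 0,1,2,3"; the subsets [0], [0,1], [0,1,2] and [0,1,2,3] are covered inside that one run by Phase 2 of RunNvidiaPowerBench rather than by four separately queued tasks.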
@@ -1529,6 +1523,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
writeJSON(w, map[string]string{"status": "rolled back"})
}

func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
}

func (h *handler) rollbackPendingNetworkChange() error {
h.pendingNetMu.Lock()
pnc := h.pendingNet
@@ -2002,7 +2002,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div>
</div>

` + renderBenchmarkResultsCard(opts.ExportDir) + `
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`

<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>

@@ -2188,7 +2188,9 @@ function runNvidiaBenchmark(kind) {
if (e.data) failures += 1;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
const isLast = (idx + 1 >= taskIds.length);
streamNext(idx + 1, failures);
if (isLast) { benchmarkRefreshResults(); }
});
benchmarkES.onerror = function() {
if (benchmarkES) {

@@ -2208,18 +2210,30 @@ function runNvidiaBenchmark(kind) {
}

benchmarkLoadGPUs();

function benchmarkRefreshResults() {
fetch('/api/benchmark/results')
.then(function(r) { return r.text(); })
.then(function(html) {
const el = document.getElementById('benchmark-results-section');
if (el) el.innerHTML = html;
})
.catch(function() {});
}
</script>`
}

func renderBenchmarkResultsCard(exportDir string) string {
maxIdx, runs := loadBenchmarkHistory(exportDir)
return renderBenchmarkResultsCardFromRuns(
"Perf Results",
perf := renderBenchmarkResultsCardFromRuns(
"Performance Results",
"Composite score by saved benchmark run and GPU.",
"No saved benchmark runs yet.",
"No saved performance benchmark runs yet.",
maxIdx,
runs,
)
power := renderPowerBenchmarkResultsCard(exportDir)
return perf + "\n" + power
}

func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
@@ -2299,6 +2313,126 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
return maxGPUIndex, runs
}

func renderPowerBenchmarkResultsCard(exportDir string) string {
baseDir := app.DefaultBeeBenchPowerDir
if strings.TrimSpace(exportDir) != "" {
baseDir = filepath.Join(exportDir, "bee-bench", "power")
}
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
if err != nil || len(paths) == 0 {
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
}
sort.Strings(paths)

type powerRun struct {
generatedAt time.Time
displayTime string
result platform.NvidiaPowerBenchResult
}
var runs []powerRun
for _, path := range paths {
raw, err := os.ReadFile(path)
if err != nil {
continue
}
var r platform.NvidiaPowerBenchResult
if err := json.Unmarshal(raw, &r); err != nil {
continue
}
runs = append(runs, powerRun{
generatedAt: r.GeneratedAt,
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
result: r,
})
}
sort.Slice(runs, func(i, j int) bool {
return runs[i].generatedAt.After(runs[j].generatedAt)
})

// Show only the most recent run's GPU slot table, plus a run history summary.
var b strings.Builder
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)

latest := runs[0].result
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
if latest.Hostname != "" {
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
}
if latest.OverallStatus != "" {
statusColor := "var(--ok)"
if latest.OverallStatus != "OK" {
statusColor = "var(--warn)"
}
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
}
b.WriteString(`</p>`)

if len(latest.GPUs) > 0 {
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
b.WriteString(`</tr></thead><tbody>`)
for _, gpu := range latest.GPUs {
derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
rowStyle := ""
achievedStyle := ""
if derated {
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
achievedStyle = ` style="color:#e6a000;font-weight:600"`
}
statusLabel := gpu.Status
if statusLabel == "" {
statusLabel = "OK"
}
statusColor := "var(--ok)"
if statusLabel != "OK" {
statusColor = "var(--warn)"
}
nominalStr := "-"
if gpu.DefaultPowerLimitW > 0 {
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
}
achievedStr := "-"
if gpu.AppliedPowerLimitW > 0 {
achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
}
p95Str := "-"
if gpu.MaxObservedPowerW > 0 {
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
}
b.WriteString(`<tr` + rowStyle + `>`)
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
b.WriteString(`<td>` + nominalStr + `</td>`)
b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
b.WriteString(`<td>` + p95Str + `</td>`)
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
b.WriteString(`</tr>`)
}
b.WriteString(`</tbody></table></div>`)
}

if len(runs) > 1 {
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
for i, run := range runs {
statusColor := "var(--ok)"
if run.result.OverallStatus != "OK" {
statusColor = "var(--warn)"
}
b.WriteString(`<tr>`)
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
b.WriteString(`</tr>`)
}
b.WriteString(`</tbody></table></div></details>`)
}

b.WriteString(`</div></div>`)
return b.String()
}

// ── Burn ──────────────────────────────────────────────────────────────────────

func renderBurn() string {
@@ -263,6 +263,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)

// Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
@@ -1,6 +1,7 @@
DEBIAN_VERSION=12
DEBIAN_KERNEL_ABI=auto
NVIDIA_DRIVER_VERSION=590.48.01
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
NCCL_VERSION=2.28.9-1
NCCL_CUDA_VERSION=13.0
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186

@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
COMGR_VERSION=2.8.0.60304-76~22.04
GO_VERSION=1.24.0
AUDIT_VERSION=1.0.0
MEMTEST_VERSION=6.10-4
@@ -23,9 +23,9 @@ lb config noauto \
--bootloaders "grub-efi,syslinux" \
--debian-installer none \
--archive-areas "main contrib non-free non-free-firmware" \
--mirror-bootstrap "https://deb.debian.org/debian" \
--mirror-chroot "https://deb.debian.org/debian" \
--mirror-binary "https://deb.debian.org/debian" \
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
--mirror-chroot "http://mirror.mephi.ru/debian/" \
--mirror-binary "http://mirror.mephi.ru/debian/" \
--security true \
--linux-flavours "amd64" \
--linux-packages "${LB_LINUX_PACKAGES}" \
@@ -161,6 +161,7 @@ run_variant() {
-e GOMODCACHE=/cache/go-mod \
-e TMPDIR=/cache/tmp \
-e BEE_CACHE_DIR=/cache/bee \
-e BEE_REQUIRE_MEMTEST=1 \
-w /work \
"${IMAGE_REF}" \
sh /work/iso/builder/build.sh --variant "${_v}" \

@@ -175,6 +176,7 @@ run_variant() {
-e GOMODCACHE=/cache/go-mod \
-e TMPDIR=/cache/tmp \
-e BEE_CACHE_DIR=/cache/bee \
-e BEE_REQUIRE_MEMTEST=1 \
-w /work \
"${IMAGE_REF}" \
sh /work/iso/builder/build.sh --variant "${_v}"
@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT

. "${BUILDER_DIR}/VERSIONS"
export MEMTEST_VERSION
export PATH="$PATH:/usr/local/go/bin"
: "${BEE_REQUIRE_MEMTEST:=0}"
@@ -775,6 +776,7 @@ run_optional_step_sh() {
return 0
fi

mkdir -p "${LOG_DIR}" 2>/dev/null || true
step_log="${LOG_DIR}/${step_slug}.log"
echo ""
echo "=== optional step: ${step_name} ==="
@@ -798,13 +800,14 @@ start_build_log
# install them on the fly so NVIDIA modules and ISO kernel always match.
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
echo "=== refreshing apt index to detect current kernel ABI ==="
apt-get update -qq
apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
| head -1)
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
exit 1
fi
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
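Worked example of the detection pipeline above: if linux-image-amd64 currently depends on a package such as linux-image-6.1.0-28-amd64 (a hypothetical ABI, for illustration), the awk/grep chain extracts 6.1.0-28 and that value becomes DEBIAN_KERNEL_ABI.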
@@ -1259,6 +1262,7 @@ fi
# --- substitute version placeholders in package list and archive ---
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
sed -i \
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then

@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
# Enable GPU-vendor specific services
if [ "$GPU_VENDOR" = "nvidia" ]; then
systemctl enable nvidia-dcgm.service 2>/dev/null || true
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
systemctl enable bee-nvidia.service
elif [ "$GPU_VENDOR" = "amd" ]; then
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
@@ -5,6 +5,8 @@ set -e

: "${BEE_REQUIRE_MEMTEST:=0}"

# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
BINARY_BOOT_DIR="binary/boot"
GRUB_CFG="binary/boot/grub/grub.cfg"

@@ -24,15 +26,23 @@ fail_or_warn() {
return 0
}

# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
# The template already has memtest entries hardcoded, so a missing config file
# here is not an error; validate_iso_memtest() checks the final ISO instead.
warn_only() {
log "WARNING: $1"
}

copy_memtest_file() {
src="$1"
base="$(basename "$src")"
dst="${BINARY_BOOT_DIR}/${base}"
dst_name="${2:-$(basename "$src")}"
dst="${BINARY_BOOT_DIR}/${dst_name}"

[ -f "$src" ] || return 1
mkdir -p "${BINARY_BOOT_DIR}"
cp "$src" "$dst"
log "copied ${base} from ${src}"
log "copied ${dst_name} from ${src}"
}

extract_memtest_from_deb() {

@@ -41,14 +51,44 @@ extract_memtest_from_deb() {

log "extracting memtest payload from ${deb}"
dpkg-deb -x "$deb" "$tmpdir"
for f in ${MEMTEST_FILES}; do
if [ -f "${tmpdir}/boot/${f}" ]; then
copy_memtest_file "${tmpdir}/boot/${f}"
fi
done

# EFI binary: both 5.x and 6.x use memtest86+x64.efi
if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
fi

# BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
fi

rm -rf "$tmpdir"
}

download_and_extract_memtest() {
tmpdl="$(mktemp -d)"
if [ -n "${MEMTEST_VERSION:-}" ]; then
pkg_spec="memtest86+=${MEMTEST_VERSION}"
else
pkg_spec="memtest86+"
fi
log "downloading ${pkg_spec} from apt"
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
log "apt download failed, retrying after apt-get update"
apt-get update -qq >/dev/null 2>&1 || true
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
fi
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
if [ -n "$deb" ]; then
extract_memtest_from_deb "$deb"
else
log "apt download of memtest86+ failed"
fi
rm -rf "$tmpdl"
}

ensure_memtest_binaries() {
missing=0
for f in ${MEMTEST_FILES}; do

@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
done
[ "$missing" -eq 1 ] || return 0

# 1. Try files already placed by lb binary_memtest or chroot
for root in chroot/boot /boot; do
for f in ${MEMTEST_FILES}; do
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
done
# 6.x BIOS binary may lack x64 in name — copy with normalised name
if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
fi
done

missing=0

@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
done
[ "$missing" -eq 1 ] || return 0

# 2. Try apt package cache (may be empty if lb binary_memtest already purged)
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
[ -d "$root" ] || continue
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"

@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
break
done

missing=0
for f in ${MEMTEST_FILES}; do
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
done
[ "$missing" -eq 1 ] || return 0

# 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
download_and_extract_memtest

missing=0
for f in ${MEMTEST_FILES}; do
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then

@@ -88,7 +143,7 @@ ensure_memtest_binaries() {

ensure_grub_entry() {
[ -f "$GRUB_CFG" ] || {
fail_or_warn "missing ${GRUB_CFG}"
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
return 0
}

@@ -114,7 +169,7 @@ EOF

ensure_isolinux_entry() {
[ -f "$ISOLINUX_CFG" ] || {
fail_or_warn "missing ${ISOLINUX_CFG}"
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
return 0
}
@@ -5,6 +5,7 @@
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
# explicitly.
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
@@ -258,6 +258,22 @@ else
log "WARN: nvidia-smi not found — cannot enable persistence mode"
fi

# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
# systems CUDA/DCGM can report "system not yet initialized" until fabric
# training completes under nvidia-fabricmanager.
if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
log "nvidia-fabricmanager restarted"
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
log "nvidia-fabricmanager started"
else
log "WARN: failed to start nvidia-fabricmanager.service"
systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/  fabricmanager: /' || true
fi
else
log "WARN: nvidia-fabricmanager.service not installed"
fi

# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can