Compare commits
16 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 04eb4b5a6d | |
| | 4110dbf8a6 | |
| | 7237e4d3e4 | |
| | ab3ad77cd6 | |
| | cd9e2cbe13 | |
| | 0317dc58fd | |
| | 1c5cb45698 | |
| | 090b92ca73 | |
| | 2dccbc010c | |
| | e84c69d360 | |
| | c80a39e7ac | |
| | a5e0261ff2 | |
| | ee422ede3c | |
| | d560b2fead | |
| | 3cf2e9c9dc | |
| | 19dbabd71d | |
@@ -5,22 +5,18 @@ go 1.25.0
replace reanimator/chart => ../internal/chart

require (
github.com/go-analyze/charts v0.5.26
modernc.org/sqlite v1.48.0
reanimator/chart v0.0.0-00010101000000-000000000000
)

require (
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/go-analyze/bulk v0.1.3 // indirect
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
golang.org/x/image v0.24.0 // indirect
golang.org/x/sys v0.42.0 // indirect
modernc.org/libc v1.70.0 // indirect
modernc.org/libc v1.72.0 // indirect
modernc.org/mathutil v1.7.1 // indirect
modernc.org/memory v1.11.0 // indirect
modernc.org/sqlite v1.48.0 // indirect
)
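For context on the hunk above: a local `replace` directive in Go modules must still be paired with a `require` entry, and because the local module has no published tags, Go records the all-zero placeholder pseudo-version. A minimal sketch of the pattern (the parent module name is hypothetical; the chart module names are as in the diff):

```go
// go.mod — local-path override for an unpublished module.
module example // hypothetical parent module name

go 1.25.0

// Point the import path at a directory on disk instead of a module proxy.
replace reanimator/chart => ../internal/chart

require (
	// Placeholder pseudo-version: the module has no real released version;
	// the replace directive above supplies the actual source.
	reanimator/chart v0.0.0-00010101000000-000000000000
)
```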
audit/go.sum
@@ -1,37 +1,51 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
@@ -22,6 +22,8 @@ var supportBundleServices = []string{
"bee-selfheal.service",
"bee-selfheal.timer",
"bee-sshsetup.service",
"nvidia-dcgm.service",
"nvidia-fabricmanager.service",
}

var supportBundleCommands = []struct {
@@ -48,6 +50,43 @@ else
fi
`}},
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
if command -v nvidia-smi >/dev/null 2>&1; then
nvidia-smi topo -m 2>&1 || true
else
echo "nvidia-smi not found"
fi
`}},
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
if ! command -v systemctl >/dev/null 2>&1; then
echo "systemctl not found"
exit 0
fi
echo "=== unit files ==="
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
echo
echo "=== active units ==="
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
echo
echo "=== failed units ==="
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
`}},
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
for candidate in \
/usr/bin/nvidia-fabricmanager \
/usr/bin/nv-fabricmanager \
/usr/bin/nvidia-fabricmanagerd \
/usr/bin/nvlsm; do
if [ -e "$candidate" ]; then
echo "=== $candidate ==="
ls -l "$candidate" 2>&1 || true
echo
fi
done
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
echo "no fabric manager binaries found"
fi
`}},
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
if ! command -v lspci >/dev/null 2>&1; then
echo "lspci not found"
@@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
}{
{name: "system/kern.log", src: "/var/log/kern.log"},
{name: "system/syslog.txt", src: "/var/log/syslog"},
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
}

const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
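A note on the bundle glob above: in Go's `filepath.Match` semantics, `?` matches exactly one non-separator character and `*` matches any run of them, so the pattern selects names with a date prefix plus a "(BEE-SP…)" tag. A minimal, runnable sketch (the file names are hypothetical, for illustration only):

```go
package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
	for _, name := range []string{
		"2025-01-15 (BEE-SP01) support.tar.gz", // date prefix + tag: matches
		"bundle.tar.gz",                        // no date prefix: no match
	} {
		ok, err := filepath.Match(supportBundleGlob, name)
		fmt.Println(name, ok, err)
	}
}
```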
@@ -2476,9 +2476,6 @@ func runBenchmarkPowerCalibration(
// calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10

// calibPreThrottleMarginW is subtracted from the telemetry-estimated
// pre-throttle power draw to produce a smarter initial search candidate.
const calibPreThrottleMarginW = 10

// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the
@@ -2501,8 +2498,25 @@ func runBenchmarkPowerCalibration(
err error
}

// gpuCalibState holds per-GPU binary search state during parallel calibration.
type gpuCalibState struct {
idx int
info benchmarkGPUInfo
originalLimitW int
appliedLimitW int
minLimitW int
lo int // highest verified-stable limit (assumed: minLimitW)
hi int // lowest verified-unstable limit (exclusive sentinel above start)
calib benchmarkPowerCalibrationResult
converged bool
}

results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction

// Initialise per-GPU state.
states := make([]*gpuCalibState, 0, len(gpuIndices))
for _, idx := range gpuIndices {
info := infoByIndex[idx]
originalLimitW := int(math.Round(info.PowerLimitW))

@@ -2531,17 +2545,17 @@ func runBenchmarkPowerCalibration(
if minLimitW < calibSearchTolerance {
minLimitW = calibSearchTolerance
}

calib := benchmarkPowerCalibrationResult{
AppliedPowerLimitW: float64(appliedLimitW),
s := &gpuCalibState{
idx: idx,
info: info,
originalLimitW: originalLimitW,
appliedLimitW: appliedLimitW,
minLimitW: minLimitW,
lo: minLimitW,
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
}
// Binary search bounds for finding the highest stable power limit.
// lo = highest verified-stable level (assumed: minLimitW).
// hi = lowest verified-unstable level (assumed: above the starting limit).
lo := minLimitW
hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
states = append(states, s)
if canDerate && originalLimitW > 0 {
idxCopy := idx
orig := originalLimitW

@@ -2552,212 +2566,243 @@ func runBenchmarkPowerCalibration(
},
})
}
}

calibLoop:
// Shared DCGM resource-busy back-off state (single diagnostic session).
busyRetries := 0
busyDelaySec := 1
sharedAttempt := 0

type sharedAttemptResult struct {
out []byte
rows []GPUMetricRow
err error
}

calibDone:
for {
// Collect non-converged GPUs.
var active []*gpuCalibState
for _, s := range states {
if !s.converged {
active = append(active, s)
}
}
if len(active) == 0 || ctx.Err() != nil {
break
}

sharedAttempt++
for _, s := range active {
s.calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
}

// Snapshot throttle counters for all active GPUs before the run.
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
for _, s := range active {
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
}

// Run targeted_power for ALL gpuIndices simultaneously so every card
// is under load during calibration — this reflects real server thermals.
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
}()

ticker := time.NewTicker(time.Second)
throttleReasons := make(map[int]string, len(active))
var ar sharedAttemptResult

attemptLoop:
for {
calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))

beforeThrottle, _ := queryThrottleCounters(idx)
attemptCtx, cancel := context.WithCancel(ctx)
doneCh := make(chan calibrationAttemptResult, 1)
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
}()

ticker := time.NewTicker(time.Second)
var (
attempt calibrationAttemptResult
throttleReason string
)
attemptLoop:
for {
select {
case attempt = <-doneCh:
break attemptLoop
case <-ticker.C:
afterThrottle, err := queryThrottleCounters(idx)
select {
case ar = <-doneCh:
break attemptLoop
case <-ticker.C:
// Poll throttle counters for each active GPU independently.
for _, s := range active {
if throttleReasons[s.idx] != "" {
continue // already detected for this GPU
}
after, err := queryThrottleCounters(s.idx)
if err != nil {
continue
}
// Record the throttle reason but do NOT cancel the dcgmi
// process. Killing it mid-run leaves nv-hostengine holding
// the diagnostic slot, which causes DCGM_ST_IN_USE on every
// subsequent attempt. Let targeted_power run to its natural
// end so the daemon releases the slot cleanly before we
// reduce power and retry.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
throttleReason = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
// Record throttle but do NOT cancel — let dcgmi finish so
// nv-hostengine releases the slot cleanly before the next attempt.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
throttleReasons[s.idx] = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
}
case <-ctx.Done():
cancel()
attempt = <-doneCh
break attemptLoop
}
case <-ctx.Done():
cancelAttempt()
ar = <-doneCh
break attemptLoop
}
ticker.Stop()
cancel()
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
}
ticker.Stop()
cancelAttempt()
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)

perGPU := filterRowsByGPU(attempt.rows, idx)
// Resource busy: retry with exponential back-off (shared — one DCGM session).
if ar.err != nil && isDCGMResourceBusy(ar.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
for _, s := range active {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
s.converged = true
}
logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
break calibDone
}
busyRetries++
// Undo attempt counter: busy retries don't count as real attempts.
for _, s := range active {
s.calib.Attempts--
}
logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
select {
case <-ctx.Done():
break calibDone
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1
}
busyDelaySec = next
sharedAttempt-- // retry same logical attempt number
continue
}
busyRetries = 0
busyDelaySec = 1

// Per-GPU analysis and binary search update.
for _, s := range active {
perGPU := filterRowsByGPU(ar.rows, s.idx)
summary := summarizeBenchmarkTelemetry(perGPU)
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
// Stable at appliedLimitW: record it and binary-search upward.
calib.Summary = summary
calib.Completed = true
calib.AppliedPowerLimitW = float64(appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
lo = appliedLimitW
// If there is still headroom to search, try a higher level.
if canDerate && hi-lo > calibSearchTolerance {
nextLimitW := roundTo5W((lo + hi) / 2)
if nextLimitW > lo && nextLimitW < hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil {
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW))
continue calibLoop
throttle := throttleReasons[s.idx]

// Cooling warning: thermal throttle with fans not at maximum.
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64
fanDutyAvail := false
for _, r := range perGPU {
if r.ClockMHz > 0 {
clocks = append(clocks, r.ClockMHz)
}
if r.FanDutyCycleAvailable {
fanDutyAvail = true
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
}
}
dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
s.calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttle, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
}
}

if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
// Stable at current limit — update lo and binary-search upward.
s.calib.Summary = summary
s.calib.Completed = true
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
s.lo = s.appliedLimitW
if canDerate && s.hi-s.lo > calibSearchTolerance {
next := roundTo5W((s.lo + s.hi) / 2)
if next > s.lo && next < s.hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Completed = false // keep searching
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
continue // next GPU in active list
}
}
}
break
s.converged = true
continue
}

// If DCGM reports the resource is in use, nv-hostengine has not yet
// released the diagnostic slot from the previous attempt. Do not
// derate: wait with exponential back-off and retry at the same
// power limit. Once the back-off delay would exceed
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
// held by something else.
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
break
}
busyRetries++
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
select {
case <-ctx.Done():
break calibLoop
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
}
busyDelaySec = next
continue calibLoop
}
busyRetries = 0 // reset on any non-busy outcome
busyDelaySec = 1 // reset back-off

// Failed or throttled — log and binary-search downward.
switch {
case throttleReason != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
// Check whether the thermal throttle coincided with fans below
// maximum: that combination suggests cooling misconfiguration
// rather than a fundamental power-delivery limit.
if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64
fanDutyAvail := false
for _, r := range perGPU {
if r.ClockMHz > 0 {
clocks = append(clocks, r.ClockMHz)
}
if r.FanDutyCycleAvailable {
fanDutyAvail = true
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
}
}
dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttleReason, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
}
}
case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
case throttle != "":
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
case ar.err != nil:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
default:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
}

if !canDerate || appliedLimitW <= 0 {
break
if !canDerate || s.appliedLimitW <= 0 {
s.converged = true
continue
}
// Binary-search for the highest stable power limit.
// This attempt failed or throttled, so update the upper bound.
hi = appliedLimitW
s.hi = s.appliedLimitW

if hi-lo <= calibSearchTolerance {
// Search range exhausted: lo is the highest verified-stable level.
if lo > minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil {
appliedLimitW = lo
calib.AppliedPowerLimitW = float64(lo)
calib.Derated = lo < originalLimitW
if s.hi-s.lo <= calibSearchTolerance {
if s.lo > s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
s.appliedLimitW = s.lo
s.calib.AppliedPowerLimitW = float64(s.lo)
s.calib.Derated = s.lo < s.originalLimitW
}
} else {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
}
break
s.converged = true
continue
}

// Compute the next candidate.
// For thermal throttle: use the pre-throttle power draw from telemetry
// as a smarter initial estimate instead of the binary midpoint — it
// lands much closer to the true limit on the first attempt.
nextLimitW := (lo + hi) / 2
if strings.Contains(throttleReason, "thermal") {
if onsetW := calibPreThrottlePowerW(perGPU); onsetW > 0 {
candidate := roundTo5W(int(math.Round(onsetW)) - calibPreThrottleMarginW)
if candidate > lo && candidate < hi {
nextLimitW = candidate
}
}
next := roundTo5W((s.lo + s.hi) / 2)
if next <= s.lo {
next = s.lo + calibSearchTolerance
}
nextLimitW = roundTo5W(nextLimitW)
// Ensure the candidate is strictly inside the search range.
if nextLimitW <= lo {
nextLimitW = lo + calibSearchTolerance
if next >= s.hi {
next = (s.lo + s.hi) / 2
}
if nextLimitW >= hi {
nextLimitW = (lo + hi) / 2
if next < s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.converged = true
continue
}
if nextLimitW < minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
break
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
s.converged = true
continue
}
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err))
break
}
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Derated = appliedLimitW < originalLimitW
info.PowerLimitW = float64(appliedLimitW)
infoByIndex[idx] = info
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Derated = next < s.originalLimitW
s.info.PowerLimitW = float64(next)
infoByIndex[s.idx] = s.info
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
}
}

if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
results[idx] = calib
for _, s := range states {
if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
results[s.idx] = s.calib
}
}
return results, restore
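The old and new hunk bodies above share one idea that is easier to see in isolation: lo only ever rises to a verified-stable limit, hi only ever falls to a verified-unstable one, and the search stops once the window is within calibSearchTolerance. A minimal sketch of that invariant, with a hypothetical stableAt probe standing in for a real targeted_power attempt:

```go
// searchStableLimit is a sketch, not the production code: stableAt is a
// hypothetical probe reporting whether a run at w watts stays unthrottled.
func searchStableLimit(minW, startW, tolW int, stableAt func(w int) bool) int {
	lo, hi := minW, startW+1 // hi is exclusive: startW itself is still untested
	w := startW
	for hi-lo > tolW {
		if stableAt(w) {
			lo = w // highest verified-stable limit so far
		} else {
			hi = w // lowest verified-unstable limit so far
		}
		w = (lo + hi) / 2 // bisect the remaining window
	}
	return lo // the highest limit actually verified stable
}
```

With minW=200, startW=400 and tolW=10, for example, an unstable first probe at 400 W narrows the window to [200, 400) and the next probe lands at 300 W; the search then needs at most about log2(200/10) ≈ 5 further probes.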
@@ -2770,28 +2815,6 @@ func isDCGMResourceBusy(err error) bool {
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}

// calibPreThrottlePowerW estimates the GPU power draw just before thermal
// throttle onset by averaging the first quarter of telemetry rows. The early
// samples capture the GPU at peak before clock/power reduction kicks in.
func calibPreThrottlePowerW(rows []GPUMetricRow) float64 {
if len(rows) < 4 {
return 0
}
n := len(rows) / 4
var sum float64
var cnt int
for _, r := range rows[:n] {
if r.PowerW > 0 {
sum += r.PowerW
cnt++
}
}
if cnt == 0 {
return 0
}
return sum / float64(cnt)
}

// roundTo5W rounds w to the nearest 5 W boundary.
func roundTo5W(w int) int {
return ((w + 2) / 5) * 5
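Worked example for roundTo5W: adding 2 before the integer division rounds to the nearest multiple of 5, so roundTo5W(312) = ((312+2)/5)*5 = 310, while roundTo5W(313) = ((313+2)/5)*5 = 315.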
@@ -2808,15 +2831,6 @@ func powerBenchDurationSec(profile string) int {
}
}

func occupiedSlots(indices []int, current int) []int {
out := make([]int, 0, len(indices))
for _, idx := range indices {
if idx != current {
out = append(out, idx)
}
}
return out
}

func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
out := make(map[int]benchmarkGPUInfo, len(src))

@@ -2864,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
b.WriteString("\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
if gpu.OccupiedSlotsNote != "" {
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
}

for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
@@ -2932,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
var allRestoreActions []benchmarkRestoreAction
for _, idx := range selected {
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
calibByIndex[idx] = r
}
}
defer func() {
for i := len(restoreActions) - 1; i >= 0; i-- {
restoreActions[i].fn()
for i := len(allRestoreActions) - 1; i >= 0; i-- {
allRestoreActions[i].fn()
}
}()
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))

@@ -2952,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.OverallStatus = "PARTIAL"
}
}
occupied := occupiedSlots(selected, idx)
note := ""
if len(occupied) > 0 {
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
}
gpus = append(gpus, NvidiaPowerBenchGPU{
Index: idx,
Name: info.Name,

@@ -2968,8 +2989,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
CalibrationAttempts: calib.Attempts,
Derated: calib.Derated,
Status: status,
OccupiedSlots: occupied,
OccupiedSlotsNote: note,
Notes: append([]string(nil), calib.Notes...),
CoolingWarning: calib.CoolingWarning,
})
@@ -3009,14 +3028,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
for _, gpu := range gpus {
singleByIndex[gpu.Index] = gpu
}

// Phase 2: ramp — add one GPU per step and calibrate the growing subset
// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
// targeted_power with derating if degradation is detected.
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
_ = os.MkdirAll(stepDir, 0755)
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
var stepCalib map[int]benchmarkPowerCalibrationResult
if step == 1 {
// Single-GPU step — already measured in phase 1; reuse directly.
stepCalib = calibByIndex
logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
} else {
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
var stepRestore []benchmarkRestoreAction
stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
}
}
ramp := NvidiaPowerBenchStep{
StepIndex: step,
@@ -280,8 +280,6 @@ type NvidiaPowerBenchGPU struct {
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
Derated bool `json:"derated,omitempty"`
Status string `json:"status"`
OccupiedSlots []int `json:"occupied_slots,omitempty"`
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
Notes []string `json:"notes,omitempty"`
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
CoolingWarning string `json:"cooling_warning,omitempty"`

@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
"bee-audit",
"bee-web",
"bee-sshsetup",
"nvidia-dcgm",
"nvidia-fabricmanager",
}

func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
@@ -552,9 +552,13 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
if passes <= 0 {
passes = 1
}
// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
// controller can cause memtester to spin forever on a single subtest.
timeoutSec := sizeMB*passes*150/100 + 120
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc)
}
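Worked example for the timeout bound: with sizeMB = 1024 and passes = 2, timeoutSec = 1024 × 2 × 150/100 + 120 = 3072 + 120 = 3192 s, i.e. roughly 53 minutes for two passes over 1 GiB.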
@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
}

if rampUp && len(body.GPUIndices) > 1 {
// Ramp-up mode: resolve GPU list, then create one task per prefix
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
// in Phase 2 (one additional GPU per step). A single task with all
// selected GPUs is sufficient — spawning N tasks with growing subsets
// would repeat all earlier steps redundantly.
gpus, err := apiListNvidiaGPUs(h.opts.App)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())

@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
} else {
now := time.Now()
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
var allTasks []*Task
for step := 1; step <= len(resolved); step++ {
subset := resolved[:step]
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
t := &Task{
ID: newJobID("bee-bench-nvidia"),
Name: stepName,
Target: target,
Priority: defaultTaskPriority(target, taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), subset...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL && step == len(resolved),
ParallelGPUs: true,
RampStep: step,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
t := &Task{
ID: newJobID("bee-bench-nvidia"),
Name: taskName,
Target: target,
Priority: defaultTaskPriority(target, taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), resolved...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: true,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: taskName,
},
}
for _, t := range allTasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, allTasks)
globalQueue.enqueue(t)
writeTaskRunResponse(w, []*Task{t})
return
}
}
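For example, a ramp-up request selecting GPUs 0 through 3 now enqueues a single task named along the lines of "… · ramp 1–4 · GPU 0,1,2,3"; the subsets [0], [0,1], [0,1,2] and [0,1,2,3] are covered inside that one run by Phase 2 of RunNvidiaPowerBench rather than by four separately queued tasks.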
@@ -1529,6 +1523,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
writeJSON(w, map[string]string{"status": "rolled back"})
}

func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
}

func (h *handler) rollbackPendingNetworkChange() error {
h.pendingNetMu.Lock()
pnc := h.pendingNet
@@ -2002,7 +2002,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div>
</div>

` + renderBenchmarkResultsCard(opts.ExportDir) + `
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`

<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>

@@ -2188,7 +2188,9 @@ function runNvidiaBenchmark(kind) {
if (e.data) failures += 1;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
const isLast = (idx + 1 >= taskIds.length);
streamNext(idx + 1, failures);
if (isLast) { benchmarkRefreshResults(); }
});
benchmarkES.onerror = function() {
if (benchmarkES) {

@@ -2208,18 +2210,30 @@ function runNvidiaBenchmark(kind) {
}

benchmarkLoadGPUs();

function benchmarkRefreshResults() {
fetch('/api/benchmark/results')
.then(function(r) { return r.text(); })
.then(function(html) {
const el = document.getElementById('benchmark-results-section');
if (el) el.innerHTML = html;
})
.catch(function() {});
}
</script>`
}

func renderBenchmarkResultsCard(exportDir string) string {
maxIdx, runs := loadBenchmarkHistory(exportDir)
return renderBenchmarkResultsCardFromRuns(
"Perf Results",
perf := renderBenchmarkResultsCardFromRuns(
"Performance Results",
"Composite score by saved benchmark run and GPU.",
"No saved benchmark runs yet.",
"No saved performance benchmark runs yet.",
maxIdx,
runs,
)
power := renderPowerBenchmarkResultsCard(exportDir)
return perf + "\n" + power
}

func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
@@ -2299,6 +2313,126 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
return maxGPUIndex, runs
}

func renderPowerBenchmarkResultsCard(exportDir string) string {
baseDir := app.DefaultBeeBenchPowerDir
if strings.TrimSpace(exportDir) != "" {
baseDir = filepath.Join(exportDir, "bee-bench", "power")
}
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
if err != nil || len(paths) == 0 {
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
}
sort.Strings(paths)

type powerRun struct {
generatedAt time.Time
displayTime string
result platform.NvidiaPowerBenchResult
}
var runs []powerRun
for _, path := range paths {
raw, err := os.ReadFile(path)
if err != nil {
continue
}
var r platform.NvidiaPowerBenchResult
if err := json.Unmarshal(raw, &r); err != nil {
continue
}
runs = append(runs, powerRun{
generatedAt: r.GeneratedAt,
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
result: r,
})
}
sort.Slice(runs, func(i, j int) bool {
return runs[i].generatedAt.After(runs[j].generatedAt)
})

// Show only the most recent run's GPU slot table, plus a run history summary.
var b strings.Builder
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)

latest := runs[0].result
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
if latest.Hostname != "" {
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
}
if latest.OverallStatus != "" {
statusColor := "var(--ok)"
if latest.OverallStatus != "OK" {
statusColor = "var(--warn)"
}
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
}
b.WriteString(`</p>`)

if len(latest.GPUs) > 0 {
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
b.WriteString(`</tr></thead><tbody>`)
for _, gpu := range latest.GPUs {
derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
rowStyle := ""
achievedStyle := ""
if derated {
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
achievedStyle = ` style="color:#e6a000;font-weight:600"`
}
statusLabel := gpu.Status
if statusLabel == "" {
statusLabel = "OK"
}
statusColor := "var(--ok)"
if statusLabel != "OK" {
statusColor = "var(--warn)"
}
nominalStr := "-"
if gpu.DefaultPowerLimitW > 0 {
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
}
achievedStr := "-"
if gpu.AppliedPowerLimitW > 0 {
achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
}
p95Str := "-"
if gpu.MaxObservedPowerW > 0 {
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
}
b.WriteString(`<tr` + rowStyle + `>`)
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
b.WriteString(`<td>` + nominalStr + `</td>`)
b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
b.WriteString(`<td>` + p95Str + `</td>`)
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
b.WriteString(`</tr>`)
}
b.WriteString(`</tbody></table></div>`)
}

if len(runs) > 1 {
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
for i, run := range runs {
statusColor := "var(--ok)"
if run.result.OverallStatus != "OK" {
statusColor = "var(--warn)"
}
b.WriteString(`<tr>`)
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
b.WriteString(`</tr>`)
}
b.WriteString(`</tbody></table></div></details>`)
}

b.WriteString(`</div></div>`)
return b.String()
}

// ── Burn ──────────────────────────────────────────────────────────────────────

func renderBurn() string {
@@ -263,6 +263,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)

// Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
@@ -1,6 +1,7 @@
DEBIAN_VERSION=12
DEBIAN_KERNEL_ABI=auto
NVIDIA_DRIVER_VERSION=590.48.01
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
NCCL_VERSION=2.28.9-1
NCCL_CUDA_VERSION=13.0
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186

@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
COMGR_VERSION=2.8.0.60304-76~22.04
GO_VERSION=1.24.0
AUDIT_VERSION=1.0.0
MEMTEST_VERSION=6.10-4
@@ -23,9 +23,9 @@ lb config noauto \
--bootloaders "grub-efi,syslinux" \
--debian-installer none \
--archive-areas "main contrib non-free non-free-firmware" \
--mirror-bootstrap "https://deb.debian.org/debian" \
--mirror-chroot "https://deb.debian.org/debian" \
--mirror-binary "https://deb.debian.org/debian" \
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
--mirror-chroot "http://mirror.mephi.ru/debian/" \
--mirror-binary "http://mirror.mephi.ru/debian/" \
--security true \
--linux-flavours "amd64" \
--linux-packages "${LB_LINUX_PACKAGES}" \
@@ -161,6 +161,7 @@ run_variant() {
-e GOMODCACHE=/cache/go-mod \
-e TMPDIR=/cache/tmp \
-e BEE_CACHE_DIR=/cache/bee \
-e BEE_REQUIRE_MEMTEST=1 \
-w /work \
"${IMAGE_REF}" \
sh /work/iso/builder/build.sh --variant "${_v}" \

@@ -175,6 +176,7 @@ run_variant() {
-e GOMODCACHE=/cache/go-mod \
-e TMPDIR=/cache/tmp \
-e BEE_CACHE_DIR=/cache/bee \
-e BEE_REQUIRE_MEMTEST=1 \
-w /work \
"${IMAGE_REF}" \
sh /work/iso/builder/build.sh --variant "${_v}"
@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT

. "${BUILDER_DIR}/VERSIONS"
export MEMTEST_VERSION
export PATH="$PATH:/usr/local/go/bin"
: "${BEE_REQUIRE_MEMTEST:=0}"
@@ -775,6 +776,7 @@ run_optional_step_sh() {
return 0
fi

mkdir -p "${LOG_DIR}" 2>/dev/null || true
step_log="${LOG_DIR}/${step_slug}.log"
echo ""
echo "=== optional step: ${step_name} ==="
@@ -798,13 +800,14 @@ start_build_log
# install them on the fly so NVIDIA modules and ISO kernel always match.
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
echo "=== refreshing apt index to detect current kernel ABI ==="
apt-get update -qq
apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
| head -1)
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
exit 1
fi
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
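Worked example of the detection pipeline above: if linux-image-amd64 currently depends on a package such as linux-image-6.1.0-28-amd64 (a hypothetical ABI, for illustration), the awk/grep chain extracts 6.1.0-28 and that value becomes DEBIAN_KERNEL_ABI.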
@@ -1259,6 +1262,7 @@ fi
# --- substitute version placeholders in package list and archive ---
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
sed -i \
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then

@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
# Enable GPU-vendor specific services
if [ "$GPU_VENDOR" = "nvidia" ]; then
systemctl enable nvidia-dcgm.service 2>/dev/null || true
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
systemctl enable bee-nvidia.service
elif [ "$GPU_VENDOR" = "amd" ]; then
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
@@ -5,6 +5,8 @@ set -e

: "${BEE_REQUIRE_MEMTEST:=0}"

# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
BINARY_BOOT_DIR="binary/boot"
GRUB_CFG="binary/boot/grub/grub.cfg"

@@ -24,15 +26,23 @@ fail_or_warn() {
return 0
}

# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
# The template already has memtest entries hardcoded, so a missing config file
# here is not an error; validate_iso_memtest() checks the final ISO instead.
warn_only() {
log "WARNING: $1"
}

copy_memtest_file() {
src="$1"
base="$(basename "$src")"
dst="${BINARY_BOOT_DIR}/${base}"
dst_name="${2:-$(basename "$src")}"
dst="${BINARY_BOOT_DIR}/${dst_name}"

[ -f "$src" ] || return 1
mkdir -p "${BINARY_BOOT_DIR}"
cp "$src" "$dst"
log "copied ${base} from ${src}"
log "copied ${dst_name} from ${src}"
}

extract_memtest_from_deb() {

@@ -41,14 +51,44 @@ extract_memtest_from_deb() {

log "extracting memtest payload from ${deb}"
dpkg-deb -x "$deb" "$tmpdir"
for f in ${MEMTEST_FILES}; do
if [ -f "${tmpdir}/boot/${f}" ]; then
copy_memtest_file "${tmpdir}/boot/${f}"
fi
done

# EFI binary: both 5.x and 6.x use memtest86+x64.efi
if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
fi

# BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
fi

rm -rf "$tmpdir"
}

download_and_extract_memtest() {
tmpdl="$(mktemp -d)"
if [ -n "${MEMTEST_VERSION:-}" ]; then
pkg_spec="memtest86+=${MEMTEST_VERSION}"
else
pkg_spec="memtest86+"
fi
log "downloading ${pkg_spec} from apt"
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
log "apt download failed, retrying after apt-get update"
apt-get update -qq >/dev/null 2>&1 || true
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
fi
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
if [ -n "$deb" ]; then
extract_memtest_from_deb "$deb"
else
log "apt download of memtest86+ failed"
fi
rm -rf "$tmpdl"
}

ensure_memtest_binaries() {
missing=0
for f in ${MEMTEST_FILES}; do

@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
done
[ "$missing" -eq 1 ] || return 0

# 1. Try files already placed by lb binary_memtest or chroot
for root in chroot/boot /boot; do
for f in ${MEMTEST_FILES}; do
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
done
# 6.x BIOS binary may lack x64 in name — copy with normalised name
if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
fi
done

missing=0

@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
done
[ "$missing" -eq 1 ] || return 0

# 2. Try apt package cache (may be empty if lb binary_memtest already purged)
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
[ -d "$root" ] || continue
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"

@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
break
done

missing=0
for f in ${MEMTEST_FILES}; do
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
done
[ "$missing" -eq 1 ] || return 0

# 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
download_and_extract_memtest

missing=0
for f in ${MEMTEST_FILES}; do
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then

@@ -88,7 +143,7 @@ ensure_memtest_binaries() {

ensure_grub_entry() {
[ -f "$GRUB_CFG" ] || {
fail_or_warn "missing ${GRUB_CFG}"
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
return 0
}

@@ -114,7 +169,7 @@ EOF

ensure_isolinux_entry() {
[ -f "$ISOLINUX_CFG" ] || {
fail_or_warn "missing ${ISOLINUX_CFG}"
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
return 0
}
@@ -5,6 +5,7 @@
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
# explicitly.
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
@@ -258,6 +258,22 @@ else
log "WARN: nvidia-smi not found — cannot enable persistence mode"
fi

# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
# systems CUDA/DCGM can report "system not yet initialized" until fabric
# training completes under nvidia-fabricmanager.
if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
log "nvidia-fabricmanager restarted"
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
log "nvidia-fabricmanager started"
else
log "WARN: failed to start nvidia-fabricmanager.service"
systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/  fabricmanager: /' || true
fi
else
log "WARN: nvidia-fabricmanager.service not installed"
fi

# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can