Compare commits
32 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 30aa30cd67 | |||
| 4f76e1de21 | |||
| 3732e64a4a | |||
| 0d925299ff | |||
| a8d5e019a5 | |||
| 72ec086568 | |||
| 7a0b0934df | |||
| d8ca0dca2c | |||
| d90250f80a | |||
| 8d6eaef5de | |||
| 732bf4cbab | |||
| fa6d905a10 | |||
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 | ||
|
|
0317dc58fd | ||
|
|
1c5cb45698 | ||
|
|
090b92ca73 | ||
|
|
2dccbc010c | ||
| e84c69d360 | |||
| c80a39e7ac | |||
| a5e0261ff2 | |||
| ee422ede3c | |||
| d560b2fead | |||
| 3cf2e9c9dc | |||
| 19dbabd71d | |||
| a6a07f2626 |
@@ -5,22 +5,18 @@ go 1.25.0
|
|||||||
replace reanimator/chart => ../internal/chart
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/go-analyze/charts v0.5.26
|
modernc.org/sqlite v1.48.0
|
||||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
golang.org/x/image v0.24.0 // indirect
|
|
||||||
golang.org/x/sys v0.42.0 // indirect
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
modernc.org/libc v1.70.0 // indirect
|
modernc.org/libc v1.72.0 // indirect
|
||||||
modernc.org/mathutil v1.7.1 // indirect
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
modernc.org/memory v1.11.0 // indirect
|
modernc.org/memory v1.11.0 // indirect
|
||||||
modernc.org/sqlite v1.48.0 // indirect
|
|
||||||
)
|
)
|
||||||
|
|||||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
|||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
|
||||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
|
||||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||||
|
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||||
|
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||||
|
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||||
|
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||||
|
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||||
|
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||||
|
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||||
|
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||||
|
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||||
|
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||||
|
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||||
|
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||||
|
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||||
|
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||||
|
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||||
|
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||||
|
|||||||
@@ -22,6 +22,8 @@ var supportBundleServices = []string{
|
|||||||
"bee-selfheal.service",
|
"bee-selfheal.service",
|
||||||
"bee-selfheal.timer",
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
|
"nvidia-dcgm.service",
|
||||||
|
"nvidia-fabricmanager.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
var supportBundleCommands = []struct {
|
var supportBundleCommands = []struct {
|
||||||
@@ -48,6 +50,43 @@ else
|
|||||||
fi
|
fi
|
||||||
`}},
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
nvidia-smi topo -m 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "nvidia-smi not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v systemctl >/dev/null 2>&1; then
|
||||||
|
echo "systemctl not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "=== unit files ==="
|
||||||
|
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== active units ==="
|
||||||
|
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== failed units ==="
|
||||||
|
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||||
|
`}},
|
||||||
|
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||||
|
for candidate in \
|
||||||
|
/usr/bin/nvidia-fabricmanager \
|
||||||
|
/usr/bin/nv-fabricmanager \
|
||||||
|
/usr/bin/nvidia-fabricmanagerd \
|
||||||
|
/usr/bin/nvlsm; do
|
||||||
|
if [ -e "$candidate" ]; then
|
||||||
|
echo "=== $candidate ==="
|
||||||
|
ls -l "$candidate" 2>&1 || true
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||||
|
echo "no fabric manager binaries found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
if ! command -v lspci >/dev/null 2>&1; then
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
echo "lspci not found"
|
echo "lspci not found"
|
||||||
@@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
}{
|
}{
|
||||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
|
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||||
|
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||||
|
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||||
|
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if result.ScalabilityScore > 0 {
|
if result.ScalabilityScore > 0 {
|
||||||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||||
}
|
}
|
||||||
|
if result.PlatformPowerScore > 0 {
|
||||||
|
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||||||
|
}
|
||||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
@@ -81,41 +84,92 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Methodology ───────────────────────────────────────────────────────────
|
// ── Balanced Scorecard ────────────────────────────────────────────────────
|
||||||
b.WriteString("## Methodology\n\n")
|
b.WriteString("## Balanced Scorecard\n\n")
|
||||||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
|
|
||||||
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
|
|
||||||
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
|
|
||||||
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
|
|
||||||
b.WriteString("**Compute score** is derived from two phases:\n\n")
|
|
||||||
b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
|
|
||||||
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
|
|
||||||
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
|
|
||||||
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
|
|
||||||
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
|
|
||||||
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
|
|
||||||
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
|
|
||||||
b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
|
|
||||||
b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
|
|
||||||
b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
|
|
||||||
|
|
||||||
// ── Scorecard table ───────────────────────────────────────────────────────
|
// Perspective 1: Compatibility — hard stops
|
||||||
b.WriteString("## Scorecard\n\n")
|
b.WriteString("### 1. Compatibility\n\n")
|
||||||
b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
|
||||||
b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
|
b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
name := strings.TrimSpace(gpu.Name)
|
thermalThrottle := "-"
|
||||||
if name == "" {
|
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
name = "Unknown GPU"
|
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||||
}
|
}
|
||||||
interconnect := "-"
|
fanAtThrottle := "-"
|
||||||
if gpu.Scores.InterconnectScore > 0 {
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||||
}
|
}
|
||||||
topsPerSM := "-"
|
ecc := "-"
|
||||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
if gpu.ECC.Uncorrected > 0 {
|
||||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||||
}
|
}
|
||||||
|
compatStatus := "✓ OK"
|
||||||
|
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||||
|
compatStatus = "⛔ HARD STOP"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
||||||
|
gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// Perspective 2: Thermal headroom
|
||||||
|
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||||
|
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
|
||||||
|
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
shutdownTemp := gpu.ShutdownTempC
|
||||||
|
if shutdownTemp <= 0 {
|
||||||
|
shutdownTemp = 90
|
||||||
|
}
|
||||||
|
slowdownTemp := gpu.SlowdownTempC
|
||||||
|
if slowdownTemp <= 0 {
|
||||||
|
slowdownTemp = 80
|
||||||
|
}
|
||||||
|
headroom := gpu.Scores.TempHeadroomC
|
||||||
|
thermalStatus := "✓ OK"
|
||||||
|
switch {
|
||||||
|
case headroom < 10:
|
||||||
|
thermalStatus = "⛔ CRITICAL"
|
||||||
|
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||||
|
thermalStatus = "⚠ WARNING"
|
||||||
|
}
|
||||||
|
throttlePct := "-"
|
||||||
|
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
|
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
|
||||||
|
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// Perspective 3: Power delivery
|
||||||
|
b.WriteString("### 3. Power Delivery\n\n")
|
||||||
|
b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
|
||||||
|
b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
powerCap := "-"
|
||||||
|
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||||
|
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||||
|
}
|
||||||
|
fanDuty := "-"
|
||||||
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||||
|
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||||
|
}
|
||||||
|
powerStatus := "✓ OK"
|
||||||
|
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||||
|
powerStatus = "⚠ POWER LIMITED"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
|
||||||
|
gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// Perspective 4: Performance
|
||||||
|
b.WriteString("### 4. Performance\n\n")
|
||||||
|
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
|
||||||
|
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
synthetic := "-"
|
synthetic := "-"
|
||||||
if gpu.Scores.SyntheticScore > 0 {
|
if gpu.Scores.SyntheticScore > 0 {
|
||||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||||
@@ -128,20 +182,41 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if gpu.Scores.MixedEfficiency > 0 {
|
if gpu.Scores.MixedEfficiency > 0 {
|
||||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
|
topsPerSM := "-"
|
||||||
gpu.Index, name,
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||||
gpu.Status,
|
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||||
gpu.Scores.CompositeScore,
|
}
|
||||||
gpu.Scores.ComputeScore,
|
fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
|
||||||
synthetic,
|
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
|
||||||
mixed,
|
}
|
||||||
mixedEff,
|
if len(result.PerformanceRampSteps) > 0 {
|
||||||
topsPerSM,
|
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||||
gpu.Scores.PowerSustainScore,
|
}
|
||||||
gpu.Scores.ThermalSustainScore,
|
b.WriteString("\n")
|
||||||
gpu.Scores.StabilityScore,
|
|
||||||
interconnect,
|
// Perspective 5: Anomaly flags
|
||||||
)
|
b.WriteString("### 5. Anomalies\n\n")
|
||||||
|
b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
|
||||||
|
b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
eccCorr := "-"
|
||||||
|
if gpu.ECC.Corrected > 0 {
|
||||||
|
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||||
|
}
|
||||||
|
syncBoost := "-"
|
||||||
|
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||||
|
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||||
|
}
|
||||||
|
powerVar := "OK"
|
||||||
|
if gpu.Scores.PowerSustainScore < 70 {
|
||||||
|
powerVar = "⚠ unstable"
|
||||||
|
}
|
||||||
|
thermalVar := "OK"
|
||||||
|
if gpu.Scores.ThermalSustainScore < 70 {
|
||||||
|
thermalVar = "⚠ unstable"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
||||||
|
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
@@ -171,13 +246,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||||
}
|
}
|
||||||
if gpu.PowerLimitDerated {
|
if gpu.PowerLimitDerated {
|
||||||
fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
|
fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
|
||||||
}
|
}
|
||||||
if gpu.CalibratedPeakPowerW > 0 {
|
if gpu.CalibratedPeakPowerW > 0 {
|
||||||
if gpu.CalibratedPeakTempC > 0 {
|
if gpu.CalibratedPeakTempC > 0 {
|
||||||
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if gpu.LockedGraphicsClockMHz > 0 {
|
if gpu.LockedGraphicsClockMHz > 0 {
|
||||||
@@ -329,6 +404,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Platform Scalability ──────────────────────────────────────────────────
|
||||||
|
if len(result.PerformanceRampSteps) > 0 {
|
||||||
|
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||||
|
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||||
|
b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
|
||||||
|
b.WriteString("|--------|-------------|----------------------|-------------|\n")
|
||||||
|
for _, step := range result.PerformanceRampSteps {
|
||||||
|
fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
|
||||||
|
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||||
b.WriteString("## Raw Files\n\n")
|
b.WriteString("## Raw Files\n\n")
|
||||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||||
|
|||||||
@@ -65,6 +65,11 @@ type NvidiaBenchmarkResult struct {
|
|||||||
RampTotal int `json:"ramp_total,omitempty"`
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||||
|
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||||
|
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||||
|
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||||
|
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||||
|
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
@@ -107,6 +112,12 @@ type BenchmarkGPUResult struct {
|
|||||||
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||||
|
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||||
|
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||||
|
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||||
|
// Fallback: 80°C.
|
||||||
|
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||||
// dcgmi targeted_power calibration run before the main benchmark.
|
// dcgmi targeted_power calibration run before the main benchmark.
|
||||||
// Used as the reference denominator for PowerSustainScore instead of
|
// Used as the reference denominator for PowerSustainScore instead of
|
||||||
@@ -206,9 +217,30 @@ type BenchmarkScorecard struct {
|
|||||||
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||||
PowerSustainScore float64 `json:"power_sustain_score"`
|
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||||
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||||
StabilityScore float64 `json:"stability_score"`
|
// StabilityScore: fraction of steady-state time the GPU spent throttling
|
||||||
InterconnectScore float64 `json:"interconnect_score"`
|
// (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0.
|
||||||
CompositeScore float64 `json:"composite_score"`
|
StabilityScore float64 `json:"stability_score"`
|
||||||
|
|
||||||
|
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||||
|
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||||
|
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||||
|
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||||
|
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||||
|
|
||||||
|
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||||
|
// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
|
||||||
|
// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
|
||||||
|
TempHeadroomC float64 `json:"temp_headroom_c"`
|
||||||
|
|
||||||
|
InterconnectScore float64 `json:"interconnect_score"`
|
||||||
|
// ServerQualityScore (0–100) reflects server infrastructure quality independent
|
||||||
|
// of GPU model. Combines throttle time, power variance, and temp variance.
|
||||||
|
// Use this to compare servers with the same GPU, or to flag a bad server
|
||||||
|
// that throttles an otherwise fast GPU.
|
||||||
|
ServerQualityScore float64 `json:"server_quality_score"`
|
||||||
|
// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
|
||||||
|
// A throttling GPU will score lower here automatically — no quality multiplier.
|
||||||
|
CompositeScore float64 `json:"composite_score"`
|
||||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||||
}
|
}
|
||||||
@@ -265,8 +297,12 @@ type NvidiaPowerBenchResult struct {
|
|||||||
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||||
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||||
|
// this server under full GPU load. Use for rack power planning.
|
||||||
|
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||||
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchGPU struct {
|
type NvidiaPowerBenchGPU struct {
|
||||||
@@ -274,27 +310,50 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
Name string `json:"name,omitempty"`
|
Name string `json:"name,omitempty"`
|
||||||
BusID string `json:"bus_id,omitempty"`
|
BusID string `json:"bus_id,omitempty"`
|
||||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||||
|
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||||
|
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||||
|
// stably with all other GPUs running simultaneously at their own limits.
|
||||||
|
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||||
|
// additional derating.
|
||||||
|
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||||
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||||
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||||
Derated bool `json:"derated,omitempty"`
|
Derated bool `json:"derated,omitempty"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
OccupiedSlots []int `json:"occupied_slots,omitempty"`
|
|
||||||
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
|
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchStep struct {
|
type NvidiaPowerBenchStep struct {
|
||||||
StepIndex int `json:"step_index"`
|
StepIndex int `json:"step_index"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||||
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
NewGPUIndex int `json:"new_gpu_index"`
|
||||||
MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"`
|
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||||
AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"`
|
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||||
DeratedGPUCount int `json:"derated_gpu_count,omitempty"`
|
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||||
Status string `json:"status"`
|
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Derated bool `json:"derated,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||||
|
// scalability ramp-up phase of the performance benchmark.
|
||||||
|
type NvidiaPerformanceRampStep struct {
|
||||||
|
StepIndex int `json:"step_index"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||||
|
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||||
|
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||||
|
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||||
|
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||||
|
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||||
|
ScalabilityPct float64 `json:"scalability_pct"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
|
|||||||
"bee-audit",
|
"bee-audit",
|
||||||
"bee-web",
|
"bee-web",
|
||||||
"bee-sshsetup",
|
"bee-sshsetup",
|
||||||
|
"nvidia-dcgm",
|
||||||
|
"nvidia-fabricmanager",
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
|
|||||||
@@ -426,6 +426,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
satJob{
|
satJob{
|
||||||
@@ -443,6 +450,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
satJob{
|
satJob{
|
||||||
@@ -460,6 +474,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
satJob{
|
satJob{
|
||||||
@@ -552,9 +573,19 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
|
|||||||
if passes <= 0 {
|
if passes <= 0 {
|
||||||
passes = 1
|
passes = 1
|
||||||
}
|
}
|
||||||
|
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||||||
|
// intentionally conservative enough for healthy systems while avoiding the
|
||||||
|
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||||||
|
timeoutSec := sizeMB*passes*20/100 + 60
|
||||||
|
if timeoutSec < 180 {
|
||||||
|
timeoutSec = 180
|
||||||
|
}
|
||||||
|
if timeoutSec > 900 {
|
||||||
|
timeoutSec = 900
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
}
|
}
|
||||||
|
|
||||||
if rampUp && len(body.GPUIndices) > 1 {
|
if rampUp && len(body.GPUIndices) > 1 {
|
||||||
// Ramp-up mode: resolve GPU list, then create one task per prefix
|
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
|
||||||
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
|
// in Phase 2 (one additional GPU per step). A single task with all
|
||||||
|
// selected GPUs is sufficient — spawning N tasks with growing subsets
|
||||||
|
// would repeat all earlier steps redundantly.
|
||||||
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusBadRequest, err.Error())
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
} else {
|
} else {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||||
var allTasks []*Task
|
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
|
||||||
for step := 1; step <= len(resolved); step++ {
|
t := &Task{
|
||||||
subset := resolved[:step]
|
ID: newJobID("bee-bench-nvidia"),
|
||||||
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
|
Name: taskName,
|
||||||
t := &Task{
|
Target: target,
|
||||||
ID: newJobID("bee-bench-nvidia"),
|
Priority: defaultTaskPriority(target, taskParams{}),
|
||||||
Name: stepName,
|
Status: TaskPending,
|
||||||
Target: target,
|
CreatedAt: now,
|
||||||
Priority: defaultTaskPriority(target, taskParams{}),
|
params: taskParams{
|
||||||
Status: TaskPending,
|
GPUIndices: append([]int(nil), resolved...),
|
||||||
CreatedAt: now,
|
SizeMB: body.SizeMB,
|
||||||
params: taskParams{
|
BenchmarkProfile: body.Profile,
|
||||||
GPUIndices: append([]int(nil), subset...),
|
RunNCCL: runNCCL,
|
||||||
SizeMB: body.SizeMB,
|
ParallelGPUs: true,
|
||||||
BenchmarkProfile: body.Profile,
|
RampTotal: len(resolved),
|
||||||
RunNCCL: runNCCL && step == len(resolved),
|
RampRunID: rampRunID,
|
||||||
ParallelGPUs: true,
|
DisplayName: taskName,
|
||||||
RampStep: step,
|
},
|
||||||
RampTotal: len(resolved),
|
|
||||||
RampRunID: rampRunID,
|
|
||||||
DisplayName: stepName,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
allTasks = append(allTasks, t)
|
|
||||||
}
|
}
|
||||||
for _, t := range allTasks {
|
globalQueue.enqueue(t)
|
||||||
globalQueue.enqueue(t)
|
writeTaskRunResponse(w, []*Task{t})
|
||||||
}
|
|
||||||
writeTaskRunResponse(w, allTasks)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -743,6 +737,9 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
|||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
t.job.abort()
|
t.job.abort()
|
||||||
}
|
}
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
platform.KillTestWorkers()
|
||||||
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
@@ -1529,6 +1526,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
|
|||||||
writeJSON(w, map[string]string{"status": "rolled back"})
|
writeJSON(w, map[string]string{"status": "rolled back"})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) rollbackPendingNetworkChange() error {
|
func (h *handler) rollbackPendingNetworkChange() error {
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
pnc := h.pendingNet
|
pnc := h.pendingNet
|
||||||
|
|||||||
@@ -72,6 +72,13 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
|||||||
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Component chips — one small square per device */
|
||||||
|
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
|
||||||
|
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
|
||||||
|
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||||
|
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
|
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
|
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
/* Output terminal */
|
/* Output terminal */
|
||||||
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||||
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||||
@@ -363,23 +370,25 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
|||||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
||||||
}
|
}
|
||||||
|
|
||||||
cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
|
writeRow("CPU", hwDescribeCPU(hw),
|
||||||
writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
|
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))
|
||||||
|
|
||||||
memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
|
writeRow("Memory", hwDescribeMemory(hw),
|
||||||
writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
|
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))
|
||||||
|
|
||||||
storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
|
writeRow("Storage", hwDescribeStorage(hw),
|
||||||
writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
|
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))
|
||||||
|
|
||||||
gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
|
writeRow("GPU", hwDescribeGPU(hw),
|
||||||
writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
|
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))
|
||||||
|
|
||||||
psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
|
psuMatched := matchedRecords(records, nil, []string{"psu:"})
|
||||||
if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
|
if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
|
||||||
psuRow.Status = hwPSUStatus(hw.PowerSupplies)
|
// No PSU records yet — synthesise a single chip from IPMI status.
|
||||||
|
psuStatus := hwPSUStatus(hw.PowerSupplies)
|
||||||
|
psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
|
||||||
}
|
}
|
||||||
writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
|
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))
|
||||||
|
|
||||||
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||||
writeRow("Network", nicDesc, "")
|
writeRow("Network", nicDesc, "")
|
||||||
@@ -892,6 +901,31 @@ func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// firstNonEmpty returns the first argument that is not the empty string, or ""
// when every argument is empty or no arguments are given. Values are compared
// verbatim: a whitespace-only string counts as non-empty.
func firstNonEmpty(vals ...string) string {
	for _, v := range vals {
		if v != "" {
			return v
		}
	}
	return ""
}
|
||||||
|
|
||||||
|
func matchedRecords(records []app.ComponentStatusRecord, exact []string, prefixes []string) []app.ComponentStatusRecord {
|
||||||
|
var matched []app.ComponentStatusRecord
|
||||||
|
for _, rec := range records {
|
||||||
|
key := strings.TrimSpace(rec.ComponentKey)
|
||||||
|
if key == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if containsExactKey(key, exact) || hasAnyPrefix(key, prefixes) {
|
||||||
|
matched = append(matched, rec)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return matched
|
||||||
|
}
|
||||||
|
|
||||||
func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow {
|
func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow {
|
||||||
matched := make([]app.ComponentStatusRecord, 0)
|
matched := make([]app.ComponentStatusRecord, 0)
|
||||||
for _, rec := range records {
|
for _, rec := range records {
|
||||||
@@ -1034,6 +1068,52 @@ func runtimeIssueDescriptions(issues []schema.RuntimeIssue, codes ...string) str
|
|||||||
return strings.Join(messages, "; ")
|
return strings.Join(messages, "; ")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// chipLetterClass translates a component status string into the one-character
// label shown on a chip and the CSS class that colours it. Matching ignores
// case and surrounding whitespace; any status that is not recognised yields
// the "?" / "chip-unknown" pair.
func chipLetterClass(status string) (letter, cls string) {
	// Start from the fallback and override it for recognised statuses.
	letter, cls = "?", "chip-unknown"

	normalized := strings.ToUpper(strings.TrimSpace(status))
	switch normalized {
	case "OK":
		letter, cls = "O", "chip-ok"
	case "WARN", "WARNING", "PARTIAL":
		letter, cls = "W", "chip-warn"
	case "FAIL", "FAILED", "ERROR", "CRITICAL":
		letter, cls = "F", "chip-fail"
	}
	return letter, cls
}
|
||||||
|
|
||||||
|
// renderComponentChips renders one 20×20 chip per ComponentStatusRecord.
|
||||||
|
// Hover tooltip shows component key, status, error summary and last check time.
|
||||||
|
// Falls back to a single unknown chip when no records are available.
|
||||||
|
func renderComponentChips(matched []app.ComponentStatusRecord) string {
|
||||||
|
if len(matched) == 0 {
|
||||||
|
return `<span class="chips"><span class="chip chip-unknown" title="No data">?</span></span>`
|
||||||
|
}
|
||||||
|
sort.Slice(matched, func(i, j int) bool {
|
||||||
|
return matched[i].ComponentKey < matched[j].ComponentKey
|
||||||
|
})
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<span class="chips">`)
|
||||||
|
for _, rec := range matched {
|
||||||
|
letter, cls := chipLetterClass(rec.Status)
|
||||||
|
var tooltip strings.Builder
|
||||||
|
tooltip.WriteString(rec.ComponentKey)
|
||||||
|
tooltip.WriteString(": ")
|
||||||
|
tooltip.WriteString(firstNonEmpty(rec.Status, "UNKNOWN"))
|
||||||
|
if rec.ErrorSummary != "" {
|
||||||
|
tooltip.WriteString(" — ")
|
||||||
|
tooltip.WriteString(rec.ErrorSummary)
|
||||||
|
}
|
||||||
|
if !rec.LastCheckedAt.IsZero() {
|
||||||
|
fmt.Fprintf(&tooltip, " (checked %s)", rec.LastCheckedAt.Format("15:04:05"))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, `<span class="chip %s" title="%s">%s</span>`,
|
||||||
|
cls, html.EscapeString(tooltip.String()), letter)
|
||||||
|
}
|
||||||
|
b.WriteString(`</span>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
func runtimeStatusBadge(status string) string {
|
func runtimeStatusBadge(status string) string {
|
||||||
status = strings.ToUpper(strings.TrimSpace(status))
|
status = strings.ToUpper(strings.TrimSpace(status))
|
||||||
badge := "badge-unknown"
|
badge := "badge-unknown"
|
||||||
@@ -1339,7 +1419,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.Memory,
|
inv.Memory,
|
||||||
`Runs a RAM validation pass and records memory state around the test.`,
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
`<code>free</code>, <code>memtester</code>`,
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
|
`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
inv.Storage,
|
inv.Storage,
|
||||||
@@ -2002,7 +2082,7 @@ func renderBenchmark(opts HandlerOptions) string {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
` + renderBenchmarkResultsCard(opts.ExportDir) + `
|
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
|
||||||
|
|
||||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||||
@@ -2188,7 +2268,9 @@ function runNvidiaBenchmark(kind) {
|
|||||||
if (e.data) failures += 1;
|
if (e.data) failures += 1;
|
||||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
term.scrollTop = term.scrollHeight;
|
term.scrollTop = term.scrollHeight;
|
||||||
|
const isLast = (idx + 1 >= taskIds.length);
|
||||||
streamNext(idx + 1, failures);
|
streamNext(idx + 1, failures);
|
||||||
|
if (isLast) { benchmarkRefreshResults(); }
|
||||||
});
|
});
|
||||||
benchmarkES.onerror = function() {
|
benchmarkES.onerror = function() {
|
||||||
if (benchmarkES) {
|
if (benchmarkES) {
|
||||||
@@ -2208,18 +2290,30 @@ function runNvidiaBenchmark(kind) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
benchmarkLoadGPUs();
|
benchmarkLoadGPUs();
|
||||||
|
|
||||||
|
// Re-fetches the rendered benchmark results cards and swaps them into the
// #benchmark-results-section container. Failures are ignored on purpose so the
// previously rendered results stay on screen.
function benchmarkRefreshResults() {
  fetch('/api/benchmark/results')
    .then(function(r) {
      // fetch() only rejects on network failure; without this check the body
      // of an HTTP error response would replace the results section.
      if (!r.ok) { throw new Error('HTTP ' + r.status); }
      return r.text();
    })
    .then(function(html) {
      const el = document.getElementById('benchmark-results-section');
      if (el) el.innerHTML = html;
    })
    .catch(function() {});
}
|
||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderBenchmarkResultsCard(exportDir string) string {
|
func renderBenchmarkResultsCard(exportDir string) string {
|
||||||
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||||
return renderBenchmarkResultsCardFromRuns(
|
perf := renderBenchmarkResultsCardFromRuns(
|
||||||
"Perf Results",
|
"Performance Results",
|
||||||
"Composite score by saved benchmark run and GPU.",
|
"Composite score by saved benchmark run and GPU.",
|
||||||
"No saved benchmark runs yet.",
|
"No saved performance benchmark runs yet.",
|
||||||
maxIdx,
|
maxIdx,
|
||||||
runs,
|
runs,
|
||||||
)
|
)
|
||||||
|
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||||
|
return perf + "\n" + power
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
||||||
@@ -2299,6 +2393,126 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
|
|||||||
return maxGPUIndex, runs
|
return maxGPUIndex, runs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||||
|
baseDir := app.DefaultBeeBenchPowerDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
|
||||||
|
type powerRun struct {
|
||||||
|
generatedAt time.Time
|
||||||
|
displayTime string
|
||||||
|
result platform.NvidiaPowerBenchResult
|
||||||
|
}
|
||||||
|
var runs []powerRun
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var r platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &r); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
runs = append(runs, powerRun{
|
||||||
|
generatedAt: r.GeneratedAt,
|
||||||
|
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
result: r,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Show only the most recent run's GPU slot table, plus a run history summary.
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||||
|
|
||||||
|
latest := runs[0].result
|
||||||
|
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||||
|
if latest.Hostname != "" {
|
||||||
|
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||||
|
}
|
||||||
|
if latest.OverallStatus != "" {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if latest.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</p>`)
|
||||||
|
|
||||||
|
if len(latest.GPUs) > 0 {
|
||||||
|
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||||
|
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||||
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
|
for _, gpu := range latest.GPUs {
|
||||||
|
derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
|
||||||
|
rowStyle := ""
|
||||||
|
achievedStyle := ""
|
||||||
|
if derated {
|
||||||
|
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||||
|
achievedStyle = ` style="color:#e6a000;font-weight:600"`
|
||||||
|
}
|
||||||
|
statusLabel := gpu.Status
|
||||||
|
if statusLabel == "" {
|
||||||
|
statusLabel = "OK"
|
||||||
|
}
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if statusLabel != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
nominalStr := "-"
|
||||||
|
if gpu.DefaultPowerLimitW > 0 {
|
||||||
|
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
achievedStr := "-"
|
||||||
|
if gpu.AppliedPowerLimitW > 0 {
|
||||||
|
achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
p95Str := "-"
|
||||||
|
if gpu.MaxObservedPowerW > 0 {
|
||||||
|
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr` + rowStyle + `>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||||
|
b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(runs) > 1 {
|
||||||
|
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||||
|
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||||
|
for i, run := range runs {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if run.result.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr>`)
|
||||||
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div></details>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func renderBurn() string {
|
func renderBurn() string {
|
||||||
|
|||||||
@@ -263,6 +263,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||||
|
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||||
|
|
||||||
// Tasks
|
// Tasks
|
||||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||||
|
|||||||
@@ -162,6 +162,32 @@ type nvidiaRampSpec struct {
|
|||||||
TotalDurationSec int
|
TotalDurationSec int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// resolveMemoryValidatePreset picks the memory validation workload size and
// pass count for a task.
//
// The profile name is lower-cased and trimmed of surrounding whitespace before
// matching. A recognised profile always takes precedence over the stress flag:
//
//	overnight  -> 1024 MB x 2 passes
//	acceptance -> 1024 MB x 1 pass
//	smoke      ->  256 MB x 1 pass
//
// Any other profile (including the empty string) yields 512 MB x 1 pass when
// stress is true and 256 MB x 1 pass otherwise.
func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
	// Start from the fallback used when no known profile is named.
	sizeMB, passes = 256, 1
	if stress {
		sizeMB = 512
	}

	// A known profile overrides the fallback regardless of the stress flag.
	switch strings.TrimSpace(strings.ToLower(profile)) {
	case "overnight":
		sizeMB, passes = 1024, 2
	case "acceptance":
		sizeMB, passes = 1024, 1
	case "smoke":
		sizeMB, passes = 256, 1
	}
	return sizeMB, passes
}
|
||||||
|
|
||||||
|
// taskMayLeaveOrphanWorkers reports whether a task with the given target is
// one of the kinds after which callers should sweep for leftover test worker
// processes (the cancel, kill-workers and queue-reload paths call
// platform.KillTestWorkers when this returns true).
//
// The target is lower-cased and trimmed of surrounding whitespace before
// matching; any target not listed here returns false.
func taskMayLeaveOrphanWorkers(target string) bool {
	switch strings.TrimSpace(strings.ToLower(target)) {
	case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
		"nvidia-bandwidth", "nvidia-stress", "nvidia-compute",
		// Both benchmark task kinds are registered together on the API, so
		// they are listed together here. NOTE(review): assumes the power
		// benchmark spawns GPU workers like the perf benchmark — confirm.
		"nvidia-bench-perf", "nvidia-bench-power",
		"memory", "memory-stress", "cpu", "sat-stress", "platform-stress":
		return true
	default:
		return false
	}
}
|
||||||
|
|
||||||
func resolveBurnPreset(profile string) burnPreset {
|
func resolveBurnPreset(profile string) burnPreset {
|
||||||
switch profile {
|
switch profile {
|
||||||
case "overnight":
|
case "overnight":
|
||||||
@@ -751,10 +777,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
sizeMB, passes := 256, 1
|
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
|
||||||
if t.params.StressMode {
|
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
|
||||||
sizeMB, passes = 1024, 3
|
|
||||||
}
|
|
||||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||||
case "storage":
|
case "storage":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
@@ -1010,6 +1034,9 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
|||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
t.job.abort()
|
t.job.abort()
|
||||||
}
|
}
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
platform.KillTestWorkers()
|
||||||
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
@@ -1037,6 +1064,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
|||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
t.job.abort()
|
t.job.abort()
|
||||||
}
|
}
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
platform.KillTestWorkers()
|
||||||
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
@@ -1141,10 +1171,13 @@ func (q *taskQueue) loadLocked() {
|
|||||||
q.assignTaskLogPathLocked(t)
|
q.assignTaskLogPathLocked(t)
|
||||||
if t.Status == TaskRunning {
|
if t.Status == TaskRunning {
|
||||||
// The task was interrupted by a bee-web restart. Child processes
|
// The task was interrupted by a bee-web restart. Child processes
|
||||||
// (e.g. bee-gpu-burn-worker) survive the restart in their own
|
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
|
||||||
// process groups and cannot be cancelled retroactively. Mark the
|
// their own process groups. Kill any matching stale workers before
|
||||||
// task as failed so the user can decide whether to re-run it
|
// marking the task failed so the next GPU test does not inherit a
|
||||||
// rather than blindly re-launching duplicate workers.
|
// busy DCGM slot or duplicate workers.
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
_ = platform.KillTestWorkers()
|
||||||
|
}
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.Status = TaskFailed
|
t.Status = TaskFailed
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
|||||||
@@ -672,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
|
||||||
|
var gotSizeMB, gotPasses int
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "mem-validate-1",
|
||||||
|
Name: "Memory SAT",
|
||||||
|
Target: "memory",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{StressMode: true},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runMemoryAcceptancePackCtx
|
||||||
|
runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
|
||||||
|
gotSizeMB = sizeMB
|
||||||
|
gotPasses = passes
|
||||||
|
return "/tmp/memory-validate.tar.gz", nil
|
||||||
|
}
|
||||||
|
defer func() { runMemoryAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotSizeMB != 512 || gotPasses != 1 {
|
||||||
|
t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
q := &taskQueue{
|
q := &taskQueue{
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
DEBIAN_VERSION=12
|
DEBIAN_VERSION=12
|
||||||
DEBIAN_KERNEL_ABI=auto
|
DEBIAN_KERNEL_ABI=auto
|
||||||
NVIDIA_DRIVER_VERSION=590.48.01
|
NVIDIA_DRIVER_VERSION=590.48.01
|
||||||
|
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||||
NCCL_VERSION=2.28.9-1
|
NCCL_VERSION=2.28.9-1
|
||||||
NCCL_CUDA_VERSION=13.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
|||||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
AUDIT_VERSION=1.0.0
|
AUDIT_VERSION=1.0.0
|
||||||
|
MEMTEST_VERSION=6.10-4
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ lb config noauto \
|
|||||||
--bootloaders "grub-efi,syslinux" \
|
--bootloaders "grub-efi,syslinux" \
|
||||||
--debian-installer none \
|
--debian-installer none \
|
||||||
--archive-areas "main contrib non-free non-free-firmware" \
|
--archive-areas "main contrib non-free non-free-firmware" \
|
||||||
--mirror-bootstrap "https://deb.debian.org/debian" \
|
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
|
||||||
--mirror-chroot "https://deb.debian.org/debian" \
|
--mirror-chroot "http://mirror.mephi.ru/debian/" \
|
||||||
--mirror-binary "https://deb.debian.org/debian" \
|
--mirror-binary "http://mirror.mephi.ru/debian/" \
|
||||||
--security true \
|
--security true \
|
||||||
--linux-flavours "amd64" \
|
--linux-flavours "amd64" \
|
||||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||||
@@ -33,6 +33,7 @@ lb config noauto \
|
|||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
|
--debootstrap-options "--include=ca-certificates" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -35,6 +35,8 @@ typedef void *CUstream;
|
|||||||
#define MAX_STRESS_STREAMS 16
|
#define MAX_STRESS_STREAMS 16
|
||||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
|
#define MAX_SINGLE_PRECISION_STREAMS 4
|
||||||
|
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
|
|||||||
return stream_count;
|
return stream_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
|
||||||
|
if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
|
||||||
|
return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
|
||||||
|
}
|
||||||
|
return profile_budget_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||||
if (!api->cuStreamDestroy) {
|
if (!api->cuStreamDestroy) {
|
||||||
return;
|
return;
|
||||||
@@ -908,11 +917,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
CUstream stream,
|
CUstream stream,
|
||||||
size_t profile_budget_bytes,
|
size_t profile_budget_bytes,
|
||||||
struct prepared_profile *out) {
|
struct prepared_profile *out) {
|
||||||
memset(out, 0, sizeof(*out));
|
|
||||||
out->desc = *desc;
|
|
||||||
out->stream = stream;
|
|
||||||
|
|
||||||
size_t bytes_per_cell = 0;
|
size_t bytes_per_cell = 0;
|
||||||
|
size_t attempt_budget = profile_budget_bytes;
|
||||||
|
|
||||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
||||||
@@ -921,106 +928,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
|
while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
|
||||||
out->m = dim;
|
memset(out, 0, sizeof(*out));
|
||||||
out->n = dim;
|
out->desc = *desc;
|
||||||
out->k = dim;
|
out->stream = stream;
|
||||||
|
|
||||||
size_t desired_workspace = profile_budget_bytes / 8u;
|
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
|
||||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
out->m = dim;
|
||||||
desired_workspace = 32u * 1024u * 1024u;
|
out->n = dim;
|
||||||
}
|
out->k = dim;
|
||||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
|
||||||
|
|
||||||
size_t a_bytes = 0;
|
size_t desired_workspace = attempt_budget / 8u;
|
||||||
size_t b_bytes = 0;
|
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||||
size_t c_bytes = 0;
|
desired_workspace = 32u * 1024u * 1024u;
|
||||||
size_t d_bytes = 0;
|
}
|
||||||
size_t scale_bytes = 0;
|
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||||
while (1) {
|
|
||||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
|
||||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
|
||||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
|
||||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
|
||||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
|
||||||
|
|
||||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
size_t a_bytes = 0;
|
||||||
if (matrix_bytes <= profile_budget_bytes) {
|
size_t b_bytes = 0;
|
||||||
size_t remaining = profile_budget_bytes - matrix_bytes;
|
size_t c_bytes = 0;
|
||||||
out->workspace_size = desired_workspace;
|
size_t d_bytes = 0;
|
||||||
if (out->workspace_size > remaining) {
|
size_t scale_bytes = 0;
|
||||||
out->workspace_size = round_down_size(remaining, 256u);
|
while (1) {
|
||||||
|
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||||
|
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||||
|
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||||
|
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||||
|
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||||
|
|
||||||
|
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||||
|
if (matrix_bytes <= attempt_budget) {
|
||||||
|
size_t remaining = attempt_budget - matrix_bytes;
|
||||||
|
out->workspace_size = desired_workspace;
|
||||||
|
if (out->workspace_size > remaining) {
|
||||||
|
out->workspace_size = round_down_size(remaining, 256u);
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
|
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
out->m -= (uint64_t)desc->min_multiple;
|
||||||
|
out->n = out->m;
|
||||||
|
out->k = out->m;
|
||||||
|
}
|
||||||
|
if (out->m < (uint64_t)desc->min_multiple) {
|
||||||
|
attempt_budget /= 2u;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||||
return 0;
|
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||||
}
|
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||||
out->m -= (uint64_t)desc->min_multiple;
|
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||||
out->n = out->m;
|
|
||||||
out->k = out->m;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaDataType_t scale_type = matmul_scale_type(desc);
|
|
||||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
|
||||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
cublasOperation_t transa = CUBLAS_OP_T;
|
|
||||||
cublasOperation_t transb = CUBLAS_OP_N;
|
|
||||||
if (!check_cublas("set TRANSA",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
|
||||||
&transa,
|
|
||||||
sizeof(transa))) ||
|
|
||||||
!check_cublas("set TRANSB",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
|
||||||
&transb,
|
|
||||||
sizeof(transb)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (desc->needs_scalar_scale) {
|
|
||||||
float one = 1.0f;
|
|
||||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
|
||||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||||
|
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||||
|
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
|
||||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
cublasOperation_t transa = CUBLAS_OP_T;
|
||||||
if (!check_cublas("set A scale ptr",
|
cublasOperation_t transb = CUBLAS_OP_N;
|
||||||
|
if (!check_cublas("set TRANSA",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||||
&a_scale_ptr,
|
&transa,
|
||||||
sizeof(a_scale_ptr))) ||
|
sizeof(transa))) ||
|
||||||
!check_cublas("set B scale ptr",
|
!check_cublas("set TRANSB",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||||
&b_scale_ptr,
|
&transb,
|
||||||
sizeof(b_scale_ptr)))) {
|
sizeof(transb)))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
if (desc->needs_scalar_scale) {
|
||||||
|
float one = 1.0f;
|
||||||
|
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||||
|
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||||
|
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||||
|
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||||
|
if (!check_cublas("set A scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||||
|
&a_scale_ptr,
|
||||||
|
sizeof(a_scale_ptr))) ||
|
||||||
|
!check_cublas("set B scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||||
|
&b_scale_ptr,
|
||||||
|
sizeof(b_scale_ptr)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
||||||
if (desc->needs_block_scale) {
|
if (desc->needs_block_scale) {
|
||||||
@@ -1060,62 +1076,65 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!check_cublas("create A layout",
|
if (!check_cublas("create A layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||||
!check_cublas("create B layout",
|
!check_cublas("create B layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||||
!check_cublas("create C layout",
|
!check_cublas("create C layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||||
!check_cublas("create D layout",
|
!check_cublas("create D layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (out->workspace_size > 0) {
|
|
||||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (out->workspace_size > 0) {
|
||||||
|
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("set workspace",
|
||||||
|
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||||
|
out->preference,
|
||||||
|
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||||
|
&out->workspace_size,
|
||||||
|
sizeof(out->workspace_size)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int found = 0;
|
||||||
|
if (check_cublas("heuristic",
|
||||||
|
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||||
|
out->op_desc,
|
||||||
|
out->a_layout,
|
||||||
|
out->b_layout,
|
||||||
|
out->c_layout,
|
||||||
|
out->d_layout,
|
||||||
|
out->preference,
|
||||||
|
1,
|
||||||
|
&out->heuristic,
|
||||||
|
&found)) &&
|
||||||
|
found > 0) {
|
||||||
|
out->ready = 1;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
|
||||||
|
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!check_cublas("set workspace",
|
return 0;
|
||||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
|
||||||
out->preference,
|
|
||||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
|
||||||
&out->workspace_size,
|
|
||||||
sizeof(out->workspace_size)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int found = 0;
|
|
||||||
if (!check_cublas("heuristic",
|
|
||||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
|
||||||
out->op_desc,
|
|
||||||
out->a_layout,
|
|
||||||
out->b_layout,
|
|
||||||
out->c_layout,
|
|
||||||
out->d_layout,
|
|
||||||
out->preference,
|
|
||||||
1,
|
|
||||||
&out->heuristic,
|
|
||||||
&found))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (found <= 0) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
out->ready = 1;
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||||
@@ -1180,6 +1199,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
size_t requested_budget = 0;
|
size_t requested_budget = 0;
|
||||||
size_t total_budget = 0;
|
size_t total_budget = 0;
|
||||||
size_t per_profile_budget = 0;
|
size_t per_profile_budget = 0;
|
||||||
|
int budget_profiles = 0;
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||||
@@ -1215,8 +1235,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Count all profiles active on this GPU regardless of filter.
|
/* Count all profiles active on this GPU regardless of filter.
|
||||||
* Used as the budget divisor so matrix sizes stay consistent whether
|
* Mixed phases still divide budget across the full precision set, while
|
||||||
* running all precisions together or a single-precision phase. */
|
* single-precision benchmark phases dedicate budget only to active
|
||||||
|
* profiles matching precision_filter. */
|
||||||
int planned_total = 0;
|
int planned_total = 0;
|
||||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
||||||
@@ -1226,19 +1247,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
if (planned_total < planned) {
|
if (planned_total < planned) {
|
||||||
planned_total = planned;
|
planned_total = planned;
|
||||||
}
|
}
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
budget_profiles = planned;
|
||||||
|
}
|
||||||
|
if (budget_profiles <= 0) {
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
}
|
||||||
|
|
||||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||||
if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||||
cuda->cuStreamCreate &&
|
cuda->cuStreamCreate &&
|
||||||
cuda->cuStreamDestroy) {
|
cuda->cuStreamDestroy) {
|
||||||
stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
|
stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
|
||||||
|
}
|
||||||
|
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
|
||||||
|
stream_count = MAX_SINGLE_PRECISION_STREAMS;
|
||||||
}
|
}
|
||||||
if (stream_count > 1) {
|
if (stream_count > 1) {
|
||||||
int created = 0;
|
int created = 0;
|
||||||
@@ -1251,18 +1282,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
report->stream_count = stream_count;
|
report->stream_count = stream_count;
|
||||||
per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
|
per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
|
||||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
|
||||||
|
}
|
||||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
mp_count,
|
mp_count,
|
||||||
|
budget_profiles,
|
||||||
per_profile_budget / (1024u * 1024u));
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
for (int i = 0; i < profile_count; i++) {
|
for (int i = 0; i < profile_count; i++) {
|
||||||
|
|||||||
@@ -161,6 +161,7 @@ run_variant() {
|
|||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-e BEE_REQUIRE_MEMTEST=1 \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
"${IMAGE_REF}" \
|
||||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||||
@@ -175,6 +176,7 @@ run_variant() {
|
|||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-e BEE_REQUIRE_MEMTEST=1 \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
"${IMAGE_REF}" \
|
||||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
|||||||
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
|
export MEMTEST_VERSION
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
@@ -775,6 +776,7 @@ run_optional_step_sh() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
mkdir -p "${LOG_DIR}" 2>/dev/null || true
|
||||||
step_log="${LOG_DIR}/${step_slug}.log"
|
step_log="${LOG_DIR}/${step_slug}.log"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== optional step: ${step_name} ==="
|
echo "=== optional step: ${step_name} ==="
|
||||||
@@ -798,13 +800,14 @@ start_build_log
|
|||||||
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||||
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
||||||
echo "=== refreshing apt index to detect current kernel ABI ==="
|
echo "=== refreshing apt index to detect current kernel ABI ==="
|
||||||
apt-get update -qq
|
apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
|
||||||
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
||||||
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
||||||
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
||||||
| head -1)
|
| head -1)
|
||||||
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
||||||
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
||||||
|
echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
||||||
@@ -1259,6 +1262,7 @@ fi
|
|||||||
# --- substitute version placeholders in package list and archive ---
|
# --- substitute version placeholders in package list and archive ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
sed -i \
|
sed -i \
|
||||||
|
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
@@ -1301,7 +1305,7 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
|||||||
export BEE_GPU_VENDOR_UPPER
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
|||||||
# Enable GPU-vendor specific services
|
# Enable GPU-vendor specific services
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||||
systemctl enable bee-nvidia.service
|
systemctl enable bee-nvidia.service
|
||||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ set -e
|
|||||||
|
|
||||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
|
# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
|
||||||
|
# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
|
||||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||||
BINARY_BOOT_DIR="binary/boot"
|
BINARY_BOOT_DIR="binary/boot"
|
||||||
GRUB_CFG="binary/boot/grub/grub.cfg"
|
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||||
@@ -24,15 +26,23 @@ fail_or_warn() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
|
||||||
|
# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
|
||||||
|
# The template already has memtest entries hardcoded, so a missing config file
|
||||||
|
# here is not an error; validate_iso_memtest() checks the final ISO instead.
|
||||||
|
warn_only() {
|
||||||
|
log "WARNING: $1"
|
||||||
|
}
|
||||||
|
|
||||||
copy_memtest_file() {
|
copy_memtest_file() {
|
||||||
src="$1"
|
src="$1"
|
||||||
base="$(basename "$src")"
|
dst_name="${2:-$(basename "$src")}"
|
||||||
dst="${BINARY_BOOT_DIR}/${base}"
|
dst="${BINARY_BOOT_DIR}/${dst_name}"
|
||||||
|
|
||||||
[ -f "$src" ] || return 1
|
[ -f "$src" ] || return 1
|
||||||
mkdir -p "${BINARY_BOOT_DIR}"
|
mkdir -p "${BINARY_BOOT_DIR}"
|
||||||
cp "$src" "$dst"
|
cp "$src" "$dst"
|
||||||
log "copied ${base} from ${src}"
|
log "copied ${dst_name} from ${src}"
|
||||||
}
|
}
|
||||||
|
|
||||||
extract_memtest_from_deb() {
|
extract_memtest_from_deb() {
|
||||||
@@ -41,14 +51,44 @@ extract_memtest_from_deb() {
|
|||||||
|
|
||||||
log "extracting memtest payload from ${deb}"
|
log "extracting memtest payload from ${deb}"
|
||||||
dpkg-deb -x "$deb" "$tmpdir"
|
dpkg-deb -x "$deb" "$tmpdir"
|
||||||
for f in ${MEMTEST_FILES}; do
|
|
||||||
if [ -f "${tmpdir}/boot/${f}" ]; then
|
# EFI binary: both 5.x and 6.x use memtest86+x64.efi
|
||||||
copy_memtest_file "${tmpdir}/boot/${f}"
|
if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
|
||||||
fi
|
copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
|
||||||
done
|
fi
|
||||||
|
|
||||||
|
# BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
|
||||||
|
if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
|
||||||
|
copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
|
||||||
|
elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
|
||||||
|
copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
|
||||||
|
fi
|
||||||
|
|
||||||
rm -rf "$tmpdir"
|
rm -rf "$tmpdir"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
download_and_extract_memtest() {
|
||||||
|
tmpdl="$(mktemp -d)"
|
||||||
|
if [ -n "${MEMTEST_VERSION:-}" ]; then
|
||||||
|
pkg_spec="memtest86+=${MEMTEST_VERSION}"
|
||||||
|
else
|
||||||
|
pkg_spec="memtest86+"
|
||||||
|
fi
|
||||||
|
log "downloading ${pkg_spec} from apt"
|
||||||
|
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
|
||||||
|
log "apt download failed, retrying after apt-get update"
|
||||||
|
apt-get update -qq >/dev/null 2>&1 || true
|
||||||
|
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
|
||||||
|
fi
|
||||||
|
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||||
|
if [ -n "$deb" ]; then
|
||||||
|
extract_memtest_from_deb "$deb"
|
||||||
|
else
|
||||||
|
log "apt download of memtest86+ failed"
|
||||||
|
fi
|
||||||
|
rm -rf "$tmpdl"
|
||||||
|
}
|
||||||
|
|
||||||
ensure_memtest_binaries() {
|
ensure_memtest_binaries() {
|
||||||
missing=0
|
missing=0
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
|
|||||||
done
|
done
|
||||||
[ "$missing" -eq 1 ] || return 0
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 1. Try files already placed by lb binary_memtest or chroot
|
||||||
for root in chroot/boot /boot; do
|
for root in chroot/boot /boot; do
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
||||||
done
|
done
|
||||||
|
# 6.x BIOS binary may lack x64 in name — copy with normalised name
|
||||||
|
if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
|
||||||
|
copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
missing=0
|
missing=0
|
||||||
@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
|
|||||||
done
|
done
|
||||||
[ "$missing" -eq 1 ] || return 0
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 2. Try apt package cache (may be empty if lb binary_memtest already purged)
|
||||||
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
||||||
[ -d "$root" ] || continue
|
[ -d "$root" ] || continue
|
||||||
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
||||||
@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
|
|||||||
break
|
break
|
||||||
done
|
done
|
||||||
|
|
||||||
|
missing=0
|
||||||
|
for f in ${MEMTEST_FILES}; do
|
||||||
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||||
|
done
|
||||||
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
|
||||||
|
download_and_extract_memtest
|
||||||
|
|
||||||
missing=0
|
missing=0
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
||||||
@@ -88,7 +143,7 @@ ensure_memtest_binaries() {
|
|||||||
|
|
||||||
ensure_grub_entry() {
|
ensure_grub_entry() {
|
||||||
[ -f "$GRUB_CFG" ] || {
|
[ -f "$GRUB_CFG" ] || {
|
||||||
fail_or_warn "missing ${GRUB_CFG}"
|
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -114,7 +169,7 @@ EOF
|
|||||||
|
|
||||||
ensure_isolinux_entry() {
|
ensure_isolinux_entry() {
|
||||||
[ -f "$ISOLINUX_CFG" ] || {
|
[ -f "$ISOLINUX_CFG" ] || {
|
||||||
fail_or_warn "missing ${ISOLINUX_CFG}"
|
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||||
# explicitly.
|
# explicitly.
|
||||||
|
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
|||||||
@@ -258,6 +258,22 @@ else
|
|||||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
|
||||||
|
# systems CUDA/DCGM can report "system not yet initialized" until fabric
|
||||||
|
# training completes under nvidia-fabricmanager.
|
||||||
|
if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
|
||||||
|
if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager restarted"
|
||||||
|
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager started"
|
||||||
|
else
|
||||||
|
log "WARN: failed to start nvidia-fabricmanager.service"
|
||||||
|
systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: nvidia-fabricmanager.service not installed"
|
||||||
|
fi
|
||||||
|
|
||||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||||
|
|||||||
@@ -9,9 +9,9 @@ xset s noblank
|
|||||||
|
|
||||||
# Set desktop background.
|
# Set desktop background.
|
||||||
if [ -f /usr/share/bee/wallpaper.png ]; then
|
if [ -f /usr/share/bee/wallpaper.png ]; then
|
||||||
feh --bg-fill /usr/share/bee/wallpaper.png
|
feh --bg-center --image-bg '#000000' /usr/share/bee/wallpaper.png
|
||||||
else
|
else
|
||||||
xsetroot -solid '#f6c90e'
|
xsetroot -solid '#000000'
|
||||||
fi
|
fi
|
||||||
|
|
||||||
tint2 &
|
tint2 &
|
||||||
|
|||||||
BIN
iso/overlay/usr/share/bee/wallpaper.png
Normal file
BIN
iso/overlay/usr/share/bee/wallpaper.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
Reference in New Issue
Block a user