Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 |
@@ -5,22 +5,18 @@ go 1.25.0
|
|||||||
replace reanimator/chart => ../internal/chart
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/go-analyze/charts v0.5.26
|
modernc.org/sqlite v1.48.0
|
||||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
golang.org/x/image v0.24.0 // indirect
|
|
||||||
golang.org/x/sys v0.42.0 // indirect
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
modernc.org/libc v1.70.0 // indirect
|
modernc.org/libc v1.72.0 // indirect
|
||||||
modernc.org/mathutil v1.7.1 // indirect
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
modernc.org/memory v1.11.0 // indirect
|
modernc.org/memory v1.11.0 // indirect
|
||||||
modernc.org/sqlite v1.48.0 // indirect
|
|
||||||
)
|
)
|
||||||
|
|||||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
|||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
|
||||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
|
||||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||||
|
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||||
|
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||||
|
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||||
|
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||||
|
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||||
|
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||||
|
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||||
|
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||||
|
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||||
|
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||||
|
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||||
|
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||||
|
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||||
|
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||||
|
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||||
|
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||||
|
|||||||
@@ -22,6 +22,8 @@ var supportBundleServices = []string{
|
|||||||
"bee-selfheal.service",
|
"bee-selfheal.service",
|
||||||
"bee-selfheal.timer",
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
|
"nvidia-dcgm.service",
|
||||||
|
"nvidia-fabricmanager.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
var supportBundleCommands = []struct {
|
var supportBundleCommands = []struct {
|
||||||
@@ -48,6 +50,43 @@ else
|
|||||||
fi
|
fi
|
||||||
`}},
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
nvidia-smi topo -m 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "nvidia-smi not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v systemctl >/dev/null 2>&1; then
|
||||||
|
echo "systemctl not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "=== unit files ==="
|
||||||
|
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== active units ==="
|
||||||
|
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== failed units ==="
|
||||||
|
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||||
|
`}},
|
||||||
|
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||||
|
for candidate in \
|
||||||
|
/usr/bin/nvidia-fabricmanager \
|
||||||
|
/usr/bin/nv-fabricmanager \
|
||||||
|
/usr/bin/nvidia-fabricmanagerd \
|
||||||
|
/usr/bin/nvlsm; do
|
||||||
|
if [ -e "$candidate" ]; then
|
||||||
|
echo "=== $candidate ==="
|
||||||
|
ls -l "$candidate" 2>&1 || true
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||||
|
echo "no fabric manager binaries found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
if ! command -v lspci >/dev/null 2>&1; then
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
echo "lspci not found"
|
echo "lspci not found"
|
||||||
@@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
}{
|
}{
|
||||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
|
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||||
|
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||||
|
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||||
|
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||||
|
|||||||
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
|
|||||||
"bee-audit",
|
"bee-audit",
|
||||||
"bee-web",
|
"bee-web",
|
||||||
"bee-sshsetup",
|
"bee-sshsetup",
|
||||||
|
"nvidia-dcgm",
|
||||||
|
"nvidia-fabricmanager",
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
DEBIAN_VERSION=12
|
DEBIAN_VERSION=12
|
||||||
DEBIAN_KERNEL_ABI=auto
|
DEBIAN_KERNEL_ABI=auto
|
||||||
NVIDIA_DRIVER_VERSION=590.48.01
|
NVIDIA_DRIVER_VERSION=590.48.01
|
||||||
|
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||||
NCCL_VERSION=2.28.9-1
|
NCCL_VERSION=2.28.9-1
|
||||||
NCCL_CUDA_VERSION=13.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ lb config noauto \
|
|||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
|
--debootstrap-options "--include=ca-certificates" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -1262,6 +1262,7 @@ fi
|
|||||||
# --- substitute version placeholders in package list and archive ---
|
# --- substitute version placeholders in package list and archive ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
sed -i \
|
sed -i \
|
||||||
|
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
@@ -1304,7 +1305,7 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
|||||||
export BEE_GPU_VENDOR_UPPER
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
|||||||
# Enable GPU-vendor specific services
|
# Enable GPU-vendor specific services
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||||
systemctl enable bee-nvidia.service
|
systemctl enable bee-nvidia.service
|
||||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||||
# explicitly.
|
# explicitly.
|
||||||
|
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
|||||||
@@ -258,6 +258,22 @@ else
|
|||||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
|
||||||
|
# systems CUDA/DCGM can report "system not yet initialized" until fabric
|
||||||
|
# training completes under nvidia-fabricmanager.
|
||||||
|
if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
|
||||||
|
if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager restarted"
|
||||||
|
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager started"
|
||||||
|
else
|
||||||
|
log "WARN: failed to start nvidia-fabricmanager.service"
|
||||||
|
systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: nvidia-fabricmanager.service not installed"
|
||||||
|
fi
|
||||||
|
|
||||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||||
|
|||||||
Reference in New Issue
Block a user