Compare commits
53 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 | ||
|
|
0317dc58fd | ||
|
|
1c5cb45698 | ||
|
|
090b92ca73 | ||
|
|
2dccbc010c | ||
| e84c69d360 | |||
| c80a39e7ac | |||
| a5e0261ff2 | |||
| ee422ede3c | |||
| d560b2fead | |||
| 3cf2e9c9dc | |||
| 19dbabd71d | |||
| a6a07f2626 | |||
| f87461ee4a | |||
| a636146dbd | |||
|
|
303de2df04 | ||
|
|
95124d228f | ||
|
|
54338dbae5 | ||
|
|
2be7ae6d28 | ||
|
|
b1a5035edd | ||
|
|
8fc986c933 | ||
|
|
88b5e0edf2 | ||
|
|
82fe1f6d26 | ||
| 81e7c921f8 | |||
| 0fb8f2777f | |||
| bf182daa89 | |||
| 457ea1cf04 | |||
| bf6ecab4f0 | |||
| 02e44b1172 | |||
| 2ceaa0d0ca | |||
| 9482ba20a2 | |||
| 813e2f86a9 | |||
| 58a6da9b44 | |||
| f4a19c0a00 | |||
| 9e3dcf9b4d | |||
| 098e19f760 | |||
| e16d0f34b5 | |||
|
|
525ed8b8fc | ||
|
|
4f94ebcb2c | ||
|
|
05c1fde233 | ||
| 825ef6b98a | |||
| ba16021cdb | |||
|
|
bb1218ddd4 | ||
|
|
65faae8ede | ||
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@
|
|||||||
.DS_Store
|
.DS_Store
|
||||||
dist/
|
dist/
|
||||||
iso/out/
|
iso/out/
|
||||||
|
build-cache/
|
||||||
|
|||||||
@@ -5,22 +5,18 @@ go 1.25.0
|
|||||||
replace reanimator/chart => ../internal/chart
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/go-analyze/charts v0.5.26
|
modernc.org/sqlite v1.48.0
|
||||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
golang.org/x/image v0.24.0 // indirect
|
|
||||||
golang.org/x/sys v0.42.0 // indirect
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
modernc.org/libc v1.70.0 // indirect
|
modernc.org/libc v1.72.0 // indirect
|
||||||
modernc.org/mathutil v1.7.1 // indirect
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
modernc.org/memory v1.11.0 // indirect
|
modernc.org/memory v1.11.0 // indirect
|
||||||
modernc.org/sqlite v1.48.0 // indirect
|
|
||||||
)
|
)
|
||||||
|
|||||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
|||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
|
||||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
|
||||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||||
|
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||||
|
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||||
|
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||||
|
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||||
|
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||||
|
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||||
|
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||||
|
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||||
|
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||||
|
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||||
|
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||||
|
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||||
|
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||||
|
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||||
|
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||||
|
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||||
|
|||||||
@@ -30,7 +30,9 @@ var (
|
|||||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||||
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
|
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
||||||
|
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
||||||
|
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
||||||
)
|
)
|
||||||
|
|
||||||
type App struct {
|
type App struct {
|
||||||
@@ -84,6 +86,7 @@ type installer interface {
|
|||||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||||
IsLiveMediaInRAM() bool
|
IsLiveMediaInRAM() bool
|
||||||
LiveBootSource() platform.LiveBootSource
|
LiveBootSource() platform.LiveBootSource
|
||||||
|
LiveMediaRAMState() platform.LiveMediaRAMState
|
||||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -108,6 +111,10 @@ func (a *App) LiveBootSource() platform.LiveBootSource {
|
|||||||
return a.installer.LiveBootSource()
|
return a.installer.LiveBootSource()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
|
||||||
|
return a.installer.LiveMediaRAMState()
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||||
}
|
}
|
||||||
@@ -117,7 +124,8 @@ type satRunner interface {
|
|||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
@@ -190,6 +198,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
}
|
}
|
||||||
result := collector.Run(runtimeMode)
|
result := collector.Run(runtimeMode)
|
||||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||||
|
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||||
result.Runtime = &health
|
result.Runtime = &health
|
||||||
}
|
}
|
||||||
@@ -561,16 +570,23 @@ func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOp
|
|||||||
|
|
||||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultBenchmarkBaseDir
|
baseDir = DefaultBeeBenchPerfDir
|
||||||
}
|
}
|
||||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPowerDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -926,6 +942,41 @@ func bodyOr(body, fallback string) string {
|
|||||||
return body
|
return body
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||||
|
// component-status DB so they are visible in the Hardware Summary card.
|
||||||
|
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||||
|
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||||
|
if db == nil || len(psus) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
const source = "audit:ipmi"
|
||||||
|
worstStatus := "OK"
|
||||||
|
for _, psu := range psus {
|
||||||
|
if psu.Status == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
slot := "?"
|
||||||
|
if psu.Slot != nil {
|
||||||
|
slot = *psu.Slot
|
||||||
|
}
|
||||||
|
st := *psu.Status
|
||||||
|
detail := ""
|
||||||
|
if psu.ErrorDescription != nil {
|
||||||
|
detail = *psu.ErrorDescription
|
||||||
|
}
|
||||||
|
db.Record("psu:"+slot, source, st, detail)
|
||||||
|
switch st {
|
||||||
|
case "Critical":
|
||||||
|
worstStatus = "Critical"
|
||||||
|
case "Warning":
|
||||||
|
if worstStatus != "Critical" {
|
||||||
|
worstStatus = "Warning"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
db.Record("psu:all", source, worstStatus, "")
|
||||||
|
}
|
||||||
|
|
||||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||||
raw, err := os.ReadFile(path)
|
raw, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -122,6 +122,7 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
|||||||
type fakeSAT struct {
|
type fakeSAT struct {
|
||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
|
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
@@ -154,6 +155,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPowerBenchFn != nil {
|
||||||
|
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaTargetedStressFn != nil {
|
if f.runNvidiaTargetedStressFn != nil {
|
||||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||||
@@ -161,7 +169,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaComputeFn != nil {
|
if f.runNvidiaComputeFn != nil {
|
||||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||||
}
|
}
|
||||||
@@ -542,8 +550,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -580,8 +586,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -643,8 +647,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
oldSATBaseDir := DefaultSATBaseDir
|
oldSATBaseDir := DefaultSATBaseDir
|
||||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||||
|
|||||||
@@ -22,6 +22,8 @@ var supportBundleServices = []string{
|
|||||||
"bee-selfheal.service",
|
"bee-selfheal.service",
|
||||||
"bee-selfheal.timer",
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
|
"nvidia-dcgm.service",
|
||||||
|
"nvidia-fabricmanager.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
var supportBundleCommands = []struct {
|
var supportBundleCommands = []struct {
|
||||||
@@ -48,13 +50,50 @@ else
|
|||||||
fi
|
fi
|
||||||
`}},
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
nvidia-smi topo -m 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "nvidia-smi not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v systemctl >/dev/null 2>&1; then
|
||||||
|
echo "systemctl not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "=== unit files ==="
|
||||||
|
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== active units ==="
|
||||||
|
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== failed units ==="
|
||||||
|
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||||
|
`}},
|
||||||
|
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||||
|
for candidate in \
|
||||||
|
/usr/bin/nvidia-fabricmanager \
|
||||||
|
/usr/bin/nv-fabricmanager \
|
||||||
|
/usr/bin/nvidia-fabricmanagerd \
|
||||||
|
/usr/bin/nvlsm; do
|
||||||
|
if [ -e "$candidate" ]; then
|
||||||
|
echo "=== $candidate ==="
|
||||||
|
ls -l "$candidate" 2>&1 || true
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||||
|
echo "no fabric manager binaries found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
if ! command -v lspci >/dev/null 2>&1; then
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
echo "lspci not found"
|
echo "lspci not found"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
found=0
|
found=0
|
||||||
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
|
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||||
found=1
|
found=1
|
||||||
echo "=== GPU $gpu ==="
|
echo "=== GPU $gpu ==="
|
||||||
lspci -s "$gpu" -vv 2>&1 || true
|
lspci -s "$gpu" -vv 2>&1 || true
|
||||||
@@ -73,8 +112,13 @@ fi
|
|||||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||||
for d in /sys/bus/pci/devices/*/; do
|
for d in /sys/bus/pci/devices/*/; do
|
||||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||||
[ "$vendor" = "0x10de" ] || continue
|
[ "$vendor" = "0x10de" ] || continue
|
||||||
dev=$(basename "$d")
|
class=$(cat "$d/class" 2>/dev/null)
|
||||||
|
case "$class" in
|
||||||
|
0x030000|0x030200) ;;
|
||||||
|
*) continue ;;
|
||||||
|
esac
|
||||||
|
dev=$(basename "$d")
|
||||||
echo "=== $dev ==="
|
echo "=== $dev ==="
|
||||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||||
@@ -190,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
}{
|
}{
|
||||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
|
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||||
|
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||||
|
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||||
|
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||||
@@ -208,7 +256,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
|
|
||||||
now := time.Now().UTC()
|
now := time.Now().UTC()
|
||||||
date := now.Format("2006-01-02")
|
date := now.Format("2006-01-02")
|
||||||
tod := now.Format("15:04:05")
|
tod := now.Format("150405")
|
||||||
ver := bundleVersion()
|
ver := bundleVersion()
|
||||||
model := serverModelForBundle()
|
model := serverModelForBundle()
|
||||||
sn := serverSerialForBundle()
|
sn := serverSerialForBundle()
|
||||||
|
|||||||
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
|
|||||||
defer cancel()
|
defer cancel()
|
||||||
return exec.CommandContext(ctx, name, args...).Output()
|
return exec.CommandContext(ctx, name, args...).Output()
|
||||||
}
|
}
|
||||||
|
|
||||||
func interfaceHasCarrier(iface string) bool {
|
|
||||||
raw, err := readNetCarrierFile(iface)
|
|
||||||
if err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return strings.TrimSpace(raw) == "1"
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if interfaceHasCarrier(iface) {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
enriched++
|
||||||
enriched++
|
continue
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
}
|
}
|
||||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||||
val := strings.TrimSpace(trimmed[idx+1:])
|
val := strings.TrimSpace(trimmed[idx+1:])
|
||||||
|
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
|
case key == "identifier":
|
||||||
|
s := parseSFPIdentifier(val)
|
||||||
|
dev.SFPIdentifier = &s
|
||||||
|
t := true
|
||||||
|
dev.SFPPresent = &t
|
||||||
|
changed = true
|
||||||
|
case key == "connector":
|
||||||
|
s := parseSFPConnector(val)
|
||||||
|
dev.SFPConnector = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor name":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPVendor = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor pn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPPartNumber = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor sn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPSerialNumber = &s
|
||||||
|
changed = true
|
||||||
|
case strings.Contains(key, "laser wavelength"):
|
||||||
|
if f, ok := firstFloat(val); ok {
|
||||||
|
dev.SFPWavelengthNM = &f
|
||||||
|
changed = true
|
||||||
|
}
|
||||||
case strings.Contains(key, "module temperature"):
|
case strings.Contains(key, "module temperature"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
dev.SFPTemperatureC = &f
|
dev.SFPTemperatureC = &f
|
||||||
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
return changed
|
return changed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseSFPIdentifier extracts the human-readable transceiver type from the
|
||||||
|
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
|
||||||
|
func parseSFPIdentifier(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseSFPConnector extracts the connector type from the raw ethtool line,
|
||||||
|
// e.g. "0x07 (LC)" → "LC".
|
||||||
|
func parseSFPConnector(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
|
||||||
|
|
||||||
|
func extractParens(s string) string {
|
||||||
|
m := parenRe.FindStringSubmatch(s)
|
||||||
|
if len(m) < 2 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(m[1])
|
||||||
|
}
|
||||||
|
|
||||||
func parseSFPDOM(raw string) map[string]any {
|
func parseSFPDOM(raw string) map[string]any {
|
||||||
dev := schema.HardwarePCIeDevice{}
|
dev := schema.HardwarePCIeDevice{}
|
||||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||||
return map[string]any{}
|
return map[string]any{}
|
||||||
}
|
}
|
||||||
out := map[string]any{}
|
out := map[string]any{}
|
||||||
|
if dev.SFPPresent != nil {
|
||||||
|
out["sfp_present"] = *dev.SFPPresent
|
||||||
|
}
|
||||||
|
if dev.SFPIdentifier != nil {
|
||||||
|
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||||
|
}
|
||||||
|
if dev.SFPConnector != nil {
|
||||||
|
out["sfp_connector"] = *dev.SFPConnector
|
||||||
|
}
|
||||||
|
if dev.SFPVendor != nil {
|
||||||
|
out["sfp_vendor"] = *dev.SFPVendor
|
||||||
|
}
|
||||||
|
if dev.SFPPartNumber != nil {
|
||||||
|
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||||
|
}
|
||||||
|
if dev.SFPSerialNumber != nil {
|
||||||
|
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||||
|
}
|
||||||
|
if dev.SFPWavelengthNM != nil {
|
||||||
|
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||||
|
}
|
||||||
if dev.SFPTemperatureC != nil {
|
if dev.SFPTemperatureC != nil {
|
||||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
|
|||||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
ethtoolModuleQuery = func(string) (string, error) {
|
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||||
t.Fatal("ethtool -m should not be called without carrier")
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
class := "EthernetController"
|
class := "EthernetController"
|
||||||
bdf := "0000:18:00.0"
|
bdf := "0000:18:00.0"
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ const nvidiaVendorID = 0x10de
|
|||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
Index int
|
Index int
|
||||||
BDF string
|
BDF string
|
||||||
|
Name string
|
||||||
Serial string
|
Serial string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
TemperatureC *float64
|
TemperatureC *float64
|
||||||
@@ -73,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(info.Name); v != "" {
|
||||||
|
devs[i].Model = &v
|
||||||
|
}
|
||||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||||
devs[i].SerialNumber = &v
|
devs[i].SerialNumber = &v
|
||||||
}
|
}
|
||||||
@@ -99,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||||
out, err := exec.Command(
|
out, err := exec.Command(
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
).Output()
|
).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -123,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
if len(rec) == 0 {
|
if len(rec) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(rec) < 13 {
|
if len(rec) < 14 {
|
||||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
|
||||||
}
|
}
|
||||||
|
|
||||||
bdf := normalizePCIeBDF(rec[1])
|
bdf := normalizePCIeBDF(rec[1])
|
||||||
@@ -135,17 +139,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
info := nvidiaGPUInfo{
|
info := nvidiaGPUInfo{
|
||||||
Index: parseRequiredInt(rec[0]),
|
Index: parseRequiredInt(rec[0]),
|
||||||
BDF: bdf,
|
BDF: bdf,
|
||||||
Serial: strings.TrimSpace(rec[2]),
|
Name: strings.TrimSpace(rec[2]),
|
||||||
VBIOS: strings.TrimSpace(rec[3]),
|
Serial: strings.TrimSpace(rec[3]),
|
||||||
TemperatureC: parseMaybeFloat(rec[4]),
|
VBIOS: strings.TrimSpace(rec[4]),
|
||||||
PowerW: parseMaybeFloat(rec[5]),
|
TemperatureC: parseMaybeFloat(rec[5]),
|
||||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
PowerW: parseMaybeFloat(rec[6]),
|
||||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
ECCUncorrected: parseMaybeInt64(rec[7]),
|
||||||
HWSlowdown: parseMaybeBool(rec[8]),
|
ECCCorrected: parseMaybeInt64(rec[8]),
|
||||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
HWSlowdown: parseMaybeBool(rec[9]),
|
||||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
|
||||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
PCIeLinkGenMax: parseMaybeInt(rec[11]),
|
||||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
PCIeLinkWidthCur: parseMaybeInt(rec[12]),
|
||||||
|
PCIeLinkWidthMax: parseMaybeInt(rec[13]),
|
||||||
}
|
}
|
||||||
result[bdf] = info
|
result[bdf] = info
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse failed: %v", err)
|
t.Fatalf("parse failed: %v", err)
|
||||||
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
|||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("gpu by normalized bdf not found")
|
t.Fatalf("gpu by normalized bdf not found")
|
||||||
}
|
}
|
||||||
|
if gpu.Name != "NVIDIA H100 80GB HBM3" {
|
||||||
|
t.Fatalf("name: got %q", gpu.Name)
|
||||||
|
}
|
||||||
if gpu.Serial != "GPU-SERIAL-1" {
|
if gpu.Serial != "GPU-SERIAL-1" {
|
||||||
t.Fatalf("serial: got %q", gpu.Serial)
|
t.Fatalf("serial: got %q", gpu.Serial)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package collector
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strconv"
|
"strconv"
|
||||||
@@ -79,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
|
||||||
|
// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
|
||||||
|
if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
|
||||||
|
bmcPatterns := []string{
|
||||||
|
"management system chip",
|
||||||
|
"management controller",
|
||||||
|
"ibmc",
|
||||||
|
"idrac",
|
||||||
|
"ilo vga",
|
||||||
|
"aspeed",
|
||||||
|
"matrox",
|
||||||
|
}
|
||||||
|
for _, bad := range bmcPatterns {
|
||||||
|
if strings.Contains(d, bad) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||||
internalAMDPatterns := []string{
|
internalAMDPatterns := []string{
|
||||||
"dummy function",
|
"dummy function",
|
||||||
@@ -153,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
|
|
||||||
// SVendor/SDevice available but not in schema — skip
|
// SVendor/SDevice available but not in schema — skip
|
||||||
|
|
||||||
|
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
return dev
|
return dev
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -222,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
|||||||
return value, true
|
return value, true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||||
|
// speed is below the maximum negotiated speed supported by both ends.
|
||||||
|
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||||
|
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||||
|
warn := statusWarning
|
||||||
|
dev.Status = &warn
|
||||||
|
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||||
|
dev.ErrorDescription = &desc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// pcieLinkSpeedRank returns a numeric rank for a normalized Gen string (e.g. "Gen4" → 4).
|
||||||
|
// Returns 0 for unrecognised values so comparisons fail safe.
|
||||||
|
func pcieLinkSpeedRank(gen string) int {
|
||||||
|
switch gen {
|
||||||
|
case "Gen1":
|
||||||
|
return 1
|
||||||
|
case "Gen2":
|
||||||
|
return 2
|
||||||
|
case "Gen3":
|
||||||
|
return 3
|
||||||
|
case "Gen4":
|
||||||
|
return 4
|
||||||
|
case "Gen5":
|
||||||
|
return 5
|
||||||
|
case "Gen6":
|
||||||
|
return 6
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func normalizePCILinkSpeed(raw string) string {
|
func normalizePCILinkSpeed(raw string) string {
|
||||||
raw = strings.TrimSpace(strings.ToLower(raw))
|
raw = strings.TrimSpace(strings.ToLower(raw))
|
||||||
switch {
|
switch {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -29,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
|||||||
{name: "raid", class: "RAID bus controller", want: true},
|
{name: "raid", class: "RAID bus controller", want: true},
|
||||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||||
{name: "vga", class: "VGA compatible controller", want: true},
|
{name: "vga", class: "VGA compatible controller", want: true},
|
||||||
|
{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
|
||||||
|
{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
|
||||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -139,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
|
||||||
|
ptr := func(s string) *string { return &s }
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
linkSpeed *string
|
||||||
|
maxSpeed *string
|
||||||
|
wantWarning bool
|
||||||
|
wantGenIn string // substring expected in ErrorDescription when warning
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "degraded Gen1 vs Gen5",
|
||||||
|
linkSpeed: ptr("Gen1"),
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: true,
|
||||||
|
wantGenIn: "Gen1",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "at max Gen5",
|
||||||
|
linkSpeed: ptr("Gen5"),
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "degraded Gen4 vs Gen5",
|
||||||
|
linkSpeed: ptr("Gen4"),
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: true,
|
||||||
|
wantGenIn: "Gen4",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing current speed — no warning",
|
||||||
|
linkSpeed: nil,
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing max speed — no warning",
|
||||||
|
linkSpeed: ptr("Gen1"),
|
||||||
|
maxSpeed: nil,
|
||||||
|
wantWarning: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
ok := statusOK
|
||||||
|
dev.Status = &ok
|
||||||
|
dev.LinkSpeed = tt.linkSpeed
|
||||||
|
dev.MaxLinkSpeed = tt.maxSpeed
|
||||||
|
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
|
gotWarn := dev.Status != nil && *dev.Status == statusWarning
|
||||||
|
if gotWarn != tt.wantWarning {
|
||||||
|
t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
|
||||||
|
}
|
||||||
|
if tt.wantWarning {
|
||||||
|
if dev.ErrorDescription == nil {
|
||||||
|
t.Fatal("expected ErrorDescription to be set")
|
||||||
|
}
|
||||||
|
if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
|
||||||
|
t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if dev.ErrorDescription != nil {
|
||||||
|
t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -2,25 +2,15 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"regexp"
|
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||||
return renderBenchmarkReportWithCharts(result, nil)
|
return renderBenchmarkReportWithCharts(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
type benchmarkReportChart struct {
|
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||||
Title string
|
|
||||||
Content string
|
|
||||||
}
|
|
||||||
|
|
||||||
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
|
||||||
|
|
||||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
|
|
||||||
// ── Header ────────────────────────────────────────────────────────────────
|
// ── Header ────────────────────────────────────────────────────────────────
|
||||||
@@ -58,11 +48,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||||
if result.ParallelGPUs {
|
if result.RampStep > 0 && result.RampTotal > 0 {
|
||||||
|
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||||||
|
if result.RampRunID != "" {
|
||||||
|
fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
|
||||||
|
}
|
||||||
|
} else if result.ParallelGPUs {
|
||||||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||||
}
|
}
|
||||||
|
if result.ScalabilityScore > 0 {
|
||||||
|
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||||
|
}
|
||||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
@@ -83,14 +81,32 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Methodology ───────────────────────────────────────────────────────────
|
||||||
|
b.WriteString("## Methodology\n\n")
|
||||||
|
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
|
||||||
|
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
|
||||||
|
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
|
||||||
|
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
|
||||||
|
b.WriteString("**Compute score** is derived from two phases:\n\n")
|
||||||
|
b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
|
||||||
|
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
|
||||||
|
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
|
||||||
|
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
|
||||||
|
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
|
||||||
|
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
|
||||||
|
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
|
||||||
|
b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
|
||||||
|
b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
|
||||||
|
b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
|
||||||
|
|
||||||
// ── Scorecard table ───────────────────────────────────────────────────────
|
// ── Scorecard table ───────────────────────────────────────────────────────
|
||||||
b.WriteString("## Scorecard\n\n")
|
b.WriteString("## Scorecard\n\n")
|
||||||
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||||||
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
|
b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
name := strings.TrimSpace(gpu.Name)
|
name := strings.TrimSpace(gpu.Name)
|
||||||
if name == "" {
|
if name == "" {
|
||||||
name = "Unknown"
|
name = "Unknown GPU"
|
||||||
}
|
}
|
||||||
interconnect := "-"
|
interconnect := "-"
|
||||||
if gpu.Scores.InterconnectScore > 0 {
|
if gpu.Scores.InterconnectScore > 0 {
|
||||||
@@ -100,11 +116,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
|
synthetic := "-"
|
||||||
|
if gpu.Scores.SyntheticScore > 0 {
|
||||||
|
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||||
|
}
|
||||||
|
mixed := "-"
|
||||||
|
if gpu.Scores.MixedScore > 0 {
|
||||||
|
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||||
|
}
|
||||||
|
mixedEff := "-"
|
||||||
|
if gpu.Scores.MixedEfficiency > 0 {
|
||||||
|
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
|
||||||
gpu.Index, name,
|
gpu.Index, name,
|
||||||
gpu.Status,
|
gpu.Status,
|
||||||
gpu.Scores.CompositeScore,
|
gpu.Scores.CompositeScore,
|
||||||
gpu.Scores.ComputeScore,
|
gpu.Scores.ComputeScore,
|
||||||
|
synthetic,
|
||||||
|
mixed,
|
||||||
|
mixedEff,
|
||||||
topsPerSM,
|
topsPerSM,
|
||||||
gpu.Scores.PowerSustainScore,
|
gpu.Scores.PowerSustainScore,
|
||||||
gpu.Scores.ThermalSustainScore,
|
gpu.Scores.ThermalSustainScore,
|
||||||
@@ -139,6 +170,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
if gpu.PowerLimitW > 0 {
|
if gpu.PowerLimitW > 0 {
|
||||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||||
}
|
}
|
||||||
|
if gpu.PowerLimitDerated {
|
||||||
|
fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
|
||||||
|
}
|
||||||
|
if gpu.CalibratedPeakPowerW > 0 {
|
||||||
|
if gpu.CalibratedPeakTempC > 0 {
|
||||||
|
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||||||
|
}
|
||||||
|
}
|
||||||
if gpu.LockedGraphicsClockMHz > 0 {
|
if gpu.LockedGraphicsClockMHz > 0 {
|
||||||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||||
}
|
}
|
||||||
@@ -154,6 +195,38 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// Per-precision stability phases.
|
||||||
|
if len(gpu.PrecisionSteady) > 0 {
|
||||||
|
b.WriteString("**Per-precision stability:**\n\n")
|
||||||
|
b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
|
||||||
|
for _, p := range gpu.PrecisionSteady {
|
||||||
|
eccCorr := "—"
|
||||||
|
eccUncorr := "—"
|
||||||
|
if !p.ECC.IsZero() {
|
||||||
|
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||||||
|
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||||||
|
}
|
||||||
|
status := p.Status
|
||||||
|
if strings.TrimSpace(status) == "" {
|
||||||
|
status = "OK"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
||||||
|
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
||||||
|
eccCorr, eccUncorr)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
} else {
|
||||||
|
// Legacy: show combined-window variance.
|
||||||
|
fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
|
||||||
|
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ECC summary
|
||||||
|
if !gpu.ECC.IsZero() {
|
||||||
|
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
|
||||||
|
gpu.ECC.Corrected, gpu.ECC.Uncorrected)
|
||||||
|
}
|
||||||
|
|
||||||
// Throttle
|
// Throttle
|
||||||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||||||
if throttle != "none" {
|
if throttle != "none" {
|
||||||
@@ -163,12 +236,14 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
// Precision results
|
// Precision results
|
||||||
if len(gpu.PrecisionResults) > 0 {
|
if len(gpu.PrecisionResults) > 0 {
|
||||||
b.WriteString("**Precision results:**\n\n")
|
b.WriteString("**Precision results:**\n\n")
|
||||||
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
|
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
|
||||||
for _, p := range gpu.PrecisionResults {
|
for _, p := range gpu.PrecisionResults {
|
||||||
if p.Supported {
|
if p.Supported {
|
||||||
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
|
weightStr := fmt.Sprintf("×%.3g", p.Weight)
|
||||||
|
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
|
||||||
|
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
|
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
@@ -229,61 +304,41 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Terminal charts (steady-state only) ───────────────────────────────────
|
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||||
if len(charts) > 0 {
|
if cooling := result.Cooling; cooling != nil {
|
||||||
b.WriteString("## Steady-State Charts\n\n")
|
b.WriteString("## Cooling\n\n")
|
||||||
for _, chart := range charts {
|
if cooling.Available {
|
||||||
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
||||||
if content == "" {
|
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
||||||
continue
|
if cooling.FanDutyCycleAvailable {
|
||||||
|
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
||||||
|
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
||||||
|
} else {
|
||||||
|
b.WriteString("| Average fan duty cycle | N/A |\n")
|
||||||
|
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
|
b.WriteString("\n")
|
||||||
|
} else {
|
||||||
|
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||||
|
}
|
||||||
|
for _, note := range cooling.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
if len(cooling.Notes) > 0 {
|
||||||
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Methodology ───────────────────────────────────────────────────────────
|
|
||||||
b.WriteString("## Methodology\n\n")
|
|
||||||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
|
|
||||||
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
|
|
||||||
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
|
||||||
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
|
|
||||||
|
|
||||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||||
b.WriteString("## Raw Files\n\n")
|
b.WriteString("## Raw Files\n\n")
|
||||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||||
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
|
b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
|
||||||
b.WriteString("- `gpu-*-warmup.log`\n")
|
|
||||||
b.WriteString("- `gpu-*-steady.log`\n")
|
|
||||||
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
|
|
||||||
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
|
|
||||||
if result.Interconnect != nil {
|
if result.Interconnect != nil {
|
||||||
b.WriteString("- `nccl-all-reduce.log`\n")
|
b.WriteString("- `nccl-all-reduce.log`\n")
|
||||||
}
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
|
|
||||||
// cooldown charts are not useful for human review).
|
|
||||||
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
|
||||||
var charts []benchmarkReportChart
|
|
||||||
for _, idx := range gpuIndices {
|
|
||||||
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
|
|
||||||
raw, err := os.ReadFile(path)
|
|
||||||
if err != nil || len(raw) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
charts = append(charts, benchmarkReportChart{
|
|
||||||
Title: fmt.Sprintf("GPU %d — Steady State", idx),
|
|
||||||
Content: string(raw),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
return charts
|
|
||||||
}
|
|
||||||
|
|
||||||
func stripANSIEscapeSequences(raw string) string {
|
|
||||||
return ansiEscapePattern.ReplaceAllString(raw, "")
|
|
||||||
}
|
|
||||||
|
|
||||||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||||
// the steady-state window. Only non-zero counters are shown. When the steady
|
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||||
// duration is unknown (0), raw seconds are shown instead.
|
// duration is unknown (0), raw seconds are shown instead.
|
||||||
@@ -323,6 +378,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64)
|
|||||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||||
|
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||||
|
|||||||
@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
|||||||
{
|
{
|
||||||
name: "default",
|
name: "default",
|
||||||
profile: "",
|
profile: "",
|
||||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stability",
|
name: "stability",
|
||||||
profile: "stability",
|
profile: "stability",
|
||||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "overnight",
|
name: "overnight",
|
||||||
profile: "overnight",
|
profile: "overnight",
|
||||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -41,6 +41,129 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||||
|
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
|
||||||
|
benchmarkPrecisionPhases,
|
||||||
|
func(label string) string { return label },
|
||||||
|
)
|
||||||
|
if len(labels) != 7 || len(phases) != 7 {
|
||||||
|
t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
|
||||||
|
}
|
||||||
|
if basePhaseSec != 60 {
|
||||||
|
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
|
||||||
|
}
|
||||||
|
if mixedPhaseSec != 300 {
|
||||||
|
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
|
||||||
|
}
|
||||||
|
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
|
||||||
|
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
|
||||||
|
}
|
||||||
|
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
|
||||||
|
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||||
|
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
|
||||||
|
benchmarkPrecisionPhases,
|
||||||
|
func(label string) string { return label },
|
||||||
|
)
|
||||||
|
if basePhaseSec != 300 {
|
||||||
|
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
|
||||||
|
}
|
||||||
|
if mixedPhaseSec != 3600 {
|
||||||
|
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
|
||||||
|
}
|
||||||
|
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
|
||||||
|
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||||
|
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
|
||||||
|
benchmarkPrecisionPhases,
|
||||||
|
func(label string) string { return label },
|
||||||
|
)
|
||||||
|
if basePhaseSec != 3600 {
|
||||||
|
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
|
||||||
|
}
|
||||||
|
if mixedPhaseSec != 14400 {
|
||||||
|
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
|
||||||
|
}
|
||||||
|
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
|
||||||
|
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
phases := []benchmarkPlannedPhase{
|
||||||
|
{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
|
||||||
|
{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
|
||||||
|
{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
|
||||||
|
}
|
||||||
|
rows := []GPUMetricRow{
|
||||||
|
{ElapsedSec: 5},
|
||||||
|
{ElapsedSec: 15},
|
||||||
|
{ElapsedSec: 25},
|
||||||
|
{ElapsedSec: 65},
|
||||||
|
}
|
||||||
|
got := splitBenchmarkRowsByPlannedPhase(rows, phases)
|
||||||
|
if len(got["fp8"]) != 1 {
|
||||||
|
t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
|
||||||
|
}
|
||||||
|
if len(got["fp16"]) != 1 {
|
||||||
|
t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
|
||||||
|
}
|
||||||
|
if len(got["mixed"]) != 2 {
|
||||||
|
t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
|
||||||
|
t.Fatalf("supported=%v", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
|
||||||
|
t.Fatalf("supported=%v", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
raw string
|
||||||
|
wantStatus string
|
||||||
|
}{
|
||||||
|
{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
|
||||||
|
{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
|
||||||
|
{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
|
||||||
|
if got != tc.wantStatus {
|
||||||
|
t.Fatalf("status=%q want %q", got, tc.wantStatus)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -65,8 +188,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
|||||||
"[gpu 0] compute_capability=9.0",
|
"[gpu 0] compute_capability=9.0",
|
||||||
"[gpu 0] backend=cublasLt",
|
"[gpu 0] backend=cublasLt",
|
||||||
"[gpu 0] duration_s=10",
|
"[gpu 0] duration_s=10",
|
||||||
|
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
|
||||||
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||||
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] int8_tensor_iterations=80",
|
||||||
"[gpu 0] fp16_tensor_iterations=200",
|
"[gpu 0] fp16_tensor_iterations=200",
|
||||||
"[gpu 0] fp8_e4m3_iterations=50",
|
"[gpu 0] fp8_e4m3_iterations=50",
|
||||||
"[gpu 0] status=OK",
|
"[gpu 0] status=OK",
|
||||||
@@ -79,15 +204,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
|||||||
if got.ComputeCapability != "9.0" {
|
if got.ComputeCapability != "9.0" {
|
||||||
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||||
}
|
}
|
||||||
if len(got.Profiles) != 2 {
|
if len(got.Profiles) != 3 {
|
||||||
t.Fatalf("profiles=%d want 2", len(got.Profiles))
|
t.Fatalf("profiles=%d want 3", len(got.Profiles))
|
||||||
}
|
}
|
||||||
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||||
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||||
}
|
}
|
||||||
|
if got.Profiles[0].Category != "fp16_bf16" {
|
||||||
|
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
|
||||||
|
}
|
||||||
if got.Profiles[1].Category != "fp8" {
|
if got.Profiles[1].Category != "fp8" {
|
||||||
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||||
}
|
}
|
||||||
|
if got.Profiles[2].Category != "int8" {
|
||||||
|
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
|
||||||
|
}
|
||||||
|
if got.Profiles[2].Weight != 0.25 {
|
||||||
|
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||||
@@ -131,6 +265,13 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
|||||||
DegradationReasons: []string{"power_capped"},
|
DegradationReasons: []string{"power_capped"},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
Cooling: &BenchmarkCoolingSummary{
|
||||||
|
Available: true,
|
||||||
|
AvgFanRPM: 9200,
|
||||||
|
FanDutyCycleAvailable: true,
|
||||||
|
AvgFanDutyCyclePct: 47.5,
|
||||||
|
P95FanDutyCyclePct: 62.0,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
report := renderBenchmarkReport(result)
|
report := renderBenchmarkReport(result)
|
||||||
@@ -140,6 +281,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
|||||||
"1176.00",
|
"1176.00",
|
||||||
"fp16_tensor",
|
"fp16_tensor",
|
||||||
"700.00",
|
"700.00",
|
||||||
|
"Cooling",
|
||||||
|
"Average fan duty cycle",
|
||||||
|
"47.5%",
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(report, needle) {
|
if !strings.Contains(report, needle) {
|
||||||
t.Fatalf("report missing %q\n%s", needle, report)
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
@@ -147,34 +291,89 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
|
report := renderBenchmarkReport(NvidiaBenchmarkResult{
|
||||||
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
OverallStatus: "OK",
|
OverallStatus: "OK",
|
||||||
SelectedGPUIndices: []int{0},
|
SelectedGPUIndices: []int{0},
|
||||||
Normalization: BenchmarkNormalization{
|
Normalization: BenchmarkNormalization{
|
||||||
Status: "full",
|
Status: "full",
|
||||||
},
|
},
|
||||||
}, []benchmarkReportChart{
|
|
||||||
{
|
|
||||||
Title: "GPU 0 Steady State",
|
|
||||||
Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
|
|
||||||
},
|
|
||||||
})
|
})
|
||||||
|
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
"Steady-State Charts",
|
"gpu-metrics.csv",
|
||||||
"GPU 0 Steady State",
|
"gpu-metrics.html",
|
||||||
"GPU 0 chart",
|
"gpu-burn.log",
|
||||||
"42┤───",
|
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(report, needle) {
|
if !strings.Contains(report, needle) {
|
||||||
t.Fatalf("report missing %q\n%s", needle, report)
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if strings.Contains(report, "\x1b[31m") {
|
}
|
||||||
t.Fatalf("report should not contain ANSI escapes\n%s", report)
|
|
||||||
|
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
nvsmiQ := []byte(`
|
||||||
|
GPU 00000000:4E:00.0
|
||||||
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Clocks
|
||||||
|
Graphics : 2422 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 2430 MHz
|
||||||
|
SM : 2430 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
Video : 2107 MHz
|
||||||
|
|
||||||
|
GPU 00000000:4F:00.0
|
||||||
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 2430 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
`)
|
||||||
|
|
||||||
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
|
0: {Index: 0, BusID: "00000000:4E:00.0"},
|
||||||
|
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||||
|
}
|
||||||
|
|
||||||
|
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxMemoryClockMHz != 12481 {
|
||||||
|
t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||||
|
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
nvsmiQ := []byte(`
|
||||||
|
GPU 00000000:4E:00.0
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 9999 MHz
|
||||||
|
Memory : 9999 MHz
|
||||||
|
`)
|
||||||
|
// Already populated — must not be overwritten.
|
||||||
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
|
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
||||||
|
}
|
||||||
|
|
||||||
|
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,40 @@ package platform
|
|||||||
|
|
||||||
import "time"
|
import "time"
|
||||||
|
|
||||||
|
// BenchmarkHostConfig records the host's static CPU and memory configuration
// as captured at benchmark start, so that results from runs on different
// hardware can be correlated.
//
// Every field carries omitempty: a field left at its zero value does not
// appear in the JSON encoding at all.
type BenchmarkHostConfig struct {
	// CPU identification and topology counts.
	CPUModel   string `json:"cpu_model,omitempty"`
	CPUSockets int    `json:"cpu_sockets,omitempty"`
	CPUCores   int    `json:"cpu_cores,omitempty"`
	CPUThreads int    `json:"cpu_threads,omitempty"`
	// MemTotalGiB is the total memory size (GiB, per the field name).
	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkCPULoad is a summary of host CPU utilisation sampled while the GPU
// steady-state phase was running. A high or unstable CPU load during a GPU
// benchmark may point to a competing workload or to a CPU-bound driver
// bottleneck.
//
// All statistics and Status are always present in the JSON encoding; only
// Note is dropped when empty.
type BenchmarkCPULoad struct {
	// Utilisation statistics in percent, and the number of samples taken.
	AvgPct  float64 `json:"avg_pct"`
	MaxPct  float64 `json:"max_pct"`
	P95Pct  float64 `json:"p95_pct"`
	Samples int     `json:"samples"`
	// Status is "ok", "high", or "unstable".
	Status string `json:"status"`
	// Note is optional free text accompanying Status.
	Note string `json:"note,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkCoolingSummary holds fan telemetry averaged across the full
// benchmark run.
//
// Available is the only field that is always serialized; every other field is
// omitted from JSON while it holds its zero value.
type BenchmarkCoolingSummary struct {
	// Available is serialized even when false.
	Available bool `json:"available"`
	// AvgFanRPM is the average fan speed in RPM.
	AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
	// FanDutyCycleAvailable presumably gates the two duty-cycle statistics
	// below — confirm against the code that fills this struct.
	FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
	// Fan duty-cycle statistics, in percent.
	AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
	P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
	// Notes carries optional free-text remarks.
	Notes []string `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
NvidiaBenchmarkProfileStandard = "standard"
|
NvidiaBenchmarkProfileStandard = "standard"
|
||||||
NvidiaBenchmarkProfileStability = "stability"
|
NvidiaBenchmarkProfileStability = "stability"
|
||||||
@@ -14,10 +48,12 @@ type NvidiaBenchmarkOptions struct {
|
|||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
RunNCCL bool
|
RunNCCL bool
|
||||||
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||||
|
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
||||||
|
RampTotal int // total number of ramp-up steps in this run
|
||||||
|
RampRunID string // shared identifier across all steps of the same ramp-up run
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
type NvidiaBenchmarkResult struct {
|
type NvidiaBenchmarkResult struct {
|
||||||
BenchmarkVersion string `json:"benchmark_version"`
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
GeneratedAt time.Time `json:"generated_at"`
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
@@ -25,11 +61,18 @@ type NvidiaBenchmarkResult struct {
|
|||||||
ServerModel string `json:"server_model,omitempty"`
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile"`
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
|
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
Warnings []string `json:"warnings,omitempty"`
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
Normalization BenchmarkNormalization `json:"normalization"`
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
|
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||||
|
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||||
|
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
@@ -52,30 +95,45 @@ type BenchmarkNormalizationGPU struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkGPUResult struct {
|
type BenchmarkGPUResult struct {
|
||||||
Index int `json:"index"`
|
Index int `json:"index"`
|
||||||
UUID string `json:"uuid,omitempty"`
|
UUID string `json:"uuid,omitempty"`
|
||||||
Name string `json:"name,omitempty"`
|
Name string `json:"name,omitempty"`
|
||||||
BusID string `json:"bus_id,omitempty"`
|
BusID string `json:"bus_id,omitempty"`
|
||||||
VBIOS string `json:"vbios,omitempty"`
|
VBIOS string `json:"vbios,omitempty"`
|
||||||
ComputeCapability string `json:"compute_capability,omitempty"`
|
ComputeCapability string `json:"compute_capability,omitempty"`
|
||||||
Backend string `json:"backend,omitempty"`
|
Backend string `json:"backend,omitempty"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
// dcgmi targeted_power calibration run before the main benchmark.
|
||||||
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
// Used as the reference denominator for PowerSustainScore instead of
|
||||||
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||||
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
|
||||||
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
|
||||||
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||||
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||||
Scores BenchmarkScorecard `json:"scores"`
|
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||||
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||||
|
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||||
|
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||||
|
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
|
||||||
|
PrecisionFailures []string `json:"precision_failures,omitempty"`
|
||||||
|
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||||
|
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||||
|
// ECC error delta accumulated over the full benchmark (all phases combined).
|
||||||
|
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||||
|
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
||||||
|
Scores BenchmarkScorecard `json:"scores"`
|
||||||
|
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// CoolingWarning is non-empty when a thermal throttle event occurred with
|
||||||
|
// a clock drop ≥20% while server fans were not at 100% duty cycle.
|
||||||
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkTelemetrySummary struct {
|
type BenchmarkTelemetrySummary struct {
|
||||||
@@ -105,6 +163,18 @@ type BenchmarkThrottleCounters struct {
|
|||||||
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
|
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// BenchmarkECCCounters is a pair of ECC error counts sampled at one point in
// time.
//
//   - Corrected counts single-bit errors that ECC fixed (DRAM degradation).
//   - Uncorrected counts double-bit errors that could not be corrected
//     (serious fault).
//
// Both counters are volatile (counted since the last driver reset), not
// persistent.
type BenchmarkECCCounters struct {
	Corrected   uint64 `json:"corrected"`
	Uncorrected uint64 `json:"uncorrected"`
}

// Total returns the sum of the corrected and uncorrected counts. The addition
// is plain uint64 arithmetic, so it wraps around on overflow.
func (e BenchmarkECCCounters) Total() uint64 {
	return e.Uncorrected + e.Corrected
}

// IsZero reports whether both counters are zero.
func (e BenchmarkECCCounters) IsZero() bool {
	return e == (BenchmarkECCCounters{})
}
|
||||||
|
|
||||||
type BenchmarkPrecisionResult struct {
|
type BenchmarkPrecisionResult struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Category string `json:"category"`
|
Category string `json:"category"`
|
||||||
@@ -115,19 +185,31 @@ type BenchmarkPrecisionResult struct {
|
|||||||
K uint64 `json:"k,omitempty"`
|
K uint64 `json:"k,omitempty"`
|
||||||
Iterations uint64 `json:"iterations,omitempty"`
|
Iterations uint64 `json:"iterations,omitempty"`
|
||||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||||
Notes string `json:"notes,omitempty"`
|
// Weight is the fp32-equivalence factor for this precision category.
|
||||||
|
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
|
||||||
|
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||||
|
Weight float64 `json:"weight,omitempty"`
|
||||||
|
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||||
|
Notes string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkScorecard struct {
|
type BenchmarkScorecard struct {
|
||||||
ComputeScore float64 `json:"compute_score"`
|
ComputeScore float64 `json:"compute_score"`
|
||||||
|
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
|
||||||
|
// steady phases (each precision ran alone, full GPU dedicated).
|
||||||
|
SyntheticScore float64 `json:"synthetic_score,omitempty"`
|
||||||
|
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
|
||||||
|
// (all precisions competing simultaneously — closer to real workloads).
|
||||||
|
MixedScore float64 `json:"mixed_score,omitempty"`
|
||||||
|
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
|
||||||
|
// sustains throughput under concurrent mixed-precision load.
|
||||||
|
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||||
PowerSustainScore float64 `json:"power_sustain_score"`
|
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||||
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||||
StabilityScore float64 `json:"stability_score"`
|
StabilityScore float64 `json:"stability_score"`
|
||||||
InterconnectScore float64 `json:"interconnect_score"`
|
InterconnectScore float64 `json:"interconnect_score"`
|
||||||
CompositeScore float64 `json:"composite_score"`
|
CompositeScore float64 `json:"composite_score"`
|
||||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||||
// Comparable across throttle levels and GPU generations. Low value at normal
|
|
||||||
// clocks indicates silicon degradation.
|
|
||||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -145,6 +227,22 @@ type BenchmarkServerPower struct {
|
|||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||||
|
// during a dedicated single-precision steady window. Because only one kernel
|
||||||
|
// type runs at a time the PowerCVPct here is a genuine stability signal.
|
||||||
|
type BenchmarkPrecisionSteadyPhase struct {
|
||||||
|
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||||
|
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||||
|
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||||
|
// ECC errors accumulated during this precision phase only.
|
||||||
|
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
|
||||||
|
// Any uncorrected = serious fault triggered by this precision workload.
|
||||||
|
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||||
|
Notes string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type BenchmarkInterconnectResult struct {
|
type BenchmarkInterconnectResult struct {
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
Attempted bool `json:"attempted"`
|
Attempted bool `json:"attempted"`
|
||||||
@@ -156,3 +254,45 @@ type BenchmarkInterconnectResult struct {
|
|||||||
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type NvidiaPowerBenchResult struct {
|
||||||
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
|
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||||
|
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||||
|
OverallStatus string `json:"overall_status"`
|
||||||
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaPowerBenchGPU is the element type of NvidiaPowerBenchResult.GPUs.
// Only Index and Status are always serialized; every other field carries
// omitempty and is dropped from the JSON when it holds its zero value.
// NOTE(review): units are presumably watts (…W) and degrees Celsius (…C), as
// the field suffixes suggest — confirm against the producer.
type NvidiaPowerBenchGPU struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
Name string `json:"name,omitempty"`
|
||||||
|
BusID string `json:"bus_id,omitempty"`
|
||||||
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||||
|
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||||
|
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||||
|
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||||
|
Derated bool `json:"derated,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||||
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaPowerBenchStep is the element type of NvidiaPowerBenchResult.RampSteps.
// StepIndex, GPUIndices and Status are always serialized; the power,
// realization, derated-count and notes fields carry omitempty.
// NOTE(review): "realization pct" presumably relates observed power to the
// applied limit — the computation is not visible here; confirm.
type NvidiaPowerBenchStep struct {
|
||||||
|
StepIndex int `json:"step_index"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||||
|
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||||
|
MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"`
|
||||||
|
AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"`
|
||||||
|
DeratedGPUCount int `json:"derated_gpu_count,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|||||||
@@ -13,14 +13,20 @@ import (
|
|||||||
|
|
||||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||||
type GPUMetricRow struct {
|
type GPUMetricRow struct {
|
||||||
ElapsedSec float64 `json:"elapsed_sec"`
|
// Stage names the stage this sample belongs to. buildGPUMetricStageSpans
// trims it and treats an empty value as "run".
Stage string `json:"stage,omitempty"`
|
||||||
GPUIndex int `json:"index"`
|
// StageStartSec and StageEndSec bound the stage. When StageEndSec is not
// greater than StageStartSec, buildGPUMetricStageSpans falls back to
// ElapsedSec for both ends.
StageStartSec float64 `json:"stage_start_sec,omitempty"`
|
||||||
TempC float64 `json:"temp_c"`
|
StageEndSec float64 `json:"stage_end_sec,omitempty"`
|
||||||
UsagePct float64 `json:"usage_pct"`
|
ElapsedSec float64 `json:"elapsed_sec"`
|
||||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
GPUIndex int `json:"index"`
|
||||||
PowerW float64 `json:"power_w"`
|
TempC float64 `json:"temp_c"`
|
||||||
ClockMHz float64 `json:"clock_mhz"`
|
UsagePct float64 `json:"usage_pct"`
|
||||||
MemClockMHz float64 `json:"mem_clock_mhz"`
|
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||||
|
PowerW float64 `json:"power_w"`
|
||||||
|
ClockMHz float64 `json:"clock_mhz"`
|
||||||
|
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||||
|
// The fan fields are optional in JSON (omitempty). WriteGPUMetricsCSV
// writes FanDutyCycleAvailable as 0 or 1.
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||||
|
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||||
|
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
@@ -141,14 +147,24 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
|||||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
|
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
|
||||||
for _, r := range rows {
|
for _, r := range rows {
|
||||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
|
dutyAvail := 0
|
||||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
|
if r.FanDutyCycleAvailable {
|
||||||
|
dutyAvail = 1
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
|
||||||
|
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
|
||||||
}
|
}
|
||||||
return os.WriteFile(path, b.Bytes(), 0644)
|
return os.WriteFile(path, b.Bytes(), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// gpuMetricStageSpan is a named interval on the chart's time axis.
// buildGPUMetricStageSpans derives these from the stage fields of
// GPUMetricRow, and drawGPUChartSVG shades one background rectangle per span
// between Start and End. Start and End hold values taken from the row's
// *Sec fields.
type gpuMetricStageSpan struct {
|
||||||
|
Name string
|
||||||
|
Start float64
|
||||||
|
End float64
|
||||||
|
}
|
||||||
|
|
||||||
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||||
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||||
// Group by GPU index preserving order.
|
// Group by GPU index preserving order.
|
||||||
@@ -163,9 +179,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
|||||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
stageSpans := buildGPUMetricStageSpans(rows)
|
||||||
|
stageColorByName := make(map[string]string, len(stageSpans))
|
||||||
|
for i, span := range stageSpans {
|
||||||
|
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
|
||||||
|
}
|
||||||
|
|
||||||
|
var legend strings.Builder
|
||||||
|
if len(stageSpans) > 0 {
|
||||||
|
legend.WriteString(`<div class="stage-legend">`)
|
||||||
|
for _, span := range stageSpans {
|
||||||
|
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
|
||||||
|
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
|
||||||
|
}
|
||||||
|
legend.WriteString(`</div>`)
|
||||||
|
}
|
||||||
|
|
||||||
var svgs strings.Builder
|
var svgs strings.Builder
|
||||||
for _, gpuIdx := range order {
|
for _, gpuIdx := range order {
|
||||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
|
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
|
||||||
svgs.WriteString("\n")
|
svgs.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -175,21 +207,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
|||||||
<meta charset="utf-8">
|
<meta charset="utf-8">
|
||||||
<title>GPU Stress Test Metrics</title>
|
<title>GPU Stress Test Metrics</title>
|
||||||
<style>
|
<style>
|
||||||
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
|
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
|
||||||
h1 { text-align: center; color: #333; margin: 0 0 8px; }
|
*{box-sizing:border-box}
|
||||||
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
|
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
|
||||||
|
.page{padding:24px}
|
||||||
|
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
|
||||||
|
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
|
||||||
|
.card-body{padding:16px}
|
||||||
|
h1{font-size:22px;margin:0 0 6px}
|
||||||
|
p{color:var(--muted);font-size:13px;margin:0 0 16px}
|
||||||
|
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
|
||||||
|
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
|
||||||
|
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
|
||||||
|
.chart-block{margin-top:16px}
|
||||||
</style>
|
</style>
|
||||||
</head><body>
|
</head><body>
|
||||||
|
<div class="page">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">GPU Stress Test Metrics</div>
|
||||||
|
<div class="card-body">
|
||||||
<h1>GPU Stress Test Metrics</h1>
|
<h1>GPU Stress Test Metrics</h1>
|
||||||
<p>Generated %s</p>
|
<p>Generated %s</p>
|
||||||
%s
|
%s
|
||||||
</body></html>`, ts, svgs.String())
|
<div class="chart-block">%s</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body></html>`, ts, legend.String(), svgs.String())
|
||||||
|
|
||||||
return os.WriteFile(path, []byte(html), 0644)
|
return os.WriteFile(path, []byte(html), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
|
||||||
// Layout
|
// Layout
|
||||||
const W, H = 960, 520
|
const W, H = 960, 520
|
||||||
const plotX1 = 120 // usage axis / chart left border
|
const plotX1 = 120 // usage axis / chart left border
|
||||||
@@ -284,6 +334,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
}
|
}
|
||||||
b.WriteString("</g>\n")
|
b.WriteString("</g>\n")
|
||||||
|
|
||||||
|
// Stage backgrounds
|
||||||
|
for _, span := range stageSpans {
|
||||||
|
x1 := xv(span.Start)
|
||||||
|
x2 := xv(span.End)
|
||||||
|
if x2 < x1 {
|
||||||
|
x1, x2 = x2, x1
|
||||||
|
}
|
||||||
|
if x2-x1 < 1 {
|
||||||
|
x2 = x1 + 1
|
||||||
|
}
|
||||||
|
color := stageColorByName[span.Name]
|
||||||
|
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
|
||||||
|
x1, plotY1, x2-x1, PH, color)
|
||||||
|
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
|
||||||
|
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
|
||||||
|
}
|
||||||
|
|
||||||
// Chart border
|
// Chart border
|
||||||
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
||||||
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
||||||
@@ -382,221 +449,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
|
||||||
ansiAmber = "\033[38;5;214m"
|
|
||||||
ansiReset = "\033[0m"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
termChartWidth = 70
|
|
||||||
termChartHeight = 12
|
|
||||||
)
|
|
||||||
|
|
||||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
|
||||||
// Used in SAT stress-test logs.
|
|
||||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
|
||||||
seen := make(map[int]bool)
|
|
||||||
var order []int
|
|
||||||
gpuMap := make(map[int][]GPUMetricRow)
|
|
||||||
for _, r := range rows {
|
|
||||||
if !seen[r.GPUIndex] {
|
|
||||||
seen[r.GPUIndex] = true
|
|
||||||
order = append(order, r.GPUIndex)
|
|
||||||
}
|
|
||||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
|
||||||
}
|
|
||||||
|
|
||||||
type seriesDef struct {
|
|
||||||
caption string
|
|
||||||
color string
|
|
||||||
fn func(GPUMetricRow) float64
|
|
||||||
}
|
|
||||||
defs := []seriesDef{
|
|
||||||
{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
|
|
||||||
{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
|
||||||
{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
|
|
||||||
{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
|
||||||
}
|
|
||||||
|
|
||||||
var b strings.Builder
|
|
||||||
for _, gpuIdx := range order {
|
|
||||||
gr := gpuMap[gpuIdx]
|
|
||||||
if len(gr) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
|
|
||||||
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
|
|
||||||
for _, d := range defs {
|
|
||||||
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
|
|
||||||
termChartHeight, termChartWidth))
|
|
||||||
b.WriteRune('\n')
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return strings.TrimRight(b.String(), "\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
|
||||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
|
||||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
|
||||||
if len(vals) == 0 {
|
|
||||||
return caption + "\n"
|
|
||||||
}
|
|
||||||
|
|
||||||
mn, mx := gpuMinMax(vals)
|
|
||||||
if mn == mx {
|
|
||||||
mx = mn + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use the smaller of width or len(vals) to avoid stretching sparse data.
|
|
||||||
w := width
|
|
||||||
if len(vals) < w {
|
|
||||||
w = len(vals)
|
|
||||||
}
|
|
||||||
data := gpuDownsample(vals, w)
|
|
||||||
|
|
||||||
// row[i] = display row index: 0 = top = max value, height = bottom = min value.
|
|
||||||
row := make([]int, w)
|
|
||||||
for i, v := range data {
|
|
||||||
r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
|
|
||||||
if r < 0 {
|
|
||||||
r = 0
|
|
||||||
}
|
|
||||||
if r > height {
|
|
||||||
r = height
|
|
||||||
}
|
|
||||||
row[i] = r
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fill the character grid.
|
|
||||||
grid := make([][]rune, height+1)
|
|
||||||
for i := range grid {
|
|
||||||
grid[i] = make([]rune, w)
|
|
||||||
for j := range grid[i] {
|
|
||||||
grid[i][j] = ' '
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for x := 0; x < w; x++ {
|
|
||||||
r := row[x]
|
|
||||||
if x == 0 {
|
|
||||||
grid[r][0] = '─'
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
p := row[x-1]
|
|
||||||
switch {
|
|
||||||
case r == p:
|
|
||||||
grid[r][x] = '─'
|
|
||||||
case r < p: // value went up (row index decreased toward top)
|
|
||||||
grid[r][x] = '╭'
|
|
||||||
grid[p][x] = '╯'
|
|
||||||
for y := r + 1; y < p; y++ {
|
|
||||||
grid[y][x] = '│'
|
|
||||||
}
|
|
||||||
default: // r > p, value went down
|
|
||||||
grid[p][x] = '╮'
|
|
||||||
grid[r][x] = '╰'
|
|
||||||
for y := p + 1; y < r; y++ {
|
|
||||||
grid[y][x] = '│'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Y axis tick labels.
|
|
||||||
ticks := gpuNiceTicks(mn, mx, height/2)
|
|
||||||
tickAtRow := make(map[int]string)
|
|
||||||
labelWidth := 4
|
|
||||||
for _, t := range ticks {
|
|
||||||
r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
|
|
||||||
if r < 0 || r > height {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
s := gpuFormatTick(t)
|
|
||||||
tickAtRow[r] = s
|
|
||||||
if len(s) > labelWidth {
|
|
||||||
labelWidth = len(s)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var b strings.Builder
|
|
||||||
for r := 0; r <= height; r++ {
|
|
||||||
label := tickAtRow[r]
|
|
||||||
fmt.Fprintf(&b, "%*s", labelWidth, label)
|
|
||||||
switch {
|
|
||||||
case label != "":
|
|
||||||
b.WriteRune('┤')
|
|
||||||
case r == height:
|
|
||||||
b.WriteRune('┼')
|
|
||||||
default:
|
|
||||||
b.WriteRune('│')
|
|
||||||
}
|
|
||||||
b.WriteString(color)
|
|
||||||
b.WriteString(string(grid[r]))
|
|
||||||
b.WriteString(ansiReset)
|
|
||||||
b.WriteRune('\n')
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bottom axis.
|
|
||||||
b.WriteString(strings.Repeat(" ", labelWidth))
|
|
||||||
b.WriteRune('└')
|
|
||||||
b.WriteString(strings.Repeat("─", w))
|
|
||||||
b.WriteRune('\n')
|
|
||||||
|
|
||||||
// Caption centered under the chart.
|
|
||||||
if caption != "" {
|
|
||||||
total := labelWidth + 1 + w
|
|
||||||
if pad := (total - len(caption)) / 2; pad > 0 {
|
|
||||||
b.WriteString(strings.Repeat(" ", pad))
|
|
||||||
}
|
|
||||||
b.WriteString(caption)
|
|
||||||
b.WriteRune('\n')
|
|
||||||
}
|
|
||||||
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
|
|
||||||
v := make([]float64, len(rows))
|
|
||||||
for i, r := range rows {
|
|
||||||
v[i] = fn(r)
|
|
||||||
}
|
|
||||||
return v
|
|
||||||
}
|
|
||||||
|
|
||||||
// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
|
|
||||||
func gpuDownsample(vals []float64, w int) []float64 {
|
|
||||||
n := len(vals)
|
|
||||||
if n == 0 {
|
|
||||||
return make([]float64, w)
|
|
||||||
}
|
|
||||||
result := make([]float64, w)
|
|
||||||
if n >= w {
|
|
||||||
counts := make([]int, w)
|
|
||||||
for i, v := range vals {
|
|
||||||
bucket := i * w / n
|
|
||||||
if bucket >= w {
|
|
||||||
bucket = w - 1
|
|
||||||
}
|
|
||||||
result[bucket] += v
|
|
||||||
counts[bucket]++
|
|
||||||
}
|
|
||||||
for i := range result {
|
|
||||||
if counts[i] > 0 {
|
|
||||||
result[i] /= float64(counts[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Nearest-neighbour upsample.
|
|
||||||
for i := range result {
|
|
||||||
src := i * (n - 1) / (w - 1)
|
|
||||||
if src >= n {
|
|
||||||
src = n - 1
|
|
||||||
}
|
|
||||||
result[i] = vals[src]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
func gpuMinMax(vals []float64) (float64, float64) {
|
func gpuMinMax(vals []float64) (float64, float64) {
|
||||||
if len(vals) == 0 {
|
if len(vals) == 0 {
|
||||||
return 0, 1
|
return 0, 1
|
||||||
@@ -641,3 +493,57 @@ func gpuFormatTick(v float64) string {
|
|||||||
}
|
}
|
||||||
return strconv.FormatFloat(v, 'f', 1, 64)
|
return strconv.FormatFloat(v, 'f', 1, 64)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// gpuMetricStagePalette lists the fill colors used for stage spans.
// WriteGPUMetricsHTML assigns them by span position and wraps around with a
// modulo when there are more spans than colors.
var gpuMetricStagePalette = []string{
|
||||||
|
"#d95c5c",
|
||||||
|
"#2185d0",
|
||||||
|
"#21ba45",
|
||||||
|
"#f2c037",
|
||||||
|
"#6435c9",
|
||||||
|
"#00b5ad",
|
||||||
|
"#a5673f",
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildGPUMetricStageSpans collapses rows into one span per run of
// consecutive rows that share the same (trimmed) stage name.
//
// Rows are processed in slice order. A row whose stage name differs from the
// previous span's name opens a new span, so a name that reappears later
// yields a second span rather than extending the first. Returns nil when
// rows is empty.
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
|
||||||
|
var spans []gpuMetricStageSpan
|
||||||
|
for _, row := range rows {
|
||||||
|
name := strings.TrimSpace(row.Stage)
|
||||||
|
// Rows without a stage are grouped under the placeholder name "run".
if name == "" {
|
||||||
|
name = "run"
|
||||||
|
}
|
||||||
|
start := row.StageStartSec
|
||||||
|
end := row.StageEndSec
|
||||||
|
// No usable stage bounds on the row: use the sample time for both ends.
if end <= start {
|
||||||
|
start = row.ElapsedSec
|
||||||
|
end = row.ElapsedSec
|
||||||
|
}
|
||||||
|
// First row, or the stage name changed: open a new span.
if len(spans) == 0 || spans[len(spans)-1].Name != name {
|
||||||
|
spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Same stage as the current span: widen it to cover this row.
if start < spans[len(spans)-1].Start {
|
||||||
|
spans[len(spans)-1].Start = start
|
||||||
|
}
|
||||||
|
if end > spans[len(spans)-1].End {
|
||||||
|
spans[len(spans)-1].End = end
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Give zero-length (or inverted) spans a width of 1.
for i := range spans {
|
||||||
|
if spans[i].End <= spans[i].Start {
|
||||||
|
spans[i].End = spans[i].Start + 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return spans
|
||||||
|
}
|
||||||
|
|
||||||
|
// gpuHTMLReplacer rewrites the five characters that are significant in
// HTML/SVG text and attribute values into entity references.
// strings.Replacer scans the input once and never re-scans its own output,
// so the "&" introduced by one replacement is not escaped a second time.
var gpuHTMLReplacer = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	`"`, "&quot;",
	"'", "&#39;",
)

// gpuHTMLEscape returns s with &, <, >, " and ' replaced by HTML entities so
// that stage names can be embedded safely in the generated HTML and SVG.
func gpuHTMLEscape(s string) string {
	return gpuHTMLReplacer.Replace(s)
}
|
||||||
|
|||||||
65
audit/internal/platform/gpu_metrics_test.go
Normal file
65
audit/internal/platform/gpu_metrics_test.go
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "gpu-metrics.csv")
|
||||||
|
rows := []GPUMetricRow{
|
||||||
|
{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
|
||||||
|
}
|
||||||
|
if err := WriteGPUMetricsCSV(path, rows); err != nil {
|
||||||
|
t.Fatalf("WriteGPUMetricsCSV: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"stage,elapsed_sec,gpu_index",
|
||||||
|
`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(text, needle) {
|
||||||
|
t.Fatalf("csv missing %q\n%s", needle, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "gpu-metrics.html")
|
||||||
|
rows := []GPUMetricRow{
|
||||||
|
{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
|
||||||
|
{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
|
||||||
|
{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
|
||||||
|
{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
|
||||||
|
}
|
||||||
|
if err := WriteGPUMetricsHTML(path, rows); err != nil {
|
||||||
|
t.Fatalf("WriteGPUMetricsHTML: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"stage-legend",
|
||||||
|
"baseline",
|
||||||
|
"steady-fp16",
|
||||||
|
"GPU Stress Test Metrics",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(text, needle) {
|
||||||
|
t.Fatalf("html missing %q\n%s", needle, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -11,12 +11,10 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const installToRAMDir = "/dev/shm/bee-live"
|
||||||
|
|
||||||
func (s *System) IsLiveMediaInRAM() bool {
|
// IsLiveMediaInRAM (new side of the diff) returns the InRAM field of
// LiveMediaRAMState(); the direct mount-type probe shown on the old side is
// removed.
func (s *System) IsLiveMediaInRAM() bool {
|
||||||
fsType := mountFSType("/run/live/medium")
|
return s.LiveMediaRAMState().InRAM
|
||||||
if fsType == "" {
|
|
||||||
return toramActive()
|
|
||||||
}
|
|
||||||
return strings.EqualFold(fsType, "tmpfs")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) LiveBootSource() LiveBootSource {
|
func (s *System) LiveBootSource() LiveBootSource {
|
||||||
@@ -48,14 +46,95 @@ func (s *System) LiveBootSource() LiveBootSource {
|
|||||||
return status
|
return status
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
// LiveMediaRAMState gathers the live inputs — the boot source, the toram
// flag, the squashfs files on the live medium and those under
// installToRAMDir — and passes them to evaluateLiveMediaRAMState.
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
|
||||||
|
return evaluateLiveMediaRAMState(
|
||||||
|
s.LiveBootSource(),
|
||||||
|
toramActive(),
|
||||||
|
globPaths("/run/live/medium/live/*.squashfs"),
|
||||||
|
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// evaluateLiveMediaRAMState classifies the copy-to-RAM situation from
// already-collected inputs; it performs no I/O itself.
//
// status is the detected boot source, toram the state of the toram boot
// flag, sourceSquashfs the squashfs paths on the live medium, and
// copiedSquashfs the squashfs paths found in the RAM copy directory.
//
// Resulting State / Status pairs:
//   - "in_ram" / "ok": status.InRAM is set; CanStartCopy stays false.
//   - "partial" / "partial": a RAM copy exists (complete or not) but the
//     system is not running from RAM.
//   - "toram_failed" / "failed": toram is set, no copy exists, not in RAM.
//   - "not_in_ram" / "warning": none of the above.
//
// Every state except "in_ram" sets CanStartCopy.
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
|
||||||
|
state := LiveMediaRAMState{
|
||||||
|
LiveBootSource: status,
|
||||||
|
ToramActive: toram,
|
||||||
|
CopyPresent: len(copiedSquashfs) > 0,
|
||||||
|
}
|
||||||
|
if status.InRAM {
|
||||||
|
state.State = "in_ram"
|
||||||
|
state.Status = "ok"
|
||||||
|
state.CopyComplete = true
|
||||||
|
state.Message = "Running from RAM — installation media can be safely disconnected."
|
||||||
|
return state
|
||||||
|
}
|
||||||
|
|
||||||
|
// The copy counts as complete only when the medium has at least one squashfs
// file and every one of them (compared by base name) is present in the copy.
expected := pathBaseSet(sourceSquashfs)
|
||||||
|
copied := pathBaseSet(copiedSquashfs)
|
||||||
|
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
|
||||||
|
|
||||||
|
// Cases are ordered by precedence: an existing copy outranks the toram flag.
switch {
|
||||||
|
case state.CopyComplete:
|
||||||
|
state.State = "partial"
|
||||||
|
state.Status = "partial"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
|
||||||
|
case state.CopyPresent:
|
||||||
|
state.State = "partial"
|
||||||
|
state.Status = "partial"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
|
||||||
|
case toram:
|
||||||
|
state.State = "toram_failed"
|
||||||
|
state.Status = "failed"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
|
||||||
|
default:
|
||||||
|
state.State = "not_in_ram"
|
||||||
|
state.Status = "warning"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
|
||||||
|
}
|
||||||
|
return state
|
||||||
|
}
|
||||||
|
|
||||||
|
// globPaths returns the paths matching pattern via filepath.Glob.
// The error returned by filepath.Glob is discarded, so callers cannot tell a
// malformed pattern from "no matches".
func globPaths(pattern string) []string {
|
||||||
|
matches, _ := filepath.Glob(pattern)
|
||||||
|
return matches
|
||||||
|
}
|
||||||
|
|
||||||
|
// pathBaseSet returns the set of base names (filepath.Base, with surrounding
// whitespace trimmed) of paths. Names that are empty after trimming are
// skipped. The result is never nil.
func pathBaseSet(paths []string) map[string]struct{} {
|
||||||
|
out := make(map[string]struct{}, len(paths))
|
||||||
|
for _, path := range paths {
|
||||||
|
base := strings.TrimSpace(filepath.Base(path))
|
||||||
|
if base != "" {
|
||||||
|
out[base] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// setContainsAll reports whether every key of want is also a key of have.
// An empty want yields false, not the vacuous true.
func setContainsAll(have, want map[string]struct{}) bool {
|
||||||
|
if len(want) == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for name := range want {
|
||||||
|
if _, ok := have[name]; !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
|
||||||
log := func(msg string) {
|
log := func(msg string) {
|
||||||
if logFunc != nil {
|
if logFunc != nil {
|
||||||
logFunc(msg)
|
logFunc(msg)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.IsLiveMediaInRAM() {
|
state := s.LiveMediaRAMState()
|
||||||
|
if state.InRAM {
|
||||||
log("Already running from RAM — installation media can be safely disconnected.")
|
log("Already running from RAM — installation media can be safely disconnected.")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -80,10 +159,21 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
|||||||
humanBytes(needed+headroom), humanBytes(free))
|
humanBytes(needed+headroom), humanBytes(free))
|
||||||
}
|
}
|
||||||
|
|
||||||
dstDir := "/dev/shm/bee-live"
|
dstDir := installToRAMDir
|
||||||
|
if state.CopyPresent {
|
||||||
|
log("Removing stale partial RAM copy before retry...")
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(dstDir)
|
||||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||||
}
|
}
|
||||||
|
defer func() {
|
||||||
|
if retErr == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(dstDir)
|
||||||
|
log("Removed incomplete RAM copy.")
|
||||||
|
}()
|
||||||
|
|
||||||
for _, sf := range squashfsFiles {
|
for _, sf := range squashfsFiles {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
|
|||||||
@@ -58,3 +58,46 @@ func TestDescribeLiveBootSource(t *testing.T) {
|
|||||||
t.Fatalf("got %q want /run/live/medium", got)
|
t.Fatalf("got %q want /run/live/medium", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
t.Run("in_ram", func(t *testing.T) {
|
||||||
|
state := evaluateLiveMediaRAMState(
|
||||||
|
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
|
||||||
|
false,
|
||||||
|
nil,
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
|
||||||
|
t.Fatalf("state=%+v", state)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("partial_copy_after_cancel", func(t *testing.T) {
|
||||||
|
state := evaluateLiveMediaRAMState(
|
||||||
|
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||||
|
false,
|
||||||
|
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
|
||||||
|
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
|
||||||
|
)
|
||||||
|
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
|
||||||
|
t.Fatalf("state=%+v", state)
|
||||||
|
}
|
||||||
|
if state.CopyComplete {
|
||||||
|
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("toram_failed", func(t *testing.T) {
|
||||||
|
state := evaluateLiveMediaRAMState(
|
||||||
|
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||||
|
true,
|
||||||
|
nil,
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
|
||||||
|
t.Fatalf("state=%+v", state)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|||||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"bee-john-gpu-stress",
|
"bee-john-gpu-stress",
|
||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
|
|||||||
}
|
}
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||||
|
|
||||||
// Pack tar.gz
|
return runDir, nil
|
||||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
|
||||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
|
||||||
return "", fmt.Errorf("pack archive: %w", err)
|
|
||||||
}
|
|
||||||
_ = os.RemoveAll(runDir)
|
|
||||||
return archivePath, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// collectPhase samples live metrics every second until ctx is done.
|
// collectPhase samples live metrics every second until ctx is done.
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -27,6 +28,8 @@ var runtimeTrackedServices = []string{
|
|||||||
"bee-audit",
|
"bee-audit",
|
||||||
"bee-web",
|
"bee-web",
|
||||||
"bee-sshsetup",
|
"bee-sshsetup",
|
||||||
|
"nvidia-dcgm",
|
||||||
|
"nvidia-fabricmanager",
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
@@ -114,6 +117,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
}
|
}
|
||||||
|
|
||||||
s.collectGPURuntimeHealth(vendor, &health)
|
s.collectGPURuntimeHealth(vendor, &health)
|
||||||
|
s.collectToRAMHealth(&health)
|
||||||
|
s.collectUSBExportHealth(&health)
|
||||||
|
|
||||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||||
health.Status = "PARTIAL"
|
health.Status = "PARTIAL"
|
||||||
@@ -168,6 +173,99 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
|||||||
return ToolStatus{Name: display}
|
return ToolStatus{Name: display}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
|
||||||
|
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
|
||||||
|
// incomplete RAM copy exists but runtime still depends on the boot medium,
|
||||||
|
// "failed" = toram was requested but medium is not in RAM.
|
||||||
|
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||||
|
state := s.LiveMediaRAMState()
|
||||||
|
health.ToRAMStatus = state.Status
|
||||||
|
switch state.Status {
|
||||||
|
case "ok":
|
||||||
|
return
|
||||||
|
case "failed":
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "toram_copy_failed",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: state.Message,
|
||||||
|
})
|
||||||
|
case "partial":
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "toram_copy_partial",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: state.Message,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
|
||||||
|
// suitable for log export. Sets USBExportPath to the first match found.
|
||||||
|
func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
|
||||||
|
health.USBExportPath = findUSBExportMount()
|
||||||
|
}
|
||||||
|
|
||||||
|
// findUSBExportMount returns the mount point of the first writable USB filesystem
|
||||||
|
// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs) whose backing block device
|
||||||
|
// has USB transport. Returns "" if none found.
|
||||||
|
func findUSBExportMount() string {
|
||||||
|
f, err := os.Open("/proc/mounts")
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
// fs types that are expected on USB export drives
|
||||||
|
exportFSTypes := map[string]bool{
|
||||||
|
"vfat": true,
|
||||||
|
"exfat": true,
|
||||||
|
"ext2": true,
|
||||||
|
"ext3": true,
|
||||||
|
"ext4": true,
|
||||||
|
"ntfs": true,
|
||||||
|
"ntfs3": true,
|
||||||
|
"fuseblk": true,
|
||||||
|
}
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
for scanner.Scan() {
|
||||||
|
// fields: device mountpoint fstype options dump pass
|
||||||
|
fields := strings.Fields(scanner.Text())
|
||||||
|
if len(fields) < 4 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||||
|
if !exportFSTypes[strings.ToLower(fsType)] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Skip read-only mounts
|
||||||
|
opts := strings.Split(options, ",")
|
||||||
|
readOnly := false
|
||||||
|
for _, o := range opts {
|
||||||
|
if strings.TrimSpace(o) == "ro" {
|
||||||
|
readOnly = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if readOnly {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Check USB transport via lsblk on the device (or its parent disk for partitions).
|
||||||
|
if !strings.HasPrefix(device, "/dev/") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
checkDev := device
|
||||||
|
// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
|
||||||
|
// Strip trailing partition digits to get the parent disk name.
|
||||||
|
if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
|
||||||
|
checkDev = trimmed
|
||||||
|
}
|
||||||
|
if blockDeviceTransport(checkDev) == "usb" {
|
||||||
|
return mountPoint
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||||
lsmodText := commandText("lsmod")
|
lsmodText := commandText("lsmod")
|
||||||
|
|
||||||
|
|||||||
@@ -108,15 +108,15 @@ type nvidiaGPUHealth struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type nvidiaGPUStatusFile struct {
|
type nvidiaGPUStatusFile struct {
|
||||||
Index int
|
Index int
|
||||||
Name string
|
Name string
|
||||||
RunStatus string
|
RunStatus string
|
||||||
Reason string
|
Reason string
|
||||||
Health string
|
Health string
|
||||||
HealthRaw string
|
HealthRaw string
|
||||||
Observed bool
|
Observed bool
|
||||||
Selected bool
|
Selected bool
|
||||||
FailingJob string
|
FailingJob string
|
||||||
}
|
}
|
||||||
|
|
||||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||||
@@ -384,14 +384,28 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
|||||||
), logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
var (
|
||||||
if err != nil {
|
profCmd []string
|
||||||
return "", err
|
profEnv []string
|
||||||
|
)
|
||||||
|
if staggerSec > 0 && len(selected) > 1 {
|
||||||
|
profCmd = []string{
|
||||||
|
"bee-dcgmproftester-staggered",
|
||||||
|
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||||
|
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||||
|
"--devices", joinIndexList(selected),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
@@ -399,7 +413,7 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
|||||||
satJob{
|
satJob{
|
||||||
name: "03-dcgmproftester.log",
|
name: "03-dcgmproftester.log",
|
||||||
cmd: profCmd,
|
cmd: profCmd,
|
||||||
env: nvidiaVisibleDevicesEnv(selected),
|
env: profEnv,
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
@@ -538,9 +552,13 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
|
|||||||
if passes <= 0 {
|
if passes <= 0 {
|
||||||
passes = 1
|
passes = 1
|
||||||
}
|
}
|
||||||
|
// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
|
||||||
|
// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
|
||||||
|
// controller can cause memtester to spin forever on a single subtest.
|
||||||
|
timeoutSec := sizeMB*passes*150/100 + 120
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
@@ -648,11 +666,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
|||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
|
return runDir, nil
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type satJob struct {
|
type satJob struct {
|
||||||
@@ -838,11 +852,7 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
return runDir, nil
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||||
@@ -905,7 +915,7 @@ func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPU
|
|||||||
entry.Health = "UNKNOWN"
|
entry.Health = "UNKNOWN"
|
||||||
}
|
}
|
||||||
if entry.Name == "" {
|
if entry.Name == "" {
|
||||||
entry.Name = "unknown"
|
entry.Name = "Unknown GPU"
|
||||||
}
|
}
|
||||||
var body strings.Builder
|
var body strings.Builder
|
||||||
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||||
@@ -1376,8 +1386,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
|||||||
if len(metricRows) > 0 {
|
if len(metricRows) > 0 {
|
||||||
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
||||||
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
||||||
chart := RenderGPUTerminalChart(metricRows)
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return out, err
|
return out, err
|
||||||
|
|||||||
@@ -223,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
return runDir, nil
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||||
@@ -430,6 +426,101 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
|||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||||
|
// Returns the average duty cycle across all exposed PWM controls.
|
||||||
|
func sampleFanDutyCyclePct() (float64, bool) {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return parseFanDutyCyclePctSensorsJSON(out)
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||||
|
var doc map[string]map[string]any
|
||||||
|
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
var samples []float64
|
||||||
|
for _, features := range doc {
|
||||||
|
for name, feature := range features {
|
||||||
|
if strings.EqualFold(name, "Adapter") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
featureMap, ok := feature.(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if duty, ok := firstFanDutyValue(name, featureMap); ok {
|
||||||
|
samples = append(samples, duty)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
|
||||||
|
featureName = strings.ToLower(strings.TrimSpace(featureName))
|
||||||
|
if strings.Contains(featureName, "enable") || strings.Contains(featureName, "mode") || strings.Contains(featureName, "alarm") {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
if strings.Contains(featureName, "pwm") {
|
||||||
|
for _, key := range []string{"input", "value", "current"} {
|
||||||
|
if value, ok := feature[key]; ok {
|
||||||
|
if duty, parsed := parseFanDutyValue(value); parsed {
|
||||||
|
return duty, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
keys := make([]string, 0, len(feature))
|
||||||
|
for key := range feature {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
for _, key := range keys {
|
||||||
|
lower := strings.ToLower(key)
|
||||||
|
if !strings.Contains(lower, "pwm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.Contains(lower, "enable") || strings.Contains(lower, "mode") || strings.Contains(lower, "alarm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if duty, parsed := parseFanDutyValue(feature[key]); parsed {
|
||||||
|
return duty, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseFanDutyValue(value any) (float64, bool) {
|
||||||
|
switch v := value.(type) {
|
||||||
|
case float64:
|
||||||
|
return normalizePWMAsDutyPct(v)
|
||||||
|
case string:
|
||||||
|
if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil {
|
||||||
|
return normalizePWMAsDutyPct(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizePWMAsDutyPct maps a raw PWM reading onto a 0-100 percentage.
// Values in [0, 100] are returned unchanged, values in (100, 255] are scaled
// as raw/255*100, and anything else (negative, above 255, NaN) is rejected
// with (0, false).
func normalizePWMAsDutyPct(raw float64) (float64, bool) {
	switch {
	case raw < 0:
		return 0, false
	case raw <= 100:
		// Taken as already being a percentage.
		return raw, true
	case raw <= 255:
		// Taken as a 0-255 register value.
		return raw / 255.0 * 100.0, true
	default:
		return 0, false
	}
}
|
||||||
|
|
||||||
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||||
keys := make([]string, 0, len(feature))
|
keys := make([]string, 0, len(feature))
|
||||||
for key := range feature {
|
for key := range feature {
|
||||||
|
|||||||
@@ -29,6 +29,27 @@ func TestFirstFanInputValue(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||||
|
raw := []byte(`{
|
||||||
|
"chip0": {
|
||||||
|
"fan1": {"input": 9000},
|
||||||
|
"pwm1": {"input": 128},
|
||||||
|
"pwm1_enable": {"input": 1}
|
||||||
|
},
|
||||||
|
"chip1": {
|
||||||
|
"pwm2": {"input": 64}
|
||||||
|
}
|
||||||
|
}`)
|
||||||
|
|
||||||
|
got, ok := parseFanDutyCyclePctSensorsJSON(raw)
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected duty cycle telemetry to be parsed")
|
||||||
|
}
|
||||||
|
if got < 57 || got > 58 {
|
||||||
|
t.Fatalf("got=%v want ~57.1", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseDCMIPowerReading(t *testing.T) {
|
func TestParseDCMIPowerReading(t *testing.T) {
|
||||||
raw := `
|
raw := `
|
||||||
Instantaneous power reading: 512 Watts
|
Instantaneous power reading: 512 Watts
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
|
|||||||
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
||||||
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
||||||
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
||||||
|
{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
|
||||||
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
||||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||||
|
|||||||
@@ -9,6 +9,17 @@ type LiveBootSource struct {
|
|||||||
Device string `json:"device,omitempty"`
|
Device string `json:"device,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LiveMediaRAMState describes the state of the RAM copy of the live boot
// medium. It embeds LiveBootSource and is serialized to JSON.
//
// Status is the value collectToRAMHealth copies into
// RuntimeHealth.ToRAMStatus; that code documents "ok", "warning", "partial"
// and "failed". Message is the text used as the issue description when Status
// is "failed" or "partial".
//
// NOTE(review): the meaning of State and of the ToramActive, CopyPresent,
// CopyComplete and CanStartCopy flags is not established by the code visible
// here; confirm against the function that fills this struct.
type LiveMediaRAMState struct {
	LiveBootSource
	State        string `json:"state"`
	Status       string `json:"status"`
	ToramActive  bool   `json:"toram_active,omitempty"`
	CopyPresent  bool   `json:"copy_present,omitempty"`
	CopyComplete bool   `json:"copy_complete,omitempty"`
	CanStartCopy bool   `json:"can_start_copy,omitempty"`
	Message      string `json:"message,omitempty"`
}
|
||||||
|
|
||||||
type InterfaceInfo struct {
|
type InterfaceInfo struct {
|
||||||
Name string
|
Name string
|
||||||
State string
|
State string
|
||||||
@@ -70,6 +81,7 @@ type NvidiaStressOptions struct {
|
|||||||
Loader string
|
Loader string
|
||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
|
StaggerSeconds int
|
||||||
}
|
}
|
||||||
|
|
||||||
func New() *System {
|
func New() *System {
|
||||||
|
|||||||
@@ -15,13 +15,17 @@ type HardwareIngestRequest struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type RuntimeHealth struct {
|
type RuntimeHealth struct {
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
CheckedAt string `json:"checked_at"`
|
CheckedAt string `json:"checked_at"`
|
||||||
ExportDir string `json:"export_dir,omitempty"`
|
ExportDir string `json:"export_dir,omitempty"`
|
||||||
DriverReady bool `json:"driver_ready,omitempty"`
|
DriverReady bool `json:"driver_ready,omitempty"`
|
||||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||||
NetworkStatus string `json:"network_status,omitempty"`
|
NetworkStatus string `json:"network_status,omitempty"`
|
||||||
|
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
|
||||||
|
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||||
|
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||||
|
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||||
@@ -183,6 +187,13 @@ type HardwarePCIeDevice struct {
|
|||||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||||
|
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||||
|
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||||
|
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||||
|
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||||
|
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||||
|
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||||
|
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"syscall"
|
"syscall"
|
||||||
@@ -35,6 +36,16 @@ var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, err
|
|||||||
return a.ListNvidiaGPUStatuses()
|
return a.ListNvidiaGPUStatuses()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Default priorities per task category, as handed out by defaultTaskPriority.
// NOTE(review): whether a larger value is scheduled earlier or later is
// decided by the task queue, which is not visible here; confirm before
// relying on the ordering.
const (
	taskPriorityBenchmark      = 10
	taskPriorityBurn           = 20
	taskPriorityValidateStress = 30
	taskPriorityValidate       = 40
	taskPriorityAudit          = 50
	taskPriorityInstallToRAM   = 60
	taskPriorityInstall        = 70
)
|
||||||
|
|
||||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
var jobCounter atomic.Uint64
|
var jobCounter atomic.Uint64
|
||||||
@@ -99,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
|||||||
|
|
||||||
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||||
switch strings.TrimSpace(target) {
|
switch strings.TrimSpace(target) {
|
||||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
|
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
|
||||||
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
|
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
|
||||||
"nvidia-bandwidth", "nvidia-stress":
|
"nvidia-bandwidth", "nvidia-stress":
|
||||||
return true
|
return true
|
||||||
@@ -108,6 +119,30 @@ func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func defaultTaskPriority(target string, params taskParams) int {
|
||||||
|
switch strings.TrimSpace(target) {
|
||||||
|
case "install":
|
||||||
|
return taskPriorityInstall
|
||||||
|
case "install-to-ram":
|
||||||
|
return taskPriorityInstallToRAM
|
||||||
|
case "audit":
|
||||||
|
return taskPriorityAudit
|
||||||
|
case "nvidia-bench-perf", "nvidia-bench-power":
|
||||||
|
return taskPriorityBenchmark
|
||||||
|
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
||||||
|
return taskPriorityBurn
|
||||||
|
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
|
"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
|
||||||
|
"amd", "amd-mem", "amd-bandwidth":
|
||||||
|
if params.StressMode {
|
||||||
|
return taskPriorityValidateStress
|
||||||
|
}
|
||||||
|
return taskPriorityValidate
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||||
if len(gpus) == 0 {
|
if len(gpus) == 0 {
|
||||||
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||||
@@ -209,6 +244,14 @@ func joinTaskIndices(indices []int) string {
|
|||||||
return strings.Join(parts, ",")
|
return strings.Join(parts, ",")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// formatGPUIndexList renders GPU indices as a comma-separated decimal list
// with no spaces, e.g. [0 1 12] -> "0,1,12". An empty or nil slice yields "".
func formatGPUIndexList(indices []int) string {
	var list strings.Builder
	for pos, gpu := range indices {
		if pos > 0 {
			list.WriteByte(',')
		}
		list.WriteString(strconv.Itoa(gpu))
	}
	return list.String()
}
|
||||||
|
|
||||||
func formatSplitTaskName(baseName, selectionLabel string) string {
|
func formatSplitTaskName(baseName, selectionLabel string) string {
|
||||||
baseName = strings.TrimSpace(baseName)
|
baseName = strings.TrimSpace(baseName)
|
||||||
selectionLabel = strings.TrimSpace(selectionLabel)
|
selectionLabel = strings.TrimSpace(selectionLabel)
|
||||||
@@ -449,6 +492,7 @@ func (h *handler) handleAPIAuditRun(w http.ResponseWriter, _ *http.Request) {
|
|||||||
ID: newJobID("audit"),
|
ID: newJobID("audit"),
|
||||||
Name: "Audit",
|
Name: "Audit",
|
||||||
Target: "audit",
|
Target: "audit",
|
||||||
|
Priority: defaultTaskPriority("audit", taskParams{}),
|
||||||
Status: TaskPending,
|
Status: TaskPending,
|
||||||
CreatedAt: time.Now(),
|
CreatedAt: time.Now(),
|
||||||
}
|
}
|
||||||
@@ -487,6 +531,8 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
StressMode bool `json:"stress_mode"`
|
StressMode bool `json:"stress_mode"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
|
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus"`
|
||||||
Loader string `json:"loader"`
|
Loader string `json:"loader"`
|
||||||
Profile string `json:"profile"`
|
Profile string `json:"profile"`
|
||||||
DisplayName string `json:"display_name"`
|
DisplayName string `json:"display_name"`
|
||||||
@@ -508,12 +554,142 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
StressMode: body.StressMode,
|
StressMode: body.StressMode,
|
||||||
GPUIndices: body.GPUIndices,
|
GPUIndices: body.GPUIndices,
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
StaggerGPUStart: body.StaggerGPUStart,
|
||||||
|
ParallelGPUs: body.ParallelGPUs,
|
||||||
Loader: body.Loader,
|
Loader: body.Loader,
|
||||||
BurnProfile: body.Profile,
|
BurnProfile: body.Profile,
|
||||||
DisplayName: body.DisplayName,
|
DisplayName: body.DisplayName,
|
||||||
PlatformComponents: body.PlatformComponents,
|
PlatformComponents: body.PlatformComponents,
|
||||||
}
|
}
|
||||||
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, t := range tasks {
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
}
|
||||||
|
writeTaskRunResponse(w, tasks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
|
||||||
|
return func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var body struct {
|
||||||
|
Profile string `json:"profile"`
|
||||||
|
SizeMB int `json:"size_mb"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
|
RunNCCL *bool `json:"run_nccl"`
|
||||||
|
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||||
|
RampUp *bool `json:"ramp_up"`
|
||||||
|
DisplayName string `json:"display_name"`
|
||||||
|
}
|
||||||
|
if r.Body != nil {
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
runNCCL := true
|
||||||
|
if body.RunNCCL != nil {
|
||||||
|
runNCCL = *body.RunNCCL
|
||||||
|
}
|
||||||
|
parallelGPUs := false
|
||||||
|
if body.ParallelGPUs != nil {
|
||||||
|
parallelGPUs = *body.ParallelGPUs
|
||||||
|
}
|
||||||
|
rampUp := false
|
||||||
|
if body.RampUp != nil {
|
||||||
|
rampUp = *body.RampUp
|
||||||
|
}
|
||||||
|
// Build a descriptive base name that includes profile and mode so the task
|
||||||
|
// list is self-explanatory without opening individual task detail pages.
|
||||||
|
profile := strings.TrimSpace(body.Profile)
|
||||||
|
if profile == "" {
|
||||||
|
profile = "standard"
|
||||||
|
}
|
||||||
|
name := taskDisplayName(target, "", "")
|
||||||
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
|
name = body.DisplayName
|
||||||
|
}
|
||||||
|
// Append profile tag.
|
||||||
|
name = fmt.Sprintf("%s · %s", name, profile)
|
||||||
|
|
||||||
|
if target == "nvidia-bench-power" && parallelGPUs {
|
||||||
|
writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if rampUp && len(body.GPUIndices) > 1 {
|
||||||
|
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
|
||||||
|
// in Phase 2 (one additional GPU per step). A single task with all
|
||||||
|
// selected GPUs is sufficient — spawning N tasks with growing subsets
|
||||||
|
// would repeat all earlier steps redundantly.
|
||||||
|
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(resolved) < 2 {
|
||||||
|
// Fall through to normal single-task path.
|
||||||
|
rampUp = false
|
||||||
|
} else {
|
||||||
|
now := time.Now()
|
||||||
|
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||||
|
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID("bee-bench-nvidia"),
|
||||||
|
Name: taskName,
|
||||||
|
Target: target,
|
||||||
|
Priority: defaultTaskPriority(target, taskParams{}),
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: now,
|
||||||
|
params: taskParams{
|
||||||
|
GPUIndices: append([]int(nil), resolved...),
|
||||||
|
SizeMB: body.SizeMB,
|
||||||
|
BenchmarkProfile: body.Profile,
|
||||||
|
RunNCCL: runNCCL,
|
||||||
|
ParallelGPUs: true,
|
||||||
|
RampTotal: len(resolved),
|
||||||
|
RampRunID: rampRunID,
|
||||||
|
DisplayName: taskName,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
writeTaskRunResponse(w, []*Task{t})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// For non-ramp tasks append mode tag.
|
||||||
|
if parallelGPUs {
|
||||||
|
name = fmt.Sprintf("%s · parallel", name)
|
||||||
|
} else {
|
||||||
|
name = fmt.Sprintf("%s · sequential", name)
|
||||||
|
}
|
||||||
|
|
||||||
|
params := taskParams{
|
||||||
|
GPUIndices: body.GPUIndices,
|
||||||
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
SizeMB: body.SizeMB,
|
||||||
|
BenchmarkProfile: body.Profile,
|
||||||
|
RunNCCL: runNCCL,
|
||||||
|
ParallelGPUs: parallelGPUs,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
|
}
|
||||||
|
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusBadRequest, err.Error())
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
return
|
return
|
||||||
@@ -526,56 +702,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||||
if h.opts.App == nil {
|
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
||||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
var body struct {
|
|
||||||
Profile string `json:"profile"`
|
|
||||||
SizeMB int `json:"size_mb"`
|
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
|
||||||
RunNCCL *bool `json:"run_nccl"`
|
|
||||||
ParallelGPUs *bool `json:"parallel_gpus"`
|
|
||||||
DisplayName string `json:"display_name"`
|
|
||||||
}
|
|
||||||
if r.Body != nil {
|
|
||||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
|
||||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
runNCCL := true
|
|
||||||
if body.RunNCCL != nil {
|
|
||||||
runNCCL = *body.RunNCCL
|
|
||||||
}
|
|
||||||
parallelGPUs := false
|
|
||||||
if body.ParallelGPUs != nil {
|
|
||||||
parallelGPUs = *body.ParallelGPUs
|
|
||||||
}
|
|
||||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
|
||||||
name = body.DisplayName
|
|
||||||
}
|
|
||||||
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
|
||||||
GPUIndices: body.GPUIndices,
|
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
|
||||||
SizeMB: body.SizeMB,
|
|
||||||
BenchmarkProfile: body.Profile,
|
|
||||||
RunNCCL: runNCCL,
|
|
||||||
ParallelGPUs: parallelGPUs,
|
|
||||||
DisplayName: body.DisplayName,
|
|
||||||
}, name, h.opts.App, "benchmark-nvidia")
|
|
||||||
if err != nil {
|
|
||||||
writeError(w, http.StatusBadRequest, err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
for _, t := range tasks {
|
|
||||||
globalQueue.enqueue(t)
|
|
||||||
}
|
|
||||||
writeTaskRunResponse(w, tasks)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -950,25 +1077,62 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
|||||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
status := h.opts.App.LiveBootSource()
|
status := h.currentRAMStatus()
|
||||||
w.Header().Set("Content-Type", "application/json")
|
w.Header().Set("Content-Type", "application/json")
|
||||||
_ = json.NewEncoder(w).Encode(status)
|
_ = json.NewEncoder(w).Encode(status)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type ramStatusResponse struct {
|
||||||
|
platform.LiveMediaRAMState
|
||||||
|
InstallTaskActive bool `json:"install_task_active,omitempty"`
|
||||||
|
CopyTaskActive bool `json:"copy_task_active,omitempty"`
|
||||||
|
CanStartTask bool `json:"can_start_task,omitempty"`
|
||||||
|
BlockedReason string `json:"blocked_reason,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) currentRAMStatus() ramStatusResponse {
|
||||||
|
state := h.opts.App.LiveMediaRAMState()
|
||||||
|
resp := ramStatusResponse{LiveMediaRAMState: state}
|
||||||
|
if globalQueue.hasActiveTarget("install") {
|
||||||
|
resp.InstallTaskActive = true
|
||||||
|
resp.BlockedReason = "install to disk is already running"
|
||||||
|
return resp
|
||||||
|
}
|
||||||
|
if globalQueue.hasActiveTarget("install-to-ram") {
|
||||||
|
resp.CopyTaskActive = true
|
||||||
|
resp.BlockedReason = "install to RAM task is already pending or running"
|
||||||
|
return resp
|
||||||
|
}
|
||||||
|
if state.InRAM {
|
||||||
|
resp.BlockedReason = "system is already running from RAM"
|
||||||
|
return resp
|
||||||
|
}
|
||||||
|
resp.CanStartTask = state.CanStartCopy
|
||||||
|
if !resp.CanStartTask && resp.BlockedReason == "" {
|
||||||
|
resp.BlockedReason = state.Message
|
||||||
|
}
|
||||||
|
return resp
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
||||||
if h.opts.App == nil {
|
if h.opts.App == nil {
|
||||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if globalQueue.hasActiveTarget("install") {
|
status := h.currentRAMStatus()
|
||||||
writeError(w, http.StatusConflict, "install to disk is already running")
|
if !status.CanStartTask {
|
||||||
|
msg := strings.TrimSpace(status.BlockedReason)
|
||||||
|
if msg == "" {
|
||||||
|
msg = "install to RAM is not available"
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusConflict, msg)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
t := &Task{
|
t := &Task{
|
||||||
ID: newJobID("install-to-ram"),
|
ID: newJobID("install-to-ram"),
|
||||||
Name: "Install to RAM",
|
Name: "Install to RAM",
|
||||||
Target: "install-to-ram",
|
Target: "install-to-ram",
|
||||||
Priority: 10,
|
Priority: defaultTaskPriority("install-to-ram", taskParams{}),
|
||||||
Status: TaskPending,
|
Status: TaskPending,
|
||||||
CreatedAt: time.Now(),
|
CreatedAt: time.Now(),
|
||||||
}
|
}
|
||||||
@@ -1083,7 +1247,7 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
|
|||||||
ID: newJobID("install"),
|
ID: newJobID("install"),
|
||||||
Name: "Install to Disk",
|
Name: "Install to Disk",
|
||||||
Target: "install",
|
Target: "install",
|
||||||
Priority: 20,
|
Priority: defaultTaskPriority("install", taskParams{}),
|
||||||
Status: TaskPending,
|
Status: TaskPending,
|
||||||
CreatedAt: time.Now(),
|
CreatedAt: time.Now(),
|
||||||
params: taskParams{
|
params: taskParams{
|
||||||
@@ -1359,6 +1523,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
|
|||||||
writeJSON(w, map[string]string{"status": "rolled back"})
|
writeJSON(w, map[string]string{"status": "rolled back"})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) rollbackPendingNetworkChange() error {
|
func (h *handler) rollbackPendingNetworkChange() error {
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
pnc := h.pendingNet
|
pnc := h.pendingNet
|
||||||
@@ -1375,4 +1544,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -39,6 +39,9 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
|||||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||||
t.Fatalf("burn profile=%q want smoke", got)
|
t.Fatalf("burn profile=%q want smoke", got)
|
||||||
}
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||||
|
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||||
@@ -61,7 +64,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
@@ -75,8 +78,8 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
}
|
}
|
||||||
task := globalQueue.tasks[0]
|
task := globalQueue.tasks[0]
|
||||||
if task.Target != "nvidia-benchmark" {
|
if task.Target != "nvidia-bench-perf" {
|
||||||
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
|
t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
|
||||||
}
|
}
|
||||||
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||||
t.Fatalf("gpu indices=%v want [1 3]", got)
|
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||||
@@ -84,6 +87,9 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
if task.params.RunNCCL {
|
if task.params.RunNCCL {
|
||||||
t.Fatal("RunNCCL should reflect explicit false from request")
|
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||||
}
|
}
|
||||||
|
if task.Priority != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||||
@@ -107,7 +113,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
|||||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
@@ -133,6 +139,56 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
|||||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
}
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H100 PCIe"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 3 {
|
||||||
|
t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
for i, task := range globalQueue.tasks {
|
||||||
|
if task.Target != "nvidia-bench-power" {
|
||||||
|
t.Fatalf("task[%d] target=%q", i, task.Target)
|
||||||
|
}
|
||||||
|
if task.Priority != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||||
@@ -175,6 +231,41 @@ func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
|||||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
}
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||||
|
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
|
||||||
|
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDefaultTaskPriorityOrder(t *testing.T) {
|
||||||
|
got := []int{
|
||||||
|
defaultTaskPriority("install-to-ram", taskParams{}),
|
||||||
|
defaultTaskPriority("audit", taskParams{}),
|
||||||
|
defaultTaskPriority("cpu", taskParams{}),
|
||||||
|
defaultTaskPriority("cpu", taskParams{StressMode: true}),
|
||||||
|
defaultTaskPriority("nvidia-stress", taskParams{}),
|
||||||
|
defaultTaskPriority("nvidia-bench-perf", taskParams{}),
|
||||||
|
defaultTaskPriority("nvidia-bench-power", taskParams{}),
|
||||||
|
}
|
||||||
|
want := []int{
|
||||||
|
taskPriorityInstallToRAM,
|
||||||
|
taskPriorityAudit,
|
||||||
|
taskPriorityValidate,
|
||||||
|
taskPriorityValidateStress,
|
||||||
|
taskPriorityBurn,
|
||||||
|
taskPriorityBenchmark,
|
||||||
|
taskPriorityBenchmark,
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
|
||||||
|
t.Fatalf("priority order=%v", got)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
|
|||||||
@@ -83,6 +83,10 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Downsample to at most ~1400 points (one per pixel) before building SVG.
|
||||||
|
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||||
|
pointCount = len(times)
|
||||||
|
|
||||||
statsLabel := chartStatsLabel(datasets)
|
statsLabel := chartStatsLabel(datasets)
|
||||||
|
|
||||||
legendItems := []metricChartSeries{}
|
legendItems := []metricChartSeries{}
|
||||||
@@ -196,6 +200,19 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Downsample to at most ~1400 points before building SVG.
|
||||||
|
{
|
||||||
|
datasets := make([][]float64, len(series))
|
||||||
|
for i := range series {
|
||||||
|
datasets[i] = series[i].Values
|
||||||
|
}
|
||||||
|
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||||
|
pointCount = len(times)
|
||||||
|
for i := range series {
|
||||||
|
series[i].Values = datasets[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
scales := make([]chartScale, len(series))
|
scales := make([]chartScale, len(series))
|
||||||
for i := range series {
|
for i := range series {
|
||||||
min, max := chartSeriesBounds(series[i].Values)
|
min, max := chartSeriesBounds(series[i].Values)
|
||||||
@@ -626,6 +643,87 @@ func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end
|
|||||||
b.WriteString(`</g>` + "\n")
|
b.WriteString(`</g>` + "\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// downsampleTimeSeries reduces a time series to at most maxPts points using
// min-max bucketing.
//
// The samples are split into maxPts/2 contiguous buckets. Within each bucket
// the indices of the minimum and maximum value of a reference series are kept,
// in chronological order, so peaks and troughs survive the reduction. The
// reference is the first dataset whose length equals len(times); when there is
// none, the first and last index of each bucket are kept instead. Every dataset
// of matching length is sampled at the same indices so all series stay aligned;
// datasets of any other length are returned untouched.
//
// A budget of a single point (maxPts == 1) cannot hold a min/max pair, so in
// that case only the chronologically first index of the pair is kept, which
// upholds the "at most maxPts" guarantee.
//
// If maxPts <= 0 or len(times) <= maxPts the inputs are returned unchanged.
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
	n := len(times)
	if maxPts <= 0 || n <= maxPts {
		return times, datasets
	}

	// Two points per bucket, hence half as many buckets as the point budget.
	buckets := maxPts / 2
	perBucket := 2
	if buckets < 1 {
		buckets, perBucket = 1, 1
	}

	// Use the first dataset that has the same length as times as the reference
	// for deciding which indices to keep per bucket.
	var ref []float64
	for _, ds := range datasets {
		if len(ds) == n {
			ref = ds
			break
		}
	}

	selected := make([]int, 0, buckets*perBucket)
	bucketSize := float64(n) / float64(buckets)
	for b := 0; b < buckets; b++ {
		lo := int(math.Round(float64(b) * bucketSize))
		hi := int(math.Round(float64(b+1) * bucketSize))
		if hi > n {
			hi = n
		}
		if lo >= hi {
			continue
		}

		// Without a reference series fall back to the bucket's end points.
		first, second := lo, hi-1
		if ref != nil {
			minIdx, maxIdx := lo, lo
			for i := lo + 1; i < hi; i++ {
				if ref[i] < ref[minIdx] {
					minIdx = i
				}
				if ref[i] > ref[maxIdx] {
					maxIdx = i
				}
			}
			// Emit the pair in chronological order so time stays monotonic.
			first, second = minIdx, maxIdx
			if first > second {
				first, second = second, first
			}
		}

		selected = append(selected, first)
		if perBucket > 1 && second != first {
			selected = append(selected, second)
		}
	}

	outTimes := make([]time.Time, len(selected))
	for i, idx := range selected {
		outTimes[i] = times[idx]
	}

	outDatasets := make([][]float64, len(datasets))
	for d, ds := range datasets {
		if len(ds) != n {
			// Not parallel to times; cannot be resampled at the chosen indices.
			outDatasets[d] = ds
			continue
		}
		out := make([]float64, len(selected))
		for i, idx := range selected {
			out[i] = ds[idx]
		}
		outDatasets[d] = out
	}
	return outTimes, outDatasets
}
|
||||||
|
|
||||||
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
|
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
|
||||||
if !end.After(start) {
|
if !end.After(start) {
|
||||||
return float64(left+right) / 2
|
return float64(left+right) / 2
|
||||||
|
|||||||
@@ -232,7 +232,7 @@ func truncate(s string, max int) string {
|
|||||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||||
func isSATTarget(target string) bool {
|
func isSATTarget(target string) bool {
|
||||||
switch target {
|
switch target {
|
||||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
|
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||||
"platform-stress":
|
"platform-stress":
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -261,7 +261,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
|
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||||
|
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||||
|
|
||||||
// Tasks
|
// Tasks
|
||||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestChartLegendNumber(t *testing.T) {
|
func TestChartLegendNumber(t *testing.T) {
|
||||||
@@ -78,6 +79,16 @@ func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildRuntimeToRAMRowShowsPartialCopyWarning(t *testing.T) {
|
||||||
|
row := buildRuntimeToRAMRow(schema.RuntimeHealth{ToRAMStatus: "partial"})
|
||||||
|
if row.Status != "WARNING" {
|
||||||
|
t.Fatalf("status=%q want WARNING", row.Status)
|
||||||
|
}
|
||||||
|
if !strings.Contains(row.Issue, "Partial or staged RAM copy detected") {
|
||||||
|
t.Fatalf("issue=%q", row.Issue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||||
samples := []platform.LiveMetricSample{
|
samples := []platform.LiveMetricSample{
|
||||||
{
|
{
|
||||||
@@ -637,8 +648,11 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
|||||||
`href="/benchmark"`,
|
`href="/benchmark"`,
|
||||||
`id="benchmark-gpu-list"`,
|
`id="benchmark-gpu-list"`,
|
||||||
`/api/gpu/nvidia`,
|
`/api/gpu/nvidia`,
|
||||||
`/api/benchmark/nvidia/run`,
|
`/api/bee-bench/nvidia/perf/run`,
|
||||||
|
`/api/bee-bench/nvidia/power/run`,
|
||||||
`benchmark-run-nccl`,
|
`benchmark-run-nccl`,
|
||||||
|
`Run Performance Benchmark`,
|
||||||
|
`Run Power / Thermal Fit`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
@@ -649,7 +663,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
|||||||
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
exportDir := filepath.Join(dir, "export")
|
exportDir := filepath.Join(dir, "export")
|
||||||
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
|
||||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -691,10 +705,10 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
|||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`Benchmark Results`,
|
`Perf Results`,
|
||||||
`Composite score by saved benchmark run and GPU.`,
|
`Composite score by saved benchmark run and GPU.`,
|
||||||
`GPU #0 — NVIDIA H100 PCIe`,
|
`GPU 0`,
|
||||||
`GPU #1 — NVIDIA H100 PCIe`,
|
`GPU 1`,
|
||||||
`#1`,
|
`#1`,
|
||||||
wantTime,
|
wantTime,
|
||||||
`1176.25`,
|
`1176.25`,
|
||||||
@@ -1094,6 +1108,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
|||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
|
// Runtime Health card — LiveCD checks only
|
||||||
`Runtime Health`,
|
`Runtime Health`,
|
||||||
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||||
`Export Directory`,
|
`Export Directory`,
|
||||||
@@ -1102,16 +1117,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
|||||||
`CUDA / ROCm`,
|
`CUDA / ROCm`,
|
||||||
`Required Utilities`,
|
`Required Utilities`,
|
||||||
`Bee Services`,
|
`Bee Services`,
|
||||||
`<td>CPU</td>`,
|
|
||||||
`<td>Memory</td>`,
|
|
||||||
`<td>Storage</td>`,
|
|
||||||
`<td>GPU</td>`,
|
|
||||||
`CUDA runtime is not ready for GPU SAT.`,
|
`CUDA runtime is not ready for GPU SAT.`,
|
||||||
`Missing: nvidia-smi`,
|
`Missing: nvidia-smi`,
|
||||||
`bee-nvidia=inactive`,
|
`bee-nvidia=inactive`,
|
||||||
`cpu SAT: FAILED`,
|
// Hardware Summary card — component health badges
|
||||||
`storage SAT: FAILED`,
|
`Hardware Summary`,
|
||||||
`sat:nvidia`,
|
`>CPU<`,
|
||||||
|
`>Memory<`,
|
||||||
|
`>Storage<`,
|
||||||
|
`>GPU<`,
|
||||||
|
`>PSU<`,
|
||||||
|
`badge-warn`, // cpu Warning badge
|
||||||
|
`badge-err`, // storage Critical badge
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("dashboard missing %q: %s", needle, body)
|
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||||
|
|||||||
@@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
|||||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||||
b.WriteString(benchmarkCard)
|
b.WriteString(benchmarkCard)
|
||||||
}
|
}
|
||||||
|
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||||
|
b.WriteString(powerCard)
|
||||||
|
}
|
||||||
|
|
||||||
if len(report.Charts) > 0 {
|
if len(report.Charts) > 0 {
|
||||||
for _, chart := range report.Charts {
|
for _, chart := range report.Charts {
|
||||||
@@ -251,7 +254,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
|||||||
}
|
}
|
||||||
|
|
||||||
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||||
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
switch strings.TrimSpace(target) {
|
||||||
|
case "nvidia-bench-perf":
|
||||||
|
default:
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
resultPath := taskBenchmarkResultPath(logText)
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
@@ -263,7 +268,7 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
return renderBenchmarkResultsCardFromRuns(
|
return renderBenchmarkResultsCardFromRuns(
|
||||||
"Benchmark Results",
|
"Perf Results",
|
||||||
"Composite score for this benchmark task.",
|
"Composite score for this benchmark task.",
|
||||||
"No benchmark results were saved for this task.",
|
"No benchmark results were saved for this task.",
|
||||||
columns,
|
columns,
|
||||||
@@ -271,15 +276,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderTaskPowerResultsCard(target, logText string) string {
|
||||||
|
if strings.TrimSpace(target) != "nvidia-bench-power" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(resultPath)
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var result platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &result); err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
|
||||||
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
|
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
|
||||||
|
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
b.WriteString(`</table></div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
func taskBenchmarkResultPath(logText string) string {
|
func taskBenchmarkResultPath(logText string) string {
|
||||||
archivePath := taskArchivePathFromLog(logText)
|
archivePath := taskArchivePathFromLog(logText)
|
||||||
if archivePath == "" {
|
if archivePath == "" {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
if runDir == archivePath {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return filepath.Join(runDir, "result.json")
|
return filepath.Join(runDir, "result.json")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -32,7 +32,8 @@ const (
|
|||||||
var taskNames = map[string]string{
|
var taskNames = map[string]string{
|
||||||
"nvidia": "NVIDIA SAT",
|
"nvidia": "NVIDIA SAT",
|
||||||
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
||||||
"nvidia-benchmark": "NVIDIA Benchmark",
|
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
||||||
|
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
||||||
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
||||||
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
||||||
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
||||||
@@ -118,6 +119,7 @@ type taskParams struct {
|
|||||||
StressMode bool `json:"stress_mode,omitempty"`
|
StressMode bool `json:"stress_mode,omitempty"`
|
||||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||||
|
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
|
||||||
SizeMB int `json:"size_mb,omitempty"`
|
SizeMB int `json:"size_mb,omitempty"`
|
||||||
Passes int `json:"passes,omitempty"`
|
Passes int `json:"passes,omitempty"`
|
||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
@@ -125,6 +127,9 @@ type taskParams struct {
|
|||||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||||
@@ -151,6 +156,12 @@ type burnPreset struct {
|
|||||||
DurationSec int
|
DurationSec int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// nvidiaRampSpec is the timing plan produced by resolveNvidiaRampPlan for an
// NVIDIA burn run. All values are in seconds.
//
//   - DurationSec is the load duration handed to the burn runner (logged by
//     runTask as the "post-ramp hold").
//   - StaggerSeconds is the delay between successive GPU starts; zero means
//     no staggered ramp-up.
//   - TotalDurationSec is the overall runtime including the ramp.
type nvidiaRampSpec struct {
	DurationSec, StaggerSeconds, TotalDurationSec int
}
|
||||||
|
|
||||||
func resolveBurnPreset(profile string) burnPreset {
|
func resolveBurnPreset(profile string) burnPreset {
|
||||||
switch profile {
|
switch profile {
|
||||||
case "overnight":
|
case "overnight":
|
||||||
@@ -162,6 +173,45 @@ func resolveBurnPreset(profile string) burnPreset {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolveNvidiaRampPlan(profile string, enabled bool, selected []int) (nvidiaRampSpec, error) {
|
||||||
|
base := resolveBurnPreset(profile).DurationSec
|
||||||
|
plan := nvidiaRampSpec{
|
||||||
|
DurationSec: base,
|
||||||
|
TotalDurationSec: base,
|
||||||
|
}
|
||||||
|
if !enabled {
|
||||||
|
return plan, nil
|
||||||
|
}
|
||||||
|
count := len(selected)
|
||||||
|
if count == 0 {
|
||||||
|
return nvidiaRampSpec{}, fmt.Errorf("staggered NVIDIA burn requires explicit GPU selection")
|
||||||
|
}
|
||||||
|
if count == 1 {
|
||||||
|
return plan, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
switch profile {
|
||||||
|
case "acceptance":
|
||||||
|
plan.StaggerSeconds = 10 * 60
|
||||||
|
plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
|
||||||
|
case "overnight":
|
||||||
|
plan.StaggerSeconds = 60 * 60
|
||||||
|
plan.TotalDurationSec = 8 * 60 * 60
|
||||||
|
minTotal := count * 60 * 60
|
||||||
|
if plan.TotalDurationSec < minTotal {
|
||||||
|
plan.TotalDurationSec = minTotal
|
||||||
|
}
|
||||||
|
if plan.TotalDurationSec > 10*60*60 {
|
||||||
|
return nvidiaRampSpec{}, fmt.Errorf("overnight staggered NVIDIA burn supports at most 10 GPUs")
|
||||||
|
}
|
||||||
|
plan.DurationSec = plan.TotalDurationSec - plan.StaggerSeconds*(count-1)
|
||||||
|
default:
|
||||||
|
plan.StaggerSeconds = 2 * 60
|
||||||
|
plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
|
||||||
|
}
|
||||||
|
return plan, nil
|
||||||
|
}
|
||||||
|
|
||||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||||
acceptanceCycles := []platform.PlatformStressCycle{
|
acceptanceCycles := []platform.PlatformStressCycle{
|
||||||
{LoadSec: 85, IdleSec: 5},
|
{LoadSec: 85, IdleSec: 5},
|
||||||
@@ -579,7 +629,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
dur = 300
|
dur = 300
|
||||||
}
|
}
|
||||||
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
case "nvidia-benchmark":
|
case "nvidia-bench-perf":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
@@ -591,6 +641,22 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
RunNCCL: t.params.RunNCCL,
|
RunNCCL: t.params.RunNCCL,
|
||||||
ParallelGPUs: t.params.ParallelGPUs,
|
ParallelGPUs: t.params.ParallelGPUs,
|
||||||
|
RampStep: t.params.RampStep,
|
||||||
|
RampTotal: t.params.RampTotal,
|
||||||
|
RampRunID: t.params.RampRunID,
|
||||||
|
}, j.append)
|
||||||
|
case "nvidia-bench-power":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
RampStep: t.params.RampStep,
|
||||||
|
RampTotal: t.params.RampTotal,
|
||||||
|
RampRunID: t.params.RampRunID,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
@@ -601,7 +667,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||||
|
if planErr != nil {
|
||||||
|
err = planErr
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||||
|
dur = rampPlan.DurationSec
|
||||||
|
}
|
||||||
|
if rampPlan.StaggerSeconds > 0 {
|
||||||
|
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
|
||||||
case "nvidia-targeted-power":
|
case "nvidia-targeted-power":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
@@ -651,11 +728,23 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
|
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||||
|
if planErr != nil {
|
||||||
|
err = planErr
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||||
|
dur = rampPlan.DurationSec
|
||||||
|
}
|
||||||
|
if rampPlan.StaggerSeconds > 0 {
|
||||||
|
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||||
|
}
|
||||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||||
DurationSec: dur,
|
DurationSec: dur,
|
||||||
Loader: t.params.Loader,
|
Loader: t.params.Loader,
|
||||||
GPUIndices: t.params.GPUIndices,
|
GPUIndices: t.params.GPUIndices,
|
||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
StaggerSeconds: rampPlan.StaggerSeconds,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
case "memory":
|
case "memory":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
|
|||||||
@@ -366,7 +366,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
|||||||
taskReportMetricsDBPath = metricsPath
|
taskReportMetricsDBPath = metricsPath
|
||||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
|
||||||
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -398,14 +398,14 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
|||||||
}
|
}
|
||||||
task := &Task{
|
task := &Task{
|
||||||
ID: "task-bench",
|
ID: "task-bench",
|
||||||
Name: "NVIDIA Benchmark",
|
Name: "NVIDIA Bee Bench Perf",
|
||||||
Target: "nvidia-benchmark",
|
Target: "nvidia-bench-perf",
|
||||||
Status: TaskDone,
|
Status: TaskDone,
|
||||||
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||||
ArtifactsDir: artifactsDir,
|
ArtifactsDir: artifactsDir,
|
||||||
}
|
}
|
||||||
ensureTaskReportPaths(task)
|
ensureTaskReportPaths(task)
|
||||||
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
|
||||||
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -420,9 +420,9 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
|||||||
}
|
}
|
||||||
html := string(body)
|
html := string(body)
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`Benchmark Results`,
|
`Perf Results`,
|
||||||
`Composite score for this benchmark task.`,
|
`Composite score for this benchmark task.`,
|
||||||
`GPU #0 — NVIDIA H100 PCIe`,
|
`GPU 0`,
|
||||||
`1176.25`,
|
`1176.25`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(html, needle) {
|
if !strings.Contains(html, needle) {
|
||||||
@@ -491,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResolveNvidiaRampPlan(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
profile string
|
||||||
|
enabled bool
|
||||||
|
selected []int
|
||||||
|
want nvidiaRampSpec
|
||||||
|
wantErr string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "disabled uses base preset",
|
||||||
|
profile: "acceptance",
|
||||||
|
selected: []int{0, 1},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "smoke ramp uses two minute steps",
|
||||||
|
profile: "smoke",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "acceptance ramp uses ten minute steps",
|
||||||
|
profile: "acceptance",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight stays at eight hours when possible",
|
||||||
|
profile: "overnight",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight extends to keep one hour after final gpu",
|
||||||
|
profile: "overnight",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight rejects impossible gpu count",
|
||||||
|
profile: "overnight",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
|
||||||
|
wantErr: "at most 10 GPUs",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "enabled requires explicit selection",
|
||||||
|
profile: "smoke",
|
||||||
|
enabled: true,
|
||||||
|
wantErr: "requires explicit GPU selection",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
|
||||||
|
if tc.wantErr != "" {
|
||||||
|
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
|
||||||
|
t.Fatalf("err=%v want substring %q", err, tc.wantErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveNvidiaRampPlan error: %v", err)
|
||||||
|
}
|
||||||
|
if got != tc.want {
|
||||||
|
t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
loader string
|
loader string
|
||||||
|
|||||||
@@ -1,5 +1,34 @@
|
|||||||
# Benchmark clock calibration research
|
# Benchmark clock calibration research
|
||||||
|
|
||||||
|
## Benchmark methodology versioning
|
||||||
|
|
||||||
|
Every benchmark methodology change must bump the benchmark version constant in
|
||||||
|
source code by exactly `+1`.
|
||||||
|
|
||||||
|
Methodology change means any change that affects comparability of benchmark
|
||||||
|
results, including for example:
|
||||||
|
- phase durations or phase order
|
||||||
|
- enabled/disabled precisions
|
||||||
|
- fallback rules
|
||||||
|
- normalization rules
|
||||||
|
- score formulas or weights
|
||||||
|
- degradation thresholds
|
||||||
|
- power calibration logic
|
||||||
|
- thermal/power penalty logic
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- benchmark version must be stored in source code as an explicit version
|
||||||
|
constant, not inferred from git tag or build metadata
|
||||||
|
- benchmark report must always print the benchmark version
|
||||||
|
- `result.json` must always include the benchmark version
|
||||||
|
- results from different benchmark versions must be treated as non-comparable by
|
||||||
|
default
|
||||||
|
|
||||||
|
Purpose:
|
||||||
|
- prevent accidental comparison of runs produced by different methodologies
|
||||||
|
- make historical benchmark archives self-describing even when detached from git
|
||||||
|
- force deliberate version bumps whenever scoring or execution semantics change
|
||||||
|
|
||||||
## Status
|
## Status
|
||||||
In progress. Baseline data from production servers pending.
|
In progress. Baseline data from production servers pending.
|
||||||
|
|
||||||
|
|||||||
117
bible-local/docs/gpu-model-propagation.md
Normal file
117
bible-local/docs/gpu-model-propagation.md
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
# GPU Model Name Propagation
|
||||||
|
|
||||||
|
How GPU model names are detected, stored, and displayed throughout the project.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Detection Sources
|
||||||
|
|
||||||
|
There are **three separate pipelines** for GPU model names — they use different structs and don't share state.
|
||||||
|
|
||||||
|
### Pipeline A — Live / SAT (nvidia-smi query at runtime)
|
||||||
|
|
||||||
|
**File:** `audit/internal/platform/sat.go`
|
||||||
|
|
||||||
|
- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
|
||||||
|
- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
|
||||||
|
- Used by: GPU selection UI, live metrics labels, burn/stress test logic
|
||||||
|
|
||||||
|
### Pipeline B — Benchmark results
|
||||||
|
|
||||||
|
**File:** `audit/internal/platform/benchmark.go`, line 124
|
||||||
|
|
||||||
|
- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
|
||||||
|
- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
|
||||||
|
- Used by: benchmark history table, benchmark report
|
||||||
|
|
||||||
|
### Pipeline C — Hardware audit JSON (PCIe schema)
|
||||||
|
|
||||||
|
**File:** `audit/internal/schema/hardware.go`
|
||||||
|
|
||||||
|
- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
|
||||||
|
- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
|
||||||
|
- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
|
||||||
|
- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Key Inconsistency: NVIDIA PCIe Model is Never Set
|
||||||
|
|
||||||
|
`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
|
||||||
|
|
||||||
|
This means:
|
||||||
|
- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
|
||||||
|
- AMD GPUs do have their model populated
|
||||||
|
|
||||||
|
The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Benchmark History "Unknown GPU" Issue
|
||||||
|
|
||||||
|
**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
|
||||||
|
|
||||||
|
**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
|
||||||
|
|
||||||
|
This happens for:
|
||||||
|
- Older result files saved before the `Name` field was added
|
||||||
|
- Runs where nvidia-smi query failed before the benchmark started
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Fallback Strings — Current State
|
||||||
|
|
||||||
|
| Location | File | Fallback string |
|
||||||
|
|---|---|---|
|
||||||
|
| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
|
||||||
|
| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
|
||||||
|
| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
|
||||||
|
| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
|
||||||
|
| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
|
||||||
|
| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
|
||||||
|
| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
|
||||||
|
| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
|
||||||
|
|
||||||
|
**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## GPU Selection UI
|
||||||
|
|
||||||
|
**File:** `audit/internal/webui/pages.go`
|
||||||
|
|
||||||
|
- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
|
||||||
|
- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
|
||||||
|
- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
|
||||||
|
|
||||||
|
This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data Flow Summary
|
||||||
|
|
||||||
|
```
|
||||||
|
nvidia-smi (live)
|
||||||
|
└─ ListNvidiaGPUs() → NvidiaGPU.Name
|
||||||
|
├─ GPU selection UI (always correct)
|
||||||
|
├─ Live metrics labels (charts_svg.go)
|
||||||
|
└─ SAT/burn status file (sat.go)
|
||||||
|
|
||||||
|
nvidia-smi (at benchmark start)
|
||||||
|
└─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
|
||||||
|
└─ BenchmarkGPUResult.Name (json:"name,omitempty")
|
||||||
|
├─ Benchmark report
|
||||||
|
└─ Benchmark history table columns
|
||||||
|
|
||||||
|
nvidia-smi / lspci (audit collection)
|
||||||
|
└─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
|
||||||
|
└─ Hardware summary page hwDescribeGPU()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What Needs Fixing
|
||||||
|
|
||||||
|
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
|
||||||
|
2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
|
||||||
|
3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
|
||||||
@@ -1,12 +1,13 @@
|
|||||||
DEBIAN_VERSION=12
|
DEBIAN_VERSION=12
|
||||||
DEBIAN_KERNEL_ABI=auto
|
DEBIAN_KERNEL_ABI=auto
|
||||||
NVIDIA_DRIVER_VERSION=590.48.01
|
NVIDIA_DRIVER_VERSION=590.48.01
|
||||||
|
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||||
NCCL_VERSION=2.28.9-1
|
NCCL_VERSION=2.28.9-1
|
||||||
NCCL_CUDA_VERSION=13.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
NCCL_TESTS_VERSION=2.13.10
|
NCCL_TESTS_VERSION=2.13.10
|
||||||
NVCC_VERSION=12.8
|
NVCC_VERSION=12.8
|
||||||
CUBLAS_VERSION=13.0.2.14-1
|
CUBLAS_VERSION=13.1.1.3-1
|
||||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
DCGM_VERSION=4.5.3-1
|
DCGM_VERSION=4.5.3-1
|
||||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||||
@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
|||||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
AUDIT_VERSION=1.0.0
|
AUDIT_VERSION=1.0.0
|
||||||
|
MEMTEST_VERSION=6.10-4
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ lb config noauto \
|
|||||||
--bootloaders "grub-efi,syslinux" \
|
--bootloaders "grub-efi,syslinux" \
|
||||||
--debian-installer none \
|
--debian-installer none \
|
||||||
--archive-areas "main contrib non-free non-free-firmware" \
|
--archive-areas "main contrib non-free non-free-firmware" \
|
||||||
--mirror-bootstrap "https://deb.debian.org/debian" \
|
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
|
||||||
--mirror-chroot "https://deb.debian.org/debian" \
|
--mirror-chroot "http://mirror.mephi.ru/debian/" \
|
||||||
--mirror-binary "https://deb.debian.org/debian" \
|
--mirror-binary "http://mirror.mephi.ru/debian/" \
|
||||||
--security true \
|
--security true \
|
||||||
--linux-flavours "amd64" \
|
--linux-flavours "amd64" \
|
||||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ typedef void *CUstream;
|
|||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||||
#define MAX_STRESS_STREAMS 16
|
#define MAX_STRESS_STREAMS 16
|
||||||
#define MAX_CUBLAS_PROFILES 5
|
|
||||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
|
|
||||||
@@ -643,6 +642,20 @@ static const struct profile_desc k_profiles[] = {
|
|||||||
CUDA_R_16F,
|
CUDA_R_16F,
|
||||||
CUBLAS_COMPUTE_32F_FAST_16F,
|
CUBLAS_COMPUTE_32F_FAST_16F,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"int8_tensor",
|
||||||
|
"int8",
|
||||||
|
75,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
128,
|
||||||
|
CUDA_R_8I,
|
||||||
|
CUDA_R_8I,
|
||||||
|
CUDA_R_32I,
|
||||||
|
CUDA_R_32I,
|
||||||
|
CUBLAS_COMPUTE_32I,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"fp8_e4m3",
|
"fp8_e4m3",
|
||||||
"fp8",
|
"fp8",
|
||||||
@@ -689,6 +702,8 @@ static const struct profile_desc k_profiles[] = {
|
|||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
|
||||||
|
|
||||||
static int load_cublaslt(struct cublaslt_api *api) {
|
static int load_cublaslt(struct cublaslt_api *api) {
|
||||||
memset(api, 0, sizeof(*api));
|
memset(api, 0, sizeof(*api));
|
||||||
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
||||||
@@ -759,10 +774,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
|
|||||||
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case CUDA_R_32F:
|
case CUDA_R_32F:
|
||||||
|
case CUDA_R_32I:
|
||||||
return (size_t)(elements * 4u);
|
return (size_t)(elements * 4u);
|
||||||
case CUDA_R_16F:
|
case CUDA_R_16F:
|
||||||
case CUDA_R_16BF:
|
case CUDA_R_16BF:
|
||||||
return (size_t)(elements * 2u);
|
return (size_t)(elements * 2u);
|
||||||
|
case CUDA_R_8I:
|
||||||
case CUDA_R_8F_E4M3:
|
case CUDA_R_8F_E4M3:
|
||||||
case CUDA_R_8F_E5M2:
|
case CUDA_R_8F_E5M2:
|
||||||
return (size_t)(elements);
|
return (size_t)(elements);
|
||||||
@@ -775,6 +792,16 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
|
||||||
|
if (desc->compute_type == CUBLAS_COMPUTE_32I) {
|
||||||
|
return CUDA_R_32I;
|
||||||
|
}
|
||||||
|
if (desc->compute_type == CUBLAS_COMPUTE_64F) {
|
||||||
|
return CUDA_R_64F;
|
||||||
|
}
|
||||||
|
return CUDA_R_32F;
|
||||||
|
}
|
||||||
|
|
||||||
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
||||||
uint64_t row_tiles = (rows + 127u) / 128u;
|
uint64_t row_tiles = (rows + 127u) / 128u;
|
||||||
uint64_t col_tiles = (cols + 63u) / 64u;
|
uint64_t col_tiles = (cols + 63u) / 64u;
|
||||||
@@ -943,8 +970,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
|
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -1093,17 +1121,30 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||||
struct cublaslt_api *cublas,
|
struct cublaslt_api *cublas,
|
||||||
struct prepared_profile *profile) {
|
struct prepared_profile *profile) {
|
||||||
|
int32_t alpha_i32 = 1;
|
||||||
|
int32_t beta_i32 = 0;
|
||||||
|
double alpha_f64 = 1.0;
|
||||||
|
double beta_f64 = 0.0;
|
||||||
float alpha = 1.0f;
|
float alpha = 1.0f;
|
||||||
float beta = 0.0f;
|
float beta = 0.0f;
|
||||||
|
const void *alpha_ptr = &alpha;
|
||||||
|
const void *beta_ptr = β
|
||||||
|
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
|
||||||
|
alpha_ptr = &alpha_i32;
|
||||||
|
beta_ptr = &beta_i32;
|
||||||
|
} else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
|
||||||
|
alpha_ptr = &alpha_f64;
|
||||||
|
beta_ptr = &beta_f64;
|
||||||
|
}
|
||||||
return check_cublas(profile->desc.name,
|
return check_cublas(profile->desc.name,
|
||||||
cublas->cublasLtMatmul(handle,
|
cublas->cublasLtMatmul(handle,
|
||||||
profile->op_desc,
|
profile->op_desc,
|
||||||
&alpha,
|
alpha_ptr,
|
||||||
(const void *)(uintptr_t)profile->a_dev,
|
(const void *)(uintptr_t)profile->a_dev,
|
||||||
profile->a_layout,
|
profile->a_layout,
|
||||||
(const void *)(uintptr_t)profile->b_dev,
|
(const void *)(uintptr_t)profile->b_dev,
|
||||||
profile->b_layout,
|
profile->b_layout,
|
||||||
&beta,
|
beta_ptr,
|
||||||
(const void *)(uintptr_t)profile->c_dev,
|
(const void *)(uintptr_t)profile->c_dev,
|
||||||
profile->c_layout,
|
profile->c_layout,
|
||||||
(void *)(uintptr_t)profile->d_dev,
|
(void *)(uintptr_t)profile->d_dev,
|
||||||
@@ -1121,9 +1162,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int cc_minor,
|
int cc_minor,
|
||||||
int seconds,
|
int seconds,
|
||||||
int size_mb,
|
int size_mb,
|
||||||
|
const char *precision_filter,
|
||||||
struct stress_report *report) {
|
struct stress_report *report) {
|
||||||
struct cublaslt_api cublas;
|
struct cublaslt_api cublas;
|
||||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
|
struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
|
||||||
cublasLtHandle_t handle = NULL;
|
cublasLtHandle_t handle = NULL;
|
||||||
CUcontext ctx = NULL;
|
CUcontext ctx = NULL;
|
||||||
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||||
@@ -1133,7 +1175,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int active = 0;
|
int active = 0;
|
||||||
int mp_count = 0;
|
int mp_count = 0;
|
||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
int profile_count = PROFILE_COUNT;
|
||||||
int prepared_count = 0;
|
int prepared_count = 0;
|
||||||
size_t requested_budget = 0;
|
size_t requested_budget = 0;
|
||||||
size_t total_budget = 0;
|
size_t total_budget = 0;
|
||||||
@@ -1158,8 +1200,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Count profiles matching the filter (for deciding what to run). */
|
||||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
|
||||||
|
(precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
|
||||||
planned++;
|
planned++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1170,18 +1214,31 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Count all profiles active on this GPU regardless of filter.
|
||||||
|
* Used as the budget divisor so matrix sizes stay consistent whether
|
||||||
|
* running all precisions together or a single-precision phase. */
|
||||||
|
int planned_total = 0;
|
||||||
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
|
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
||||||
|
planned_total++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (planned_total < planned) {
|
||||||
|
planned_total = planned;
|
||||||
|
}
|
||||||
|
|
||||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||||
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||||
cuda->cuStreamCreate &&
|
cuda->cuStreamCreate &&
|
||||||
cuda->cuStreamDestroy) {
|
cuda->cuStreamDestroy) {
|
||||||
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
|
||||||
}
|
}
|
||||||
if (stream_count > 1) {
|
if (stream_count > 1) {
|
||||||
int created = 0;
|
int created = 0;
|
||||||
@@ -1194,7 +1251,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
report->stream_count = stream_count;
|
report->stream_count = stream_count;
|
||||||
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
|
||||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
@@ -1218,6 +1275,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
desc->min_cc);
|
desc->min_cc);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
|
||||||
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"%s=SKIPPED precision_filter\n",
|
||||||
|
desc->name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
for (int lane = 0; lane < stream_count; lane++) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
CUstream stream = streams[lane];
|
CUstream stream = streams[lane];
|
||||||
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
||||||
@@ -1335,10 +1399,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
|
||||||
|
printf("device=%s\n", report->device);
|
||||||
|
printf("device_index=%d\n", device_index);
|
||||||
|
printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
|
||||||
|
printf("backend=%s\n", report->backend);
|
||||||
|
printf("duration_s=%d\n", seconds);
|
||||||
|
printf("buffer_mb=%d\n", report->buffer_mb);
|
||||||
|
printf("streams=%d\n", report->stream_count);
|
||||||
|
printf("iterations=%lu\n", report->iterations);
|
||||||
|
printf("checksum=%llu\n", (unsigned long long)report->checksum);
|
||||||
|
if (report->details[0] != '\0') {
|
||||||
|
printf("%s", report->details);
|
||||||
|
}
|
||||||
|
printf("status=OK\n");
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
int seconds = 5;
|
int seconds = 5;
|
||||||
int size_mb = 64;
|
int size_mb = 64;
|
||||||
int device_index = 0;
|
int device_index = 0;
|
||||||
|
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
|
||||||
|
const char *precision_plan = NULL;
|
||||||
|
const char *precision_plan_seconds = NULL;
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||||
seconds = atoi(argv[++i]);
|
seconds = atoi(argv[++i]);
|
||||||
@@ -1346,8 +1429,16 @@ int main(int argc, char **argv) {
|
|||||||
size_mb = atoi(argv[++i]);
|
size_mb = atoi(argv[++i]);
|
||||||
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||||
device_index = atoi(argv[++i]);
|
device_index = atoi(argv[++i]);
|
||||||
|
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
|
||||||
|
precision_filter = argv[++i];
|
||||||
|
} else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
|
||||||
|
precision_plan = argv[++i];
|
||||||
|
} else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
|
||||||
|
precision_plan_seconds = argv[++i];
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
fprintf(stderr,
|
||||||
|
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
|
||||||
|
argv[0]);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1407,26 +1498,94 @@ int main(int argc, char **argv) {
|
|||||||
int ok = 0;
|
int ok = 0;
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
#if HAVE_CUBLASLT_HEADERS
|
||||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
|
if (precision_plan != NULL && precision_plan[0] != '\0') {
|
||||||
|
char *plan_copy = strdup(precision_plan);
|
||||||
|
char *plan_seconds_copy = NULL;
|
||||||
|
int phase_seconds[32] = {0};
|
||||||
|
int phase_seconds_count = 0;
|
||||||
|
int phase_ok = 0;
|
||||||
|
if (plan_copy == NULL) {
|
||||||
|
fprintf(stderr, "failed to allocate precision plan buffer\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
|
||||||
|
plan_seconds_copy = strdup(precision_plan_seconds);
|
||||||
|
if (plan_seconds_copy == NULL) {
|
||||||
|
free(plan_copy);
|
||||||
|
fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
for (char *sec_token = strtok(plan_seconds_copy, ",");
|
||||||
|
sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
|
||||||
|
sec_token = strtok(NULL, ",")) {
|
||||||
|
while (*sec_token == ' ' || *sec_token == '\t') {
|
||||||
|
sec_token++;
|
||||||
|
}
|
||||||
|
if (*sec_token == '\0') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
phase_seconds[phase_seconds_count++] = atoi(sec_token);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int phase_idx = 0;
|
||||||
|
for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
|
||||||
|
while (*token == ' ' || *token == '\t') {
|
||||||
|
token++;
|
||||||
|
}
|
||||||
|
if (*token == '\0') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const char *phase_name = token;
|
||||||
|
const char *phase_filter = token;
|
||||||
|
if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
|
||||||
|
phase_filter = NULL;
|
||||||
|
}
|
||||||
|
int phase_duration = seconds;
|
||||||
|
if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
|
||||||
|
phase_duration = phase_seconds[phase_idx];
|
||||||
|
}
|
||||||
|
printf("phase_begin=%s\n", phase_name);
|
||||||
|
fflush(stdout);
|
||||||
|
memset(&report, 0, sizeof(report));
|
||||||
|
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
|
||||||
|
if (ok) {
|
||||||
|
print_stress_report(&report, device_index, phase_duration);
|
||||||
|
phase_ok = 1;
|
||||||
|
} else {
|
||||||
|
printf("phase_error=%s\n", phase_name);
|
||||||
|
if (report.details[0] != '\0') {
|
||||||
|
printf("%s", report.details);
|
||||||
|
if (report.details[strlen(report.details) - 1] != '\n') {
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("status=FAILED\n");
|
||||||
|
}
|
||||||
|
printf("phase_end=%s\n", phase_name);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
free(plan_seconds_copy);
|
||||||
|
free(plan_copy);
|
||||||
|
return phase_ok ? 0 : 1;
|
||||||
|
}
|
||||||
|
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
||||||
#endif
|
#endif
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
|
if (precision_filter != NULL) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
|
||||||
|
precision_filter,
|
||||||
|
name,
|
||||||
|
cc_major,
|
||||||
|
cc_minor);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
int ptx_mb = size_mb;
|
||||||
|
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("device=%s\n", report.device);
|
print_stress_report(&report, device_index, seconds);
|
||||||
printf("device_index=%d\n", device_index);
|
|
||||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
|
||||||
printf("backend=%s\n", report.backend);
|
|
||||||
printf("duration_s=%d\n", seconds);
|
|
||||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
|
||||||
printf("streams=%d\n", report.stream_count);
|
|
||||||
printf("iterations=%lu\n", report.iterations);
|
|
||||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
|
||||||
if (report.details[0] != '\0') {
|
|
||||||
printf("%s", report.details);
|
|
||||||
}
|
|
||||||
printf("status=OK\n");
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -161,6 +161,7 @@ run_variant() {
|
|||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-e BEE_REQUIRE_MEMTEST=1 \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
"${IMAGE_REF}" \
|
||||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||||
@@ -175,6 +176,7 @@ run_variant() {
|
|||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-e BEE_REQUIRE_MEMTEST=1 \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
"${IMAGE_REF}" \
|
||||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
|||||||
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
|
export MEMTEST_VERSION
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
@@ -775,6 +776,7 @@ run_optional_step_sh() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
mkdir -p "${LOG_DIR}" 2>/dev/null || true
|
||||||
step_log="${LOG_DIR}/${step_slug}.log"
|
step_log="${LOG_DIR}/${step_slug}.log"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== optional step: ${step_name} ==="
|
echo "=== optional step: ${step_name} ==="
|
||||||
@@ -798,13 +800,14 @@ start_build_log
|
|||||||
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||||
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
||||||
echo "=== refreshing apt index to detect current kernel ABI ==="
|
echo "=== refreshing apt index to detect current kernel ABI ==="
|
||||||
apt-get update -qq
|
apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
|
||||||
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
||||||
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
||||||
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
||||||
| head -1)
|
| head -1)
|
||||||
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
||||||
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
||||||
|
echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
||||||
@@ -873,9 +876,37 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
|
|
||||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
|
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||||
|
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||||
|
fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||||
|
if [ -n "$fp4_type_match" ]; then
|
||||||
|
echo "fp4_header_symbol=present"
|
||||||
|
echo "$fp4_type_match"
|
||||||
|
else
|
||||||
|
echo "fp4_header_symbol=missing"
|
||||||
|
fi
|
||||||
|
if [ -n "$fp4_scale_match" ]; then
|
||||||
|
echo "fp4_scale_mode_symbol=present"
|
||||||
|
echo "$fp4_scale_match"
|
||||||
|
else
|
||||||
|
echo "fp4_scale_mode_symbol=missing"
|
||||||
|
fi
|
||||||
|
|
||||||
GPU_STRESS_NEED_BUILD=1
|
GPU_STRESS_NEED_BUILD=1
|
||||||
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
if [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
GPU_STRESS_NEED_BUILD=0
|
GPU_STRESS_NEED_BUILD=0
|
||||||
|
for dep in \
|
||||||
|
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||||
|
"${BUILDER_DIR}/VERSIONS"; do
|
||||||
|
if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
|
GPU_STRESS_NEED_BUILD=1
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
|
||||||
|
find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
|
||||||
|
GPU_STRESS_NEED_BUILD=1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||||
@@ -889,6 +920,12 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
else
|
else
|
||||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||||
fi
|
fi
|
||||||
|
echo "=== bee-gpu-burn compiled profile probe ==="
|
||||||
|
if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
|
||||||
|
echo "fp4_profile_string=present"
|
||||||
|
else
|
||||||
|
echo "fp4_profile_string=missing"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||||
@@ -1225,6 +1262,7 @@ fi
|
|||||||
# --- substitute version placeholders in package list and archive ---
|
# --- substitute version placeholders in package list and archive ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
sed -i \
|
sed -i \
|
||||||
|
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
|
|||||||
@@ -11,18 +11,18 @@ echo " Hardware Audit LiveCD"
|
|||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
submenu "EASY-BEE (advanced options) -->" {
|
||||||
menuentry "EASY-BEE — GSP=off" {
|
menuentry "EASY-BEE — GSP=off" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,31 +3,31 @@ label live-@FLAVOUR@-normal
|
|||||||
menu default
|
menu default
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms
|
label live-@FLAVOUR@-kms
|
||||||
menu label EASY-BEE (^graphics/KMS)
|
menu label EASY-BEE (^graphics/KMS)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
|
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE (^load to RAM)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
|
||||||
label live-@FLAVOUR@-gsp-off
|
label live-@FLAVOUR@-gsp-off
|
||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms-gsp-off
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
|
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
label live-@FLAVOUR@-failsafe
|
||||||
menu label EASY-BEE (^fail-safe)
|
menu label EASY-BEE (^fail-safe)
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ ensure_bee_console_user() {
|
|||||||
ensure_bee_console_user
|
ensure_bee_console_user
|
||||||
|
|
||||||
# Enable common bee services
|
# Enable common bee services
|
||||||
|
systemctl enable bee-hpc-tuning.service
|
||||||
systemctl enable bee-network.service
|
systemctl enable bee-network.service
|
||||||
systemctl enable bee-preflight.service
|
systemctl enable bee-preflight.service
|
||||||
systemctl enable bee-audit.service
|
systemctl enable bee-audit.service
|
||||||
@@ -42,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
|||||||
# Enable GPU-vendor specific services
|
# Enable GPU-vendor specific services
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||||
systemctl enable bee-nvidia.service
|
systemctl enable bee-nvidia.service
|
||||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
@@ -55,6 +57,7 @@ fi
|
|||||||
# nogpu: no GPU services needed
|
# nogpu: no GPU services needed
|
||||||
|
|
||||||
# Ensure scripts are executable
|
# Ensure scripts are executable
|
||||||
|
chmod +x /usr/local/bin/bee-hpc-tuning 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ set -e
|
|||||||
|
|
||||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
|
# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
|
||||||
|
# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
|
||||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||||
BINARY_BOOT_DIR="binary/boot"
|
BINARY_BOOT_DIR="binary/boot"
|
||||||
GRUB_CFG="binary/boot/grub/grub.cfg"
|
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||||
@@ -24,15 +26,23 @@ fail_or_warn() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
|
||||||
|
# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
|
||||||
|
# The template already has memtest entries hardcoded, so a missing config file
|
||||||
|
# here is not an error; validate_iso_memtest() checks the final ISO instead.
|
||||||
|
warn_only() {
|
||||||
|
log "WARNING: $1"
|
||||||
|
}
|
||||||
|
|
||||||
copy_memtest_file() {
|
copy_memtest_file() {
|
||||||
src="$1"
|
src="$1"
|
||||||
base="$(basename "$src")"
|
dst_name="${2:-$(basename "$src")}"
|
||||||
dst="${BINARY_BOOT_DIR}/${base}"
|
dst="${BINARY_BOOT_DIR}/${dst_name}"
|
||||||
|
|
||||||
[ -f "$src" ] || return 1
|
[ -f "$src" ] || return 1
|
||||||
mkdir -p "${BINARY_BOOT_DIR}"
|
mkdir -p "${BINARY_BOOT_DIR}"
|
||||||
cp "$src" "$dst"
|
cp "$src" "$dst"
|
||||||
log "copied ${base} from ${src}"
|
log "copied ${dst_name} from ${src}"
|
||||||
}
|
}
|
||||||
|
|
||||||
extract_memtest_from_deb() {
|
extract_memtest_from_deb() {
|
||||||
@@ -41,14 +51,44 @@ extract_memtest_from_deb() {
|
|||||||
|
|
||||||
log "extracting memtest payload from ${deb}"
|
log "extracting memtest payload from ${deb}"
|
||||||
dpkg-deb -x "$deb" "$tmpdir"
|
dpkg-deb -x "$deb" "$tmpdir"
|
||||||
for f in ${MEMTEST_FILES}; do
|
|
||||||
if [ -f "${tmpdir}/boot/${f}" ]; then
|
# EFI binary: both 5.x and 6.x use memtest86+x64.efi
|
||||||
copy_memtest_file "${tmpdir}/boot/${f}"
|
if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
|
||||||
fi
|
copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
|
||||||
done
|
fi
|
||||||
|
|
||||||
|
# BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
|
||||||
|
if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
|
||||||
|
copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
|
||||||
|
elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
|
||||||
|
copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
|
||||||
|
fi
|
||||||
|
|
||||||
rm -rf "$tmpdir"
|
rm -rf "$tmpdir"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
download_and_extract_memtest() {
|
||||||
|
tmpdl="$(mktemp -d)"
|
||||||
|
if [ -n "${MEMTEST_VERSION:-}" ]; then
|
||||||
|
pkg_spec="memtest86+=${MEMTEST_VERSION}"
|
||||||
|
else
|
||||||
|
pkg_spec="memtest86+"
|
||||||
|
fi
|
||||||
|
log "downloading ${pkg_spec} from apt"
|
||||||
|
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
|
||||||
|
log "apt download failed, retrying after apt-get update"
|
||||||
|
apt-get update -qq >/dev/null 2>&1 || true
|
||||||
|
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
|
||||||
|
fi
|
||||||
|
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||||
|
if [ -n "$deb" ]; then
|
||||||
|
extract_memtest_from_deb "$deb"
|
||||||
|
else
|
||||||
|
log "apt download of memtest86+ failed"
|
||||||
|
fi
|
||||||
|
rm -rf "$tmpdl"
|
||||||
|
}
|
||||||
|
|
||||||
ensure_memtest_binaries() {
|
ensure_memtest_binaries() {
|
||||||
missing=0
|
missing=0
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
|
|||||||
done
|
done
|
||||||
[ "$missing" -eq 1 ] || return 0
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 1. Try files already placed by lb binary_memtest or chroot
|
||||||
for root in chroot/boot /boot; do
|
for root in chroot/boot /boot; do
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
||||||
done
|
done
|
||||||
|
# 6.x BIOS binary may lack x64 in name — copy with normalised name
|
||||||
|
if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
|
||||||
|
copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
missing=0
|
missing=0
|
||||||
@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
|
|||||||
done
|
done
|
||||||
[ "$missing" -eq 1 ] || return 0
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 2. Try apt package cache (may be empty if lb binary_memtest already purged)
|
||||||
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
||||||
[ -d "$root" ] || continue
|
[ -d "$root" ] || continue
|
||||||
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
||||||
@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
|
|||||||
break
|
break
|
||||||
done
|
done
|
||||||
|
|
||||||
|
missing=0
|
||||||
|
for f in ${MEMTEST_FILES}; do
|
||||||
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||||
|
done
|
||||||
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
|
||||||
|
download_and_extract_memtest
|
||||||
|
|
||||||
missing=0
|
missing=0
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
||||||
@@ -88,7 +143,7 @@ ensure_memtest_binaries() {
|
|||||||
|
|
||||||
ensure_grub_entry() {
|
ensure_grub_entry() {
|
||||||
[ -f "$GRUB_CFG" ] || {
|
[ -f "$GRUB_CFG" ] || {
|
||||||
fail_or_warn "missing ${GRUB_CFG}"
|
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -114,7 +169,7 @@ EOF
|
|||||||
|
|
||||||
ensure_isolinux_entry() {
|
ensure_isolinux_entry() {
|
||||||
[ -f "$ISOLINUX_CFG" ] || {
|
[ -f "$ISOLINUX_CFG" ] || {
|
||||||
fail_or_warn "missing ${ISOLINUX_CFG}"
|
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||||
# explicitly.
|
# explicitly.
|
||||||
|
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
|||||||
14
iso/overlay/etc/systemd/system/bee-hpc-tuning.service
Normal file
14
iso/overlay/etc/systemd/system/bee-hpc-tuning.service
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# bee-hpc-tuning.service — one-shot boot unit that runs
# /usr/local/bin/bee-hpc-tuning through the bee-log-run wrapper.
# NOTE(review): bee-log-run is given /appdata/bee/export/bee-hpc-tuning.log as
# its first argument; presumably it mirrors the command output there — confirm.
[Unit]
Description=Bee: HPC tuning (CPU governor, C-states)
# Ordered ahead of the NVIDIA and audit units, after local filesystems are up.
Before=bee-nvidia.service bee-audit.service
After=local-fs.target

[Service]
# oneshot + RemainAfterExit: the unit stays "active" once the script has exited.
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-hpc-tuning.log /usr/local/bin/bee-hpc-tuning
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
|
||||||
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
@@ -0,0 +1,110 @@
|
|||||||
|
#!/bin/sh
# bee-dcgmproftester-staggered — run dcgmproftester on every selected NVIDIA
# GPU, one background worker per GPU, starting the workers STAGGER_SECONDS
# apart.
#
# Worker i (1-based, N workers in total) runs for
#     --seconds + STAGGER_SECONDS * (N - i)
# so a worker that starts earlier runs longer and all workers are scheduled to
# stop at about the same time.
#
# Options:
#   --seconds N | -t N    base run time per GPU, in seconds (default 300)
#   --stagger-seconds N   delay between worker starts (default 180, 0 = none)
#   --devices 0,1         GPU indices to test (default: all from nvidia-smi)
#   --exclude 2,3         GPU indices to skip
#
# Exit status: 0 when every worker succeeded; 1 when a worker failed or no
# tool/GPU was found; 2 on a usage error; 130/143 when stopped by INT/TERM.
set -eu

# Deliberately not named SECONDS: bash treats SECONDS as a live timer that
# keeps counting after assignment, so the value would grow during the stagger
# sleeps whenever this script is run by bash instead of a plain POSIX sh.
DURATION_SECONDS=300
STAGGER_SECONDS=180
DEVICES=""
EXCLUDE=""
# Space-separated "pid:gpu_id:log_path" records of the running workers.
WORKERS=""
TMP_DIR=""

# Print the synopsis to stderr and exit with status 2.
usage() {
    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}

# Succeed only when $1 is a non-empty string of decimal digits.
is_uint() {
    case "${1:-}" in
        ''|*[!0-9]*) return 1 ;;
    esac
    return 0
}

# Turn a comma-separated list into a whitespace-free, numerically sorted,
# de-duplicated comma-separated list (empty input gives empty output).
normalize_list() {
    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}

# Succeed when $1 is an element of the comma-separated list $2.
# Uses a literal pattern match, so no subprocess and no regex interpretation.
contains_csv() {
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
    esac
    return 1
}

# Print the path of the first dcgmproftester binary found in PATH;
# return 1 when none of the known names exists.
resolve_dcgmproftester() {
    for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
        if command -v "${candidate}" >/dev/null 2>&1; then
            command -v "${candidate}"
            return 0
        fi
    done
    return 1
}

# EXIT handler: stop any worker that is still running and remove the scratch
# directory. WORKERS is cleared once all workers have been reaped, so a normal
# exit never signals an unrelated process that reused a PID.
cleanup() {
    for worker in ${WORKERS}; do
        kill "${worker%%:*}" 2>/dev/null || true
    done
    [ -z "${TMP_DIR}" ] || rm -rf "${TMP_DIR}"
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SECONDS="$2"; shift 2 ;;
        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

# Both values feed shell arithmetic and sleep below; reject anything that is
# not a plain non-negative integer instead of failing midway through a run.
is_uint "${DURATION_SECONDS}" || usage
is_uint "${STAGGER_SECONDS}" || usage

PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

# FINAL = SELECTED minus EXCLUDE, order preserved.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

echo "loader=dcgmproftester-staggered"
echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"

# The indices above come from nvidia-smi, which enumerates in PCI bus order,
# while CUDA's default enumeration is fastest-first. Pin CUDA to PCI order so
# that CUDA_VISIBLE_DEVICES=N addresses the same GPU nvidia-smi calls N
# (bee-gpu-burn exports the same setting).
export CUDA_DEVICE_ORDER="PCI_BUS_ID"

trap cleanup EXIT
# A signal must end the run; the EXIT handler then stops the workers.
trap 'exit 130' INT
trap 'exit 143' TERM
TMP_DIR=$(mktemp -d)

GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
gpu_pos=0
for id in $(echo "${FINAL}" | tr ',' ' '); do
    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    # Earlier workers get extra time equal to the stagger delay of every
    # worker that still has to start after them.
    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( DURATION_SECONDS + extra_sec ))
    echo "starting gpu ${id} seconds=${gpu_seconds}"
    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
    # No pause after the last worker has been started.
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
        sleep "${STAGGER_SECONDS}"
    fi
done

# Reap the workers in start order and replay each captured log with a
# "[gpu N] " prefix; any failing worker makes the whole run fail.
status=0
for spec in ${WORKERS}; do
    pid=${spec%%:*}
    rest=${spec#*:}
    id=${rest%%:*}
    log=${rest#*:}
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    sed "s/^/[gpu ${id}] /" "${log}" || true
done

# Every worker has been reaped; nothing is left for cleanup() to signal.
WORKERS=""
exit "${status}"
|
||||||
29
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
29
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
@@ -2,13 +2,17 @@
|
|||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
SECONDS=5
|
SECONDS=5
|
||||||
|
STAGGER_SECONDS=0
|
||||||
SIZE_MB=0
|
SIZE_MB=0
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
|
PRECISION=""
|
||||||
|
PRECISION_PLAN=""
|
||||||
|
PRECISION_PLAN_SECONDS=""
|
||||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2
|
||||||
exit 2
|
exit 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -25,9 +29,13 @@ contains_csv() {
|
|||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||||
|
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||||
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
|
--precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
|
||||||
|
--precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;;
|
||||||
|
--precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;;
|
||||||
*) usage ;;
|
*) usage ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
@@ -61,14 +69,18 @@ done
|
|||||||
|
|
||||||
echo "loader=bee-gpu-burn"
|
echo "loader=bee-gpu-burn"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
|
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||||
|
|
||||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
TMP_DIR=$(mktemp -d)
|
TMP_DIR=$(mktemp -d)
|
||||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||||
|
|
||||||
|
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||||
|
gpu_pos=0
|
||||||
WORKERS=""
|
WORKERS=""
|
||||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
|
gpu_pos=$((gpu_pos + 1))
|
||||||
log="${TMP_DIR}/gpu-${id}.log"
|
log="${TMP_DIR}/gpu-${id}.log"
|
||||||
gpu_size_mb="${SIZE_MB}"
|
gpu_size_mb="${SIZE_MB}"
|
||||||
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
||||||
@@ -79,11 +91,22 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
|||||||
gpu_size_mb=512
|
gpu_size_mb=512
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||||
|
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||||
|
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||||
|
precision_arg=""
|
||||||
|
[ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
|
||||||
|
precision_plan_arg=""
|
||||||
|
[ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}"
|
||||||
|
precision_plan_seconds_arg=""
|
||||||
|
[ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}"
|
||||||
CUDA_VISIBLE_DEVICES="${id}" \
|
CUDA_VISIBLE_DEVICES="${id}" \
|
||||||
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 &
|
||||||
pid=$!
|
pid=$!
|
||||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||||
|
sleep "${STAGGER_SECONDS}"
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
status=0
|
status=0
|
||||||
|
|||||||
41
iso/overlay/usr/local/bin/bee-hpc-tuning
Normal file
41
iso/overlay/usr/local/bin/bee-hpc-tuning
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/sh
# bee-hpc-tuning — boot-time HPC tuning for deterministic benchmarking.
# Started by bee-hpc-tuning.service. Best effort: failures are logged as WARN
# lines and never abort the script.

# Emit one message line tagged with the script name.
log() { echo "[bee-hpc-tuning] $*"; }

# ── CPU governor ────────────────────────────────────────────────────────────
# Write "performance" into every core's scaling_governor sysfs node (cpupower
# is not available here), counting how many writes succeeded and failed.
applied=0
rejected=0
for node in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    # An unmatched glob stays as the literal pattern; skip anything that is
    # not an existing file.
    if [ ! -f "$node" ]; then
        continue
    fi
    if echo performance > "$node" 2>/dev/null; then
        applied=$((applied + 1))
    else
        rejected=$((rejected + 1))
    fi
done

# Report exactly one summary line for the governor step.
if [ "$applied" -eq 0 ] && [ "$rejected" -eq 0 ]; then
    log "WARN: no cpufreq scaling_governor paths found (C-state governor or HW-controlled)"
elif [ "$applied" -eq 0 ]; then
    log "WARN: failed to set CPU governor on ${rejected} core(s)"
elif [ "$rejected" -eq 0 ]; then
    log "CPU governor set to performance on ${applied} core(s)"
else
    log "WARN: CPU governor: ${applied} OK, ${rejected} failed"
fi

# ── Transparent Huge Pages ───────────────────────────────────────────────────
# Nothing is changed here: the kernel command line is expected to select the
# THP mode, and this step only records the current setting in the log.
thp_node=/sys/kernel/mm/transparent_hugepage/enabled
if [ ! -f "$thp_node" ]; then
    log "WARN: transparent_hugepage sysfs path not found"
else
    thp_state=$(cat "$thp_node" 2>/dev/null)
    log "transparent_hugepage: ${thp_state}"
fi

log "done"
|
||||||
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
@@ -2,6 +2,7 @@
|
|||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
DURATION_SEC=300
|
DURATION_SEC=300
|
||||||
|
STAGGER_SECONDS=0
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
FORMAT=""
|
FORMAT=""
|
||||||
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
|||||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||||
exit 2
|
exit 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
|
|||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||||
|
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||||
@@ -170,6 +172,7 @@ done
|
|||||||
echo "loader=john"
|
echo "loader=john"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
echo "john_devices=${JOHN_DEVICES}"
|
echo "john_devices=${JOHN_DEVICES}"
|
||||||
|
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||||
|
|
||||||
cd "${JOHN_DIR}"
|
cd "${JOHN_DIR}"
|
||||||
|
|
||||||
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
|
|||||||
echo "format=${CHOSEN_FORMAT}"
|
echo "format=${CHOSEN_FORMAT}"
|
||||||
echo "target_seconds=${DURATION_SEC}"
|
echo "target_seconds=${DURATION_SEC}"
|
||||||
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||||
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||||
_first=1
|
_first=1
|
||||||
|
pos=0
|
||||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||||
|
pos=$((pos + 1))
|
||||||
[ "${_first}" = "1" ] || sleep 3
|
[ "${_first}" = "1" ] || sleep 3
|
||||||
_first=0
|
_first=0
|
||||||
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
|
||||||
|
deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
|
||||||
|
run_john_loop "${opencl_id}" "${deadline}" &
|
||||||
pid=$!
|
pid=$!
|
||||||
PIDS="${PIDS} ${pid}"
|
PIDS="${PIDS} ${pid}"
|
||||||
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
|
||||||
|
sleep "${STAGGER_SECONDS}"
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
FAIL=0
|
FAIL=0
|
||||||
for pid in ${PIDS}; do
|
for pid in ${PIDS}; do
|
||||||
|
|||||||
@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {
|
|||||||
|
|
||||||
log "kernel: $(uname -r)"
|
log "kernel: $(uname -r)"
|
||||||
|
|
||||||
# Skip if no NVIDIA GPU present (PCI vendor 10de)
|
# Skip if no NVIDIA display/compute GPU is present.
|
||||||
if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
|
# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
|
||||||
|
have_nvidia_gpu() {
|
||||||
|
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
|
}
|
||||||
|
|
||||||
|
if ! have_nvidia_gpu; then
|
||||||
log "no NVIDIA GPU detected — skipping module load"
|
log "no NVIDIA GPU detected — skipping module load"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
@@ -253,6 +258,22 @@ else
|
|||||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
|
||||||
|
# systems CUDA/DCGM can report "system not yet initialized" until fabric
|
||||||
|
# training completes under nvidia-fabricmanager.
|
||||||
|
if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
|
||||||
|
if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager restarted"
|
||||||
|
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager started"
|
||||||
|
else
|
||||||
|
log "WARN: failed to start nvidia-fabricmanager.service"
|
||||||
|
systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: nvidia-fabricmanager.service not installed"
|
||||||
|
fi
|
||||||
|
|
||||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ log() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
have_nvidia_gpu() {
|
have_nvidia_gpu() {
|
||||||
lspci -nn 2>/dev/null | grep -qi '10de:'
|
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
}
|
}
|
||||||
|
|
||||||
service_active() {
|
service_active() {
|
||||||
|
|||||||
Reference in New Issue
Block a user