Compare commits
124 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6112094d45 | |||
| e9a2bc9f9d | |||
|
|
7a8f884664 | ||
|
|
8bf8dfa45b | ||
|
|
6a22199aff | ||
|
|
ddb2bb5d1c | ||
|
|
aa284ae754 | ||
|
|
8512098174 | ||
|
|
6b5d22c194 | ||
|
|
a35e90a93e | ||
|
|
1ced81707f | ||
|
|
679aeb9947 | ||
|
|
647e99b697 | ||
|
|
4af997f436 | ||
|
|
6caace0cc0 | ||
|
|
5f0103635b | ||
|
|
84a2551dc0 | ||
|
|
1cfabc9230 | ||
|
|
5dc711de23 | ||
|
|
ab802719f8 | ||
|
|
a94e8007f8 | ||
| c69bf07b27 | |||
| b3cf8e3893 | |||
| 17118298bd | |||
| 65bcc9ce81 | |||
| 0cdfbc5875 | |||
| cf9b54b600 | |||
| 0bfb3fe954 | |||
| 3053cb0710 | |||
| 2038489961 | |||
| e35484013e | |||
| 2cdf034bb0 | |||
| b89580c24d | |||
| df1385d3d6 | |||
| f8cd9a7376 | |||
| d52ec67f8f | |||
| 61c7abaa80 | |||
| d60f7758ba | |||
| 52c3a24b76 | |||
| 028bb30333 | |||
| 7d64e5d215 | |||
| 51b721aeb3 | |||
| bac89bb6e5 | |||
| 7a618da1f9 | |||
| 64ae1c0ff0 | |||
| 49050ca717 | |||
| 5ba72ab315 | |||
| 63363e9629 | |||
|
|
5285c0d101 | ||
|
|
dca4afb8d0 | ||
|
|
b4280941f5 | ||
|
|
f74976ec4c | ||
|
|
18e24a9aa5 | ||
|
|
e306250da7 | ||
|
|
c5b2081ac9 | ||
| 434528083e | |||
| 30aa30cd67 | |||
| 4f76e1de21 | |||
| 3732e64a4a | |||
| 0d925299ff | |||
| a8d5e019a5 | |||
| 72ec086568 | |||
| 7a0b0934df | |||
| d8ca0dca2c | |||
| d90250f80a | |||
| 8d6eaef5de | |||
| 732bf4cbab | |||
| fa6d905a10 | |||
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 | ||
|
|
0317dc58fd | ||
|
|
1c5cb45698 | ||
|
|
090b92ca73 | ||
|
|
2dccbc010c | ||
| e84c69d360 | |||
| c80a39e7ac | |||
| a5e0261ff2 | |||
| ee422ede3c | |||
| d560b2fead | |||
| 3cf2e9c9dc | |||
| 19dbabd71d | |||
| a6a07f2626 | |||
| f87461ee4a | |||
| a636146dbd | |||
|
|
303de2df04 | ||
|
|
95124d228f | ||
|
|
54338dbae5 | ||
|
|
2be7ae6d28 | ||
|
|
b1a5035edd | ||
|
|
8fc986c933 | ||
|
|
88b5e0edf2 | ||
|
|
82fe1f6d26 | ||
| 81e7c921f8 | |||
| 0fb8f2777f | |||
| bf182daa89 | |||
| 457ea1cf04 | |||
| bf6ecab4f0 | |||
| 02e44b1172 | |||
| 2ceaa0d0ca | |||
| 9482ba20a2 | |||
| 813e2f86a9 | |||
| 58a6da9b44 | |||
| f4a19c0a00 | |||
| 9e3dcf9b4d | |||
| 098e19f760 | |||
| e16d0f34b5 | |||
|
|
525ed8b8fc | ||
|
|
4f94ebcb2c | ||
|
|
05c1fde233 | ||
| 825ef6b98a | |||
| ba16021cdb | |||
|
|
bb1218ddd4 | ||
|
|
65faae8ede | ||
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,3 +2,5 @@
|
||||
.DS_Store
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
audit/bee
|
||||
|
||||
@@ -5,22 +5,18 @@ go 1.25.0
|
||||
replace reanimator/chart => ../internal/chart
|
||||
|
||||
require (
|
||||
github.com/go-analyze/charts v0.5.26
|
||||
modernc.org/sqlite v1.48.0
|
||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
golang.org/x/image v0.24.0 // indirect
|
||||
golang.org/x/sys v0.42.0 // indirect
|
||||
modernc.org/libc v1.70.0 // indirect
|
||||
modernc.org/libc v1.72.0 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
modernc.org/sqlite v1.48.0 // indirect
|
||||
)
|
||||
|
||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||
|
||||
@@ -19,18 +19,22 @@ import (
|
||||
)
|
||||
|
||||
// Default on-disk locations for everything written under the export
// directory.
//
// These are variables, not constants, so tests can temporarily reassign them.
// Each derived path is computed once at package initialisation, so reassigning
// DefaultExportDir later does NOT update the paths built from it.
var (
	DefaultExportDir       = "/appdata/bee/export"
	DefaultAuditJSONPath   = DefaultExportDir + "/bee-audit.json"
	DefaultAuditLogPath    = DefaultExportDir + "/bee-audit.log"
	DefaultWebLogPath      = DefaultExportDir + "/bee-web.log"
	DefaultNetworkLogPath  = DefaultExportDir + "/bee-network.log"
	DefaultNvidiaLogPath   = DefaultExportDir + "/bee-nvidia.log"
	DefaultSSHLogPath      = DefaultExportDir + "/bee-sshsetup.log"
	DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
	DefaultRuntimeLogPath  = DefaultExportDir + "/runtime-health.log"
	DefaultTechDumpDir     = DefaultExportDir + "/techdump"
	DefaultSATBaseDir      = DefaultExportDir + "/bee-sat"

	// DefaultBenchmarkBaseDir is the earlier single benchmark directory. It is
	// kept so existing references keep compiling; the bee-bench directories
	// below are the per-kind locations.
	DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"

	// Benchmark outputs live under one bee-bench root, split by kind, with the
	// saved power-source autotune result stored next to the sub-directories.
	DefaultBeeBenchBaseDir               = DefaultExportDir + "/bee-bench"
	DefaultBeeBenchAutotuneDir           = DefaultBeeBenchBaseDir + "/autotune"
	DefaultBeeBenchPerfDir               = DefaultBeeBenchBaseDir + "/perf"
	DefaultBeeBenchPowerDir              = DefaultBeeBenchBaseDir + "/power"
	DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
)
|
||||
|
||||
type App struct {
|
||||
@@ -84,6 +88,7 @@ type installer interface {
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
LiveBootSource() platform.LiveBootSource
|
||||
LiveMediaRAMState() platform.LiveMediaRAMState
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
@@ -108,6 +113,10 @@ func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||
return a.installer.LiveBootSource()
|
||||
}
|
||||
|
||||
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
|
||||
return a.installer.LiveMediaRAMState()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
@@ -117,7 +126,9 @@ type satRunner interface {
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
@@ -138,7 +149,7 @@ type satRunner interface {
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -190,6 +201,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
@@ -295,7 +307,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
}
|
||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||
data, err := os.ReadFile(DefaultAuditJSONPath)
|
||||
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -561,16 +573,66 @@ func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOp
|
||||
|
||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBenchmarkBaseDir
|
||||
baseDir = DefaultBeeBenchPerfDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchAutotuneDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||
}
|
||||
|
||||
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
if logFunc != nil {
|
||||
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||
}
|
||||
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||
if err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -728,8 +790,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
@@ -926,6 +995,41 @@ func bodyOr(body, fallback string) string {
|
||||
return body
|
||||
}
|
||||
|
||||
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||
// component-status DB so they are visible in the Hardware Summary card.
|
||||
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||
if db == nil || len(psus) == 0 {
|
||||
return
|
||||
}
|
||||
const source = "audit:ipmi"
|
||||
worstStatus := "OK"
|
||||
for _, psu := range psus {
|
||||
if psu.Status == nil {
|
||||
continue
|
||||
}
|
||||
slot := "?"
|
||||
if psu.Slot != nil {
|
||||
slot = *psu.Slot
|
||||
}
|
||||
st := *psu.Status
|
||||
detail := ""
|
||||
if psu.ErrorDescription != nil {
|
||||
detail = *psu.ErrorDescription
|
||||
}
|
||||
db.Record("psu:"+slot, source, st, detail)
|
||||
switch st {
|
||||
case "Critical":
|
||||
worstStatus = "Critical"
|
||||
case "Warning":
|
||||
if worstStatus != "Critical" {
|
||||
worstStatus = "Warning"
|
||||
}
|
||||
}
|
||||
}
|
||||
db.Record("psu:all", source, worstStatus, "")
|
||||
}
|
||||
|
||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -122,11 +123,14 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||
runNCCLFn func(string, []int) (string, error)
|
||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
@@ -154,6 +158,20 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPowerBenchFn != nil {
|
||||
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
|
||||
if f.runNvidiaAutotuneFn != nil {
|
||||
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaTargetedStressFn != nil {
|
||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||
@@ -161,7 +179,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaComputeFn != nil {
|
||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
@@ -279,10 +297,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNCCLFn != nil {
|
||||
return f.runNCCLFn(baseDir, gpuIndices)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotBaseDir string
|
||||
var gotGPUIndices []int
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||
gotBaseDir = baseDir
|
||||
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||
return "/tmp/nccl-tests.tar.gz", nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("RunNCCLTests error: %v", err)
|
||||
}
|
||||
if path != "/tmp/nccl-tests.tar.gz" {
|
||||
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||
}
|
||||
if gotBaseDir != "/tmp/sat" {
|
||||
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||
}
|
||||
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -542,8 +593,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -580,8 +629,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -643,8 +690,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
@@ -773,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -800,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
var auditJSON string
|
||||
var manifest string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
@@ -816,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
}
|
||||
auditJSON = string(body)
|
||||
}
|
||||
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read manifest entry: %v", err)
|
||||
}
|
||||
manifest = string(body)
|
||||
}
|
||||
}
|
||||
|
||||
for _, want := range []string{
|
||||
@@ -859,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(manifest, "files:") {
|
||||
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
|
||||
}
|
||||
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
|
||||
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
|
||||
@@ -2,10 +2,29 @@ package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// readFileLimited reads path into memory, refusing files larger than maxBytes.
// Prevents OOM on corrupted or unexpectedly large data files.
//
// The error from opening the file is returned unwrapped so callers can keep
// testing it with os.IsNotExist. A file of exactly maxBytes bytes is accepted;
// anything larger yields a "too large" error and no data.
func readFileLimited(path string, maxBytes int64) ([]byte, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Fast path: reject an oversized regular file from its metadata instead of
	// first reading up to the limit into memory. Stat failures and non-regular
	// files fall through to the bounded read below.
	if info, statErr := f.Stat(); statErr == nil && info.Mode().IsRegular() && info.Size() > maxBytes {
		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
	}

	// Read one byte past the limit so an oversized stream is detectable, but
	// never let maxBytes+1 overflow: a negative limit makes io.LimitReader
	// report EOF immediately, which would silently return empty data.
	const maxInt64 = 1<<63 - 1
	limit := maxBytes
	if limit < maxInt64 {
		limit++
	}

	data, err := io.ReadAll(io.LimitReader(f, limit))
	if err != nil {
		return nil, err
	}
	if int64(len(data)) > maxBytes {
		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
	}
	return data, nil
}
|
||||
|
||||
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||
|
||||
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
data, err := readFileLimited(path, 10<<20)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bee/audit/internal/platform"
|
||||
"compress/gzip"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -22,6 +23,8 @@ var supportBundleServices = []string{
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
"nvidia-dcgm.service",
|
||||
"nvidia-fabricmanager.service",
|
||||
}
|
||||
|
||||
var supportBundleCommands = []struct {
|
||||
@@ -48,13 +51,50 @@ else
|
||||
fi
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
nvidia-smi topo -m 2>&1 || true
|
||||
else
|
||||
echo "nvidia-smi not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v systemctl >/dev/null 2>&1; then
|
||||
echo "systemctl not found"
|
||||
exit 0
|
||||
fi
|
||||
echo "=== unit files ==="
|
||||
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== active units ==="
|
||||
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== failed units ==="
|
||||
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||
`}},
|
||||
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||
for candidate in \
|
||||
/usr/bin/nvidia-fabricmanager \
|
||||
/usr/bin/nv-fabricmanager \
|
||||
/usr/bin/nvidia-fabricmanagerd \
|
||||
/usr/bin/nvlsm; do
|
||||
if [ -e "$candidate" ]; then
|
||||
echo "=== $candidate ==="
|
||||
ls -l "$candidate" 2>&1 || true
|
||||
echo
|
||||
fi
|
||||
done
|
||||
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||
echo "no fabric manager binaries found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
|
||||
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== GPU $gpu ==="
|
||||
lspci -s "$gpu" -vv 2>&1 || true
|
||||
@@ -73,8 +113,13 @@ fi
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
dev=$(basename "$d")
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
class=$(cat "$d/class" 2>/dev/null)
|
||||
case "$class" in
|
||||
0x030000|0x030200) ;;
|
||||
*) continue ;;
|
||||
esac
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
@@ -190,6 +235,10 @@ var supportBundleOptionalFiles = []struct {
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||
@@ -208,7 +257,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("15:04:05")
|
||||
tod := now.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
@@ -376,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
|
||||
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
|
||||
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
|
||||
if strings.TrimSpace(cfg.Reason) != "" {
|
||||
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&body, "\nfiles:\n")
|
||||
|
||||
var files []string
|
||||
|
||||
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
|
||||
defer cancel()
|
||||
return exec.CommandContext(ctx, name, args...).Output()
|
||||
}
|
||||
|
||||
func interfaceHasCarrier(iface string) bool {
|
||||
raw, err := readNetCarrierFile(iface)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.TrimSpace(raw) == "1"
|
||||
}
|
||||
|
||||
@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
}
|
||||
|
||||
if interfaceHasCarrier(iface) {
|
||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||
enriched++
|
||||
continue
|
||||
}
|
||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||
enriched++
|
||||
continue
|
||||
}
|
||||
}
|
||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
}
|
||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||
val := strings.TrimSpace(trimmed[idx+1:])
|
||||
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||
continue
|
||||
}
|
||||
|
||||
switch {
|
||||
case key == "identifier":
|
||||
s := parseSFPIdentifier(val)
|
||||
dev.SFPIdentifier = &s
|
||||
t := true
|
||||
dev.SFPPresent = &t
|
||||
changed = true
|
||||
case key == "connector":
|
||||
s := parseSFPConnector(val)
|
||||
dev.SFPConnector = &s
|
||||
changed = true
|
||||
case key == "vendor name":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPVendor = &s
|
||||
changed = true
|
||||
case key == "vendor pn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPPartNumber = &s
|
||||
changed = true
|
||||
case key == "vendor sn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPSerialNumber = &s
|
||||
changed = true
|
||||
case strings.Contains(key, "laser wavelength"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPWavelengthNM = &f
|
||||
changed = true
|
||||
}
|
||||
case strings.Contains(key, "module temperature"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPTemperatureC = &f
|
||||
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
return changed
|
||||
}
|
||||
|
||||
// parseSFPIdentifier extracts the human-readable transceiver type from the
|
||||
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
|
||||
func parseSFPIdentifier(val string) string {
|
||||
if s := extractParens(val); s != "" {
|
||||
return s
|
||||
}
|
||||
return val
|
||||
}
|
||||
|
||||
// parseSFPConnector extracts the connector type from the raw ethtool line,
|
||||
// e.g. "0x07 (LC)" → "LC".
|
||||
func parseSFPConnector(val string) string {
|
||||
if s := extractParens(val); s != "" {
|
||||
return s
|
||||
}
|
||||
return val
|
||||
}
|
||||
|
||||
// parenRe matches the first parenthesised group that contains at least one
// character other than ')'.
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)

// extractParens returns the whitespace-trimmed text inside the first
// parenthesised group of s, or "" when s has no such group.
func extractParens(s string) string {
	if groups := parenRe.FindStringSubmatch(s); len(groups) >= 2 {
		return strings.TrimSpace(groups[1])
	}
	return ""
}
|
||||
|
||||
func parseSFPDOM(raw string) map[string]any {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||
return map[string]any{}
|
||||
}
|
||||
out := map[string]any{}
|
||||
if dev.SFPPresent != nil {
|
||||
out["sfp_present"] = *dev.SFPPresent
|
||||
}
|
||||
if dev.SFPIdentifier != nil {
|
||||
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||
}
|
||||
if dev.SFPConnector != nil {
|
||||
out["sfp_connector"] = *dev.SFPConnector
|
||||
}
|
||||
if dev.SFPVendor != nil {
|
||||
out["sfp_vendor"] = *dev.SFPVendor
|
||||
}
|
||||
if dev.SFPPartNumber != nil {
|
||||
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||
}
|
||||
if dev.SFPSerialNumber != nil {
|
||||
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||
}
|
||||
if dev.SFPWavelengthNM != nil {
|
||||
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||
}
|
||||
if dev.SFPTemperatureC != nil {
|
||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||
}
|
||||
|
||||
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
|
||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) {
|
||||
t.Fatal("ethtool -m should not be called without carrier")
|
||||
return "", nil
|
||||
}
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
|
||||
@@ -15,6 +15,7 @@ const nvidiaVendorID = 0x10de
|
||||
type nvidiaGPUInfo struct {
|
||||
Index int
|
||||
BDF string
|
||||
Name string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
@@ -73,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
continue
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(info.Name); v != "" {
|
||||
devs[i].Model = &v
|
||||
}
|
||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||
devs[i].SerialNumber = &v
|
||||
}
|
||||
@@ -99,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -123,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 13 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||
if len(rec) < 14 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -135,17 +139,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
info := nvidiaGPUInfo{
|
||||
Index: parseRequiredInt(rec[0]),
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||
Name: strings.TrimSpace(rec[2]),
|
||||
Serial: strings.TrimSpace(rec[3]),
|
||||
VBIOS: strings.TrimSpace(rec[4]),
|
||||
TemperatureC: parseMaybeFloat(rec[5]),
|
||||
PowerW: parseMaybeFloat(rec[6]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[7]),
|
||||
ECCCorrected: parseMaybeInt64(rec[8]),
|
||||
HWSlowdown: parseMaybeBool(rec[9]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[12]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[13]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatalf("gpu by normalized bdf not found")
|
||||
}
|
||||
if gpu.Name != "NVIDIA H100 80GB HBM3" {
|
||||
t.Fatalf("name: got %q", gpu.Name)
|
||||
}
|
||||
if gpu.Serial != "GPU-SERIAL-1" {
|
||||
t.Fatalf("serial: got %q", gpu.Serial)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
@@ -79,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
|
||||
// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
|
||||
if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
|
||||
bmcPatterns := []string{
|
||||
"management system chip",
|
||||
"management controller",
|
||||
"ibmc",
|
||||
"idrac",
|
||||
"ilo vga",
|
||||
"aspeed",
|
||||
"matrox",
|
||||
}
|
||||
for _, bad := range bmcPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||
internalAMDPatterns := []string{
|
||||
"dummy function",
|
||||
@@ -153,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// SVendor/SDevice available but not in schema — skip
|
||||
|
||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
return dev
|
||||
}
|
||||
|
||||
@@ -222,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||
// speed is below the maximum negotiated speed supported by both ends.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
}
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
warn := statusWarning
|
||||
dev.Status = &warn
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
}
|
||||
}
|
||||
|
||||
// pcieLinkSpeedRank maps a normalized generation string to its number
// ("Gen1" → 1 … "Gen6" → 6). Any other input, including different casing or
// surrounding whitespace, yields 0.
func pcieLinkSpeedRank(gen string) int {
	const prefix = "Gen"
	// Exactly one character must follow the prefix.
	if len(gen) != len(prefix)+1 || gen[:len(prefix)] != prefix {
		return 0
	}
	digit := gen[len(prefix)]
	if digit < '1' || digit > '6' {
		return 0
	}
	return int(digit - '0')
}
|
||||
|
||||
func normalizePCILinkSpeed(raw string) string {
|
||||
raw = strings.TrimSpace(strings.ToLower(raw))
|
||||
switch {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -29,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
{name: "raid", class: "RAID bus controller", want: true},
|
||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||
{name: "vga", class: "VGA compatible controller", want: true},
|
||||
{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
|
||||
{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
|
||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||
}
|
||||
|
||||
@@ -139,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
|
||||
ptr := func(s string) *string { return &s }
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
linkSpeed *string
|
||||
maxSpeed *string
|
||||
wantWarning bool
|
||||
wantGenIn string // substring expected in ErrorDescription when warning
|
||||
}{
|
||||
{
|
||||
name: "degraded Gen1 vs Gen5",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen1",
|
||||
},
|
||||
{
|
||||
name: "at max Gen5",
|
||||
linkSpeed: ptr("Gen5"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "degraded Gen4 vs Gen5",
|
||||
linkSpeed: ptr("Gen4"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen4",
|
||||
},
|
||||
{
|
||||
name: "missing current speed — no warning",
|
||||
linkSpeed: nil,
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "missing max speed — no warning",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: nil,
|
||||
wantWarning: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
ok := statusOK
|
||||
dev.Status = &ok
|
||||
dev.LinkSpeed = tt.linkSpeed
|
||||
dev.MaxLinkSpeed = tt.maxSpeed
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
gotWarn := dev.Status != nil && *dev.Status == statusWarning
|
||||
if gotWarn != tt.wantWarning {
|
||||
t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
|
||||
}
|
||||
if tt.wantWarning {
|
||||
if dev.ErrorDescription == nil {
|
||||
t.Fatal("expected ErrorDescription to be set")
|
||||
}
|
||||
if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
|
||||
t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
|
||||
}
|
||||
} else {
|
||||
if dev.ErrorDescription != nil {
|
||||
t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,11 +160,57 @@ type psuSDR struct {
|
||||
}
|
||||
|
||||
var psuSlotPatterns = []*regexp.Regexp{
|
||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
|
||||
// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
|
||||
// does not fire after the digit; match explicitly with underscore terminator.
|
||||
regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
|
||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
|
||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), // Bay 1
|
||||
// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
|
||||
// Must be last: "power supply N" is already caught by the pattern above.
|
||||
regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
|
||||
}
|
||||
|
||||
// isPSUInputPower reports whether name looks like an AC-input power sensor.
// Matching is a case-sensitive substring test against lowercase keywords, so
// callers are expected to pass an already lowercased name.
//
// Vendor examples:
//
//	MSI:     PSU1_POWER_IN, PSU1_PIN
//	MLT:     PSU1_PIN
//	xFusion: (matched via default fallback — no explicit keyword)
//	HPE:     PS1 Input Power, PS1 Input Watts
func isPSUInputPower(name string) bool {
	keywords := [...]string{
		"input power",
		"input watts",
		"_pin",
		" pin",
		"_power_in",
		"power_in",
	}
	for _, keyword := range keywords {
		if strings.Contains(name, keyword) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// isPSUOutputPower reports whether name looks like a DC-output power sensor.
// Matching is a case-sensitive substring test against lowercase keywords, so
// callers are expected to pass an already lowercased name.
//
// Vendor examples:
//
//	MSI:     PSU1_POWER_OUT
//	MLT:     PSU1_POUT
//	xFusion: PS1 POut
//
// Names containing "power supply bay" or "psu bay" are also treated as output
// power readings.
func isPSUOutputPower(name string) bool {
	keywords := [...]string{
		"output power",
		"output watts",
		"_pout",
		" pout",
		"_power_out",
		"power_out",
		"power supply bay",
		"psu bay",
	}
	for _, keyword := range keywords {
		if strings.Contains(name, keyword) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// parseBoundedFloat parses a numeric value from an SDR value field and
|
||||
// validates it is within (0, max]. Returns nil for zero, negative, or
|
||||
// out-of-range values — these indicate missing/off/fault sensor readings.
|
||||
func parseBoundedFloat(raw string, max float64) *float64 {
|
||||
v := parseFloatPtr(raw)
|
||||
if v == nil || *v <= 0 || *v > max {
|
||||
return nil
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
@@ -194,24 +240,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
|
||||
lowerName := strings.ToLower(name)
|
||||
switch {
|
||||
case strings.Contains(lowerName, "input power"):
|
||||
entry.inputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "output power"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case isPSUInputPower(lowerName):
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
case isPSUOutputPower(lowerName):
|
||||
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||
entry.inputVoltage = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "temp"):
|
||||
entry.temperatureC = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||
entry.healthPct = parsePercentPtr(value)
|
||||
default:
|
||||
// Generic PSU power reading: sensor matched a slot pattern but carries
|
||||
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
||||
// AC input if the value looks like wattage and no better data is set yet.
|
||||
if entry.inputPowerW == nil {
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
}
|
||||
}
|
||||
out[slot] = entry
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// PSUSlotPower carries the SDR power readings for a single PSU slot.
//
// In the map returned by PSUSlotsFromSDR each value is keyed by the slot's
// 0-based index rendered as a string, matching HardwarePowerSupply.Slot in the
// audit schema. Unset fields are omitted from the JSON encoding.
type PSUSlotPower struct {
	// InputW is the input power reading; nil when none was recorded.
	InputW *float64 `json:"input_w,omitempty"`
	// OutputW is the output power reading; nil when none was recorded.
	OutputW *float64 `json:"output_w,omitempty"`
	// Status is the status string taken from the SDR entry, if any.
	Status string `json:"status,omitempty"`
}
|
||||
|
||||
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
||||
// using the same battle-tested slot patterns as the hardware audit collector.
|
||||
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
||||
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
||||
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
||||
sdr := parsePSUSDR(sdrOutput)
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]PSUSlotPower, len(sdr))
|
||||
for slot, entry := range sdr {
|
||||
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
||||
out[key] = PSUSlotPower{
|
||||
InputW: entry.inputPowerW,
|
||||
OutputW: entry.outputPowerW,
|
||||
Status: entry.status,
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
|
||||
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
{name: "PWS1 Status", want: 1},
|
||||
{name: "Power Supply Bay 8", want: 8},
|
||||
{name: "PS 6 Input Power", want: 6},
|
||||
// MSI underscore format — \b does not fire between digit and '_'
|
||||
{name: "PSU1_POWER_IN", want: 1},
|
||||
{name: "PSU2_POWER_OUT", want: 2},
|
||||
{name: "PSU4_STATUS", want: 4},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePSUSDRMSIFormat(t *testing.T) {
|
||||
t.Parallel()
|
||||
raw := `
|
||||
PSU1_STATUS | F1h | ok
|
||||
PSU1_POWER_OUT | 928 Watts | ok
|
||||
PSU1_POWER_IN | 976 Watts | ok
|
||||
PSU2_STATUS | F2h | ok
|
||||
PSU2_POWER_OUT | 944 Watts | ok
|
||||
PSU2_POWER_IN | 992 Watts | ok
|
||||
`
|
||||
got := parsePSUSDR(raw)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d want 2", len(got))
|
||||
}
|
||||
if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
|
||||
t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
|
||||
}
|
||||
if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
|
||||
t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
|
||||
}
|
||||
if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
|
||||
t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
@@ -0,0 +1,735 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Defaults and tunables for benchmark power-source autotuning.
const (
	// benchmarkPowerAutotuneVersion is the config version written by
	// SaveBenchmarkPowerAutotuneConfig when the caller leaves Version unset.
	benchmarkPowerAutotuneVersion = 1

	// NOTE(review): presumably the idle-phase sampling window in seconds —
	// its use is outside this part of the file; confirm.
	benchmarkPowerAutotuneIdleSec = 60

	// NOTE(review): presumably the load-phase sampling window in seconds —
	// its use is outside this part of the file; confirm.
	benchmarkPowerAutotuneLoadSec = 90

	// benchmarkPowerAutotuneSampleInterval is the sampling interval applied by
	// startSelectedPowerSourceSampler when its intervalSec argument is <= 0.
	benchmarkPowerAutotuneSampleInterval = 3

	// defaultBenchmarkPowerSourceConfigPath is what
	// BenchmarkPowerSourceConfigPath returns for a blank base directory.
	defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
)
|
||||
|
||||
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||
baseDir = strings.TrimSpace(baseDir)
|
||||
if baseDir == "" {
|
||||
return defaultBenchmarkPowerSourceConfigPath
|
||||
}
|
||||
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||
}
|
||||
|
||||
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var cfg BenchmarkPowerAutotuneConfig
|
||||
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||
}
|
||||
return &cfg, nil
|
||||
}
|
||||
|
||||
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return fmt.Errorf("empty autotune config path")
|
||||
}
|
||||
if cfg.Version <= 0 {
|
||||
cfg.Version = benchmarkPowerAutotuneVersion
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := json.MarshalIndent(cfg, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, path)
|
||||
}
|
||||
|
||||
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||
}
|
||||
|
||||
// ResetBenchmarkPowerAutotuneConfig deletes the autotune config at path.
// A file that does not exist counts as success. A blank path is rejected with
// an error; any other removal error is returned as-is.
func ResetBenchmarkPowerAutotuneConfig(path string) error {
	if strings.TrimSpace(path) == "" {
		return fmt.Errorf("empty autotune config path")
	}
	err := os.Remove(path)
	if err == nil || os.IsNotExist(err) {
		return nil
	}
	return err
}
|
||||
|
||||
func normalizeBenchmarkPowerSource(source string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||
case BenchmarkPowerSourceSDRPSUInput:
|
||||
return BenchmarkPowerSourceSDRPSUInput
|
||||
default:
|
||||
return BenchmarkPowerSourceDCMI
|
||||
}
|
||||
}
|
||||
|
||||
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
|
||||
cfg, err := LoadSystemPowerSourceConfig(exportDir)
|
||||
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
|
||||
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: true,
|
||||
SelectedSource: selected,
|
||||
EffectiveSource: selected,
|
||||
Mode: "autotuned",
|
||||
Reason: strings.TrimSpace(cfg.Reason),
|
||||
ConfiguredAt: cfg.UpdatedAt,
|
||||
}
|
||||
}
|
||||
|
||||
sources := sampleBenchmarkPowerSources()
|
||||
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: false,
|
||||
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
|
||||
Mode: "fallback",
|
||||
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
|
||||
}
|
||||
}
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: false,
|
||||
EffectiveSource: BenchmarkPowerSourceDCMI,
|
||||
Mode: "fallback",
|
||||
Reason: "autotune config not found; using temporary fallback source dcmi",
|
||||
}
|
||||
}
|
||||
|
||||
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||
decision := ResolveSystemPowerDecision(exportDir)
|
||||
if decision.EffectiveSource != "" {
|
||||
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||
return value, decision, nil
|
||||
} else if decision.Configured {
|
||||
fallback := BenchmarkPowerSourceDCMI
|
||||
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||
}
|
||||
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||
decision.Mode = "degraded"
|
||||
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||
decision.EffectiveSource = fallback
|
||||
return fallbackValue, decision, nil
|
||||
}
|
||||
decision.Mode = "degraded"
|
||||
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||
return 0, decision, err
|
||||
}
|
||||
}
|
||||
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||
}
|
||||
|
||||
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||
switch normalizeBenchmarkPowerSource(source) {
|
||||
case BenchmarkPowerSourceSDRPSUInput:
|
||||
sdr := sampleIPMISDRPowerSensors()
|
||||
if sdr.PSUInW > 0 {
|
||||
return sdr.PSUInW, nil
|
||||
}
|
||||
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||
default:
|
||||
return queryIPMIServerPowerW()
|
||||
}
|
||||
}
|
||||
|
||||
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||
out := map[string]float64{}
|
||||
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||
out[BenchmarkPowerSourceDCMI] = w
|
||||
}
|
||||
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||
if durationSec <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
|
||||
if durationSec <= 0 {
|
||||
return nil
|
||||
}
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||
}
|
||||
close(stopCh)
|
||||
return <-doneCh
|
||||
}
|
||||
|
||||
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
|
||||
if intervalSec <= 0 {
|
||||
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||
}
|
||||
ch := make(chan []float64, 1)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
var samples []float64
|
||||
record := func() {
|
||||
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
|
||||
samples = append(samples, w)
|
||||
}
|
||||
}
|
||||
record()
|
||||
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
ch <- samples
|
||||
return
|
||||
case <-ticker.C:
|
||||
record()
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
// benchmarkPowerAutotuneSample is one telemetry row captured during an
// autotune phase by collectBenchmarkPowerAutotuneSamples.
type benchmarkPowerAutotuneSample struct {
	// ElapsedSec is the time in seconds since the phase started sampling.
	ElapsedSec float64
	// GPUAvgUsagePct is the mean UsagePct over the sampled GPUs; it stays 0
	// when GPU metrics could not be read.
	GPUAvgUsagePct float64
	// CPUUsagePct is the value reported by sampleCPULoadPct.
	CPUUsagePct float64
	// GPUSumPowerW is the sum of PowerW over the sampled GPUs.
	GPUSumPowerW float64
	// Sources holds the per-source server power readings keyed by source
	// name, as returned by sampleBenchmarkPowerSources.
	Sources map[string]float64
}
|
||||
|
||||
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
|
||||
if durationSec <= 0 {
|
||||
return nil
|
||||
}
|
||||
var out []benchmarkPowerAutotuneSample
|
||||
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||
start := time.Now()
|
||||
for {
|
||||
if ctx.Err() != nil {
|
||||
return out
|
||||
}
|
||||
row := benchmarkPowerAutotuneSample{
|
||||
ElapsedSec: time.Since(start).Seconds(),
|
||||
CPUUsagePct: sampleCPULoadPct(),
|
||||
Sources: sampleBenchmarkPowerSources(),
|
||||
}
|
||||
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
|
||||
var usageSum float64
|
||||
for _, gpu := range gpuRows {
|
||||
row.GPUSumPowerW += gpu.PowerW
|
||||
usageSum += gpu.UsagePct
|
||||
}
|
||||
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
|
||||
}
|
||||
out = append(out, row)
|
||||
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
|
||||
if time.Now().After(deadline) {
|
||||
return out
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return out
|
||||
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||
if logFunc == nil {
|
||||
return
|
||||
}
|
||||
var sourceParts []string
|
||||
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||
} else {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||
}
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||
phase,
|
||||
sample.ElapsedSec,
|
||||
sample.GPUAvgUsagePct,
|
||||
sample.GPUSumPowerW,
|
||||
sample.CPUUsagePct,
|
||||
strings.Join(sourceParts, " "),
|
||||
))
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||
if logFunc == nil || len(samples) == 0 {
|
||||
return
|
||||
}
|
||||
var gpuUsage []float64
|
||||
var cpuUsage []float64
|
||||
var gpuPower []float64
|
||||
sourceBuckets := map[string][]float64{}
|
||||
for _, sample := range samples {
|
||||
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
var sourceParts []string
|
||||
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||
values := sourceBuckets[source]
|
||||
if len(values) == 0 {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||
continue
|
||||
}
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||
phase,
|
||||
len(samples),
|
||||
benchmarkMean(gpuUsage),
|
||||
benchmarkPercentile(gpuUsage, 95),
|
||||
benchmarkMean(gpuPower),
|
||||
benchmarkMean(cpuUsage),
|
||||
benchmarkPercentile(cpuUsage, 95),
|
||||
strings.Join(sourceParts, " "),
|
||||
))
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||
if logFunc == nil {
|
||||
return
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
if !candidate.Available {
|
||||
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||
continue
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||
candidate.Source,
|
||||
candidate.IdleAvgW,
|
||||
candidate.LoadAvgW,
|
||||
candidate.DeltaW,
|
||||
gpuDelta,
|
||||
candidate.RelativeError,
|
||||
candidate.Confidence*100,
|
||||
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||
))
|
||||
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
|
||||
result := &BenchmarkPowerAutotuneValidation{}
|
||||
if len(samples) == 0 {
|
||||
result.Reason = "no idle telemetry samples collected"
|
||||
return result
|
||||
}
|
||||
var gpuUsage []float64
|
||||
var cpuUsage []float64
|
||||
for _, sample := range samples {
|
||||
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||
if sample.CPUUsagePct > 0 {
|
||||
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||
}
|
||||
}
|
||||
result.GPUSamples = len(gpuUsage)
|
||||
result.CPUSamples = len(cpuUsage)
|
||||
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
|
||||
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
|
||||
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
|
||||
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
|
||||
switch {
|
||||
case result.GPUAvgUsagePct > 5:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
|
||||
case result.GPUP95UsagePct > 10:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
|
||||
case result.CPUAvgUsagePct > 20:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
|
||||
case result.CPUP95UsagePct > 35:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
|
||||
default:
|
||||
result.Valid = true
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
|
||||
idleBySource := map[string][]float64{}
|
||||
loadBySource := map[string][]float64{}
|
||||
var idleGPU []float64
|
||||
var loadGPU []float64
|
||||
for _, sample := range idle {
|
||||
idleGPU = append(idleGPU, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
idleBySource[source] = append(idleBySource[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, sample := range load {
|
||||
loadGPU = append(loadGPU, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
loadBySource[source] = append(loadBySource[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
idleGPUAvg := benchmarkMean(idleGPU)
|
||||
loadGPUAvg := benchmarkMean(loadGPU)
|
||||
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||
if gpuDelta <= 0 {
|
||||
gpuDelta = loadGPUAvg
|
||||
}
|
||||
|
||||
candidates := []BenchmarkPowerAutotuneCandidate{
|
||||
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
|
||||
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
|
||||
}
|
||||
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
|
||||
for _, candidate := range candidates {
|
||||
if candidate.Available && candidate.DeltaW > 0 {
|
||||
available = append(available, candidate)
|
||||
}
|
||||
}
|
||||
if len(available) == 0 {
|
||||
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
|
||||
}
|
||||
sort.Slice(available, func(i, j int) bool {
|
||||
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
|
||||
if available[i].Source != available[j].Source {
|
||||
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
|
||||
}
|
||||
}
|
||||
if available[i].RelativeError != available[j].RelativeError {
|
||||
return available[i].RelativeError < available[j].RelativeError
|
||||
}
|
||||
return available[i].Samples > available[j].Samples
|
||||
})
|
||||
selected := available[0]
|
||||
for idx := range candidates {
|
||||
if candidates[idx].Source == selected.Source {
|
||||
candidates[idx].Selected = true
|
||||
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
|
||||
}
|
||||
}
|
||||
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
|
||||
}
|
||||
|
||||
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||
candidate := BenchmarkPowerAutotuneCandidate{
|
||||
Source: source,
|
||||
Available: len(idle) > 0 && len(load) > 0,
|
||||
Samples: minInt(len(idle), len(load)),
|
||||
}
|
||||
if !candidate.Available {
|
||||
return candidate
|
||||
}
|
||||
candidate.IdleAvgW = benchmarkMean(idle)
|
||||
candidate.LoadAvgW = benchmarkMean(load)
|
||||
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||
if gpuDelta > 0 {
|
||||
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||
}
|
||||
return candidate
|
||||
}
|
||||
|
||||
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "status=%s\n", result.Status)
|
||||
fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
|
||||
fmt.Fprintf(&b, "profile=%s\n", result.Profile)
|
||||
fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
|
||||
fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
|
||||
fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
|
||||
if result.SelectedSource != "" {
|
||||
fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
|
||||
}
|
||||
if result.IdleValidation != nil {
|
||||
fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
|
||||
fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
|
||||
fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
|
||||
if result.IdleValidation.Reason != "" {
|
||||
fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
|
||||
}
|
||||
}
|
||||
for _, candidate := range result.Candidates {
|
||||
fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
|
||||
if candidate.Available {
|
||||
fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
|
||||
fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
|
||||
fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
|
||||
fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
|
||||
var b strings.Builder
|
||||
b.WriteString("# Bee Bench Power Source Autotune\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
|
||||
fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
|
||||
fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
|
||||
fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
|
||||
fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
|
||||
if result.SelectedSource != "" {
|
||||
fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
if result.IdleValidation != nil {
|
||||
b.WriteString("## Idle Validation\n\n")
|
||||
fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
|
||||
fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
|
||||
fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
|
||||
if result.IdleValidation.Reason != "" {
|
||||
fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
if len(result.Candidates) > 0 {
|
||||
b.WriteString("## Candidates\n\n")
|
||||
b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
|
||||
b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
|
||||
for _, candidate := range result.Candidates {
|
||||
if !candidate.Available {
|
||||
fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
|
||||
continue
|
||||
}
|
||||
selected := "no"
|
||||
if candidate.Selected {
|
||||
selected = "yes"
|
||||
}
|
||||
fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
|
||||
candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
|
||||
allDevices := joinIndexList(gpuIndices)
|
||||
switch strings.TrimSpace(strings.ToLower(kind)) {
|
||||
case "power-fit", "power", "nvidia-bench-power":
|
||||
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
|
||||
if err == nil {
|
||||
return cmd, "power-fit"
|
||||
}
|
||||
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
|
||||
default:
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", fmt.Sprintf("%d", durationSec),
|
||||
"--devices", allDevices,
|
||||
}
|
||||
if sizeMB > 0 {
|
||||
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
|
||||
}
|
||||
return cmd, "performance"
|
||||
}
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
if logFunc == nil {
|
||||
logFunc = func(string) {}
|
||||
}
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = "/var/log/bee-bench/autotune"
|
||||
}
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||
}
|
||||
selected, err := resolveNvidiaGPUSelection(nil, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
|
||||
}
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "autotune-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
hostname, _ := os.Hostname()
|
||||
loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
|
||||
result := BenchmarkPowerAutotuneResult{
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Hostname: hostname,
|
||||
ServerModel: readServerModel(),
|
||||
BenchmarkKind: normalizedKind,
|
||||
Profile: opts.Profile,
|
||||
Status: "FAILED",
|
||||
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
|
||||
idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
|
||||
logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
|
||||
result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
|
||||
if result.IdleValidation == nil || !result.IdleValidation.Valid {
|
||||
if result.IdleValidation != nil {
|
||||
result.IdleValidationError = result.IdleValidation.Reason
|
||||
logFunc(result.IdleValidation.Reason)
|
||||
}
|
||||
result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, fmt.Errorf("%s", result.IdleValidationError)
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
|
||||
loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
|
||||
go func() {
|
||||
loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
|
||||
}()
|
||||
out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
|
||||
loadSamples := <-loadSamplesCh
|
||||
logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
|
||||
if runErr != nil {
|
||||
result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, fmt.Errorf("autotune load stage: %w", runErr)
|
||||
}
|
||||
|
||||
selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
|
||||
result.Candidates = candidates
|
||||
result.GPUPowerIdleW = idleGPUAvg
|
||||
result.GPUPowerLoadW = loadGPUAvg
|
||||
if chooseErr != nil {
|
||||
result.Notes = append(result.Notes, chooseErr.Error())
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, chooseErr
|
||||
}
|
||||
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||
if gpuDelta <= 0 {
|
||||
gpuDelta = loadGPUAvg
|
||||
}
|
||||
logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
|
||||
result.SelectedSource = selectedSource
|
||||
result.Status = "OK"
|
||||
var confidence float64
|
||||
selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
|
||||
for _, candidate := range candidates {
|
||||
if candidate.Selected {
|
||||
confidence = candidate.Confidence
|
||||
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||
selectionReason = candidate.SelectionNotes
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
cfg := BenchmarkPowerAutotuneConfig{
|
||||
Version: benchmarkPowerAutotuneVersion,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
SelectedSource: selectedSource,
|
||||
BenchmarkKind: normalizedKind,
|
||||
Profile: opts.Profile,
|
||||
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||
Confidence: confidence,
|
||||
Reason: selectionReason,
|
||||
}
|
||||
result.Config = &cfg
|
||||
configPath := BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
|
||||
result.Status = "FAILED"
|
||||
result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
|
||||
if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
return runDir, err
|
||||
}
|
||||
logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
|
||||
result.Notes = append(result.Notes, "saved autotune config to "+configPath)
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
|
||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal autotune result: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||||
return fmt.Errorf("write autotune result.json: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
|
||||
return fmt.Errorf("write autotune summary.txt: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
|
||||
return fmt.Errorf("write autotune report.md: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// minInt returns the smaller of a and b (b when they are equal).
func minInt(a, b int) int {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|
||||
|
||||
// Blank-identifier reference to the exec package so its import counts as used
// even if nothing else in this file refers to it.
// NOTE(review): presumably this is os/exec; confirm against the import block.
var _ = exec.ErrNotFound
|
||||
@@ -2,25 +2,15 @@ package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||
return renderBenchmarkReportWithCharts(result, nil)
|
||||
return renderBenchmarkReportWithCharts(result)
|
||||
}
|
||||
|
||||
type benchmarkReportChart struct {
|
||||
Title string
|
||||
Content string
|
||||
}
|
||||
|
||||
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
||||
|
||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
|
||||
// ── Header ────────────────────────────────────────────────────────────────
|
||||
@@ -58,11 +48,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||
}
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
if result.ParallelGPUs {
|
||||
if result.RampStep > 0 && result.RampTotal > 0 {
|
||||
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||||
if result.RampRunID != "" {
|
||||
fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
|
||||
}
|
||||
} else if result.ParallelGPUs {
|
||||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||
}
|
||||
if result.ScalabilityScore > 0 {
|
||||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||
}
|
||||
if result.PlatformPowerScore > 0 {
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||||
}
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
b.WriteString("\n")
|
||||
|
||||
@@ -83,36 +84,164 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Scorecard table ───────────────────────────────────────────────────────
|
||||
b.WriteString("## Scorecard\n\n")
|
||||
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||||
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown"
|
||||
// ── Balanced Scorecard ────────────────────────────────────────────────────
|
||||
b.WriteString("## Balanced Scorecard\n\n")
|
||||
|
||||
// Perspective 1: Compatibility — hard stops
|
||||
b.WriteString("### 1. Compatibility\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
thermalThrottle := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
fanAtThrottle := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
ecc := "-"
|
||||
if gpu.ECC.Uncorrected > 0 {
|
||||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||
}
|
||||
compatStatus := "✓ OK"
|
||||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||
compatStatus = "⛔ HARD STOP"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
|
||||
}
|
||||
interconnect := "-"
|
||||
if gpu.Scores.InterconnectScore > 0 {
|
||||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
|
||||
gpu.Index, name,
|
||||
gpu.Status,
|
||||
gpu.Scores.CompositeScore,
|
||||
gpu.Scores.ComputeScore,
|
||||
topsPerSM,
|
||||
gpu.Scores.PowerSustainScore,
|
||||
gpu.Scores.ThermalSustainScore,
|
||||
gpu.Scores.StabilityScore,
|
||||
interconnect,
|
||||
)
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 2: Thermal headroom
|
||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
}
|
||||
slowdownTemp := gpu.SlowdownTempC
|
||||
if slowdownTemp <= 0 {
|
||||
slowdownTemp = 80
|
||||
}
|
||||
headroom := gpu.Scores.TempHeadroomC
|
||||
thermalStatus := "✓ OK"
|
||||
switch {
|
||||
case headroom < 10:
|
||||
thermalStatus = "⛔ CRITICAL"
|
||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||
thermalStatus = "⚠ WARNING"
|
||||
}
|
||||
throttlePct := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
|
||||
fmt.Sprintf("%.0f°C", slowdownTemp),
|
||||
fmt.Sprintf("%.0f°C", shutdownTemp),
|
||||
fmt.Sprintf("%.1f°C", headroom),
|
||||
throttlePct,
|
||||
thermalStatus,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 3: Power delivery
|
||||
b.WriteString("### 3. Power Delivery\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
powerCap := "-"
|
||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||
}
|
||||
fanDuty := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
powerStatus := "✓ OK"
|
||||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||
powerStatus = "⚠ POWER LIMITED"
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
powerCap,
|
||||
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
|
||||
fanDuty,
|
||||
powerStatus,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 4: Performance
|
||||
b.WriteString("### 4. Performance\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
synthetic := "-"
|
||||
if gpu.Scores.SyntheticScore > 0 {
|
||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||
}
|
||||
mixed := "-"
|
||||
if gpu.Scores.MixedScore > 0 {
|
||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||
}
|
||||
mixedEff := "-"
|
||||
if gpu.Scores.MixedEfficiency > 0 {
|
||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
|
||||
synthetic, mixed, mixedEff, topsPerSM,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 5: Anomaly flags
|
||||
b.WriteString("### 5. Anomalies\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
eccCorr := "-"
|
||||
if gpu.ECC.Corrected > 0 {
|
||||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||
}
|
||||
syncBoost := "-"
|
||||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||
}
|
||||
powerVar := "OK"
|
||||
if gpu.Scores.PowerSustainScore < 70 {
|
||||
powerVar = "⚠ unstable"
|
||||
}
|
||||
thermalVar := "OK"
|
||||
if gpu.Scores.ThermalSustainScore < 70 {
|
||||
thermalVar = "⚠ unstable"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||
b.WriteString("## Per-GPU Details\n\n")
|
||||
@@ -139,20 +268,75 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
if gpu.PowerLimitW > 0 {
|
||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||
}
|
||||
if gpu.PowerLimitDerated {
|
||||
fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
|
||||
}
|
||||
if gpu.CalibratedPeakPowerW > 0 {
|
||||
if gpu.CalibratedPeakTempC > 0 {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||||
} else {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||||
}
|
||||
}
|
||||
if gpu.LockedGraphicsClockMHz > 0 {
|
||||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Steady-state telemetry
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||
b.WriteString("\n")
|
||||
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"", "Avg", "P95"},
|
||||
[][]string{
|
||||
{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
|
||||
{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
|
||||
{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
|
||||
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
|
||||
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||||
}
|
||||
|
||||
// Per-precision stability phases.
|
||||
if len(gpu.PrecisionSteady) > 0 {
|
||||
b.WriteString("**Per-precision stability:**\n\n")
|
||||
var precRows [][]string
|
||||
for _, p := range gpu.PrecisionSteady {
|
||||
eccCorr := "—"
|
||||
eccUncorr := "—"
|
||||
if !p.ECC.IsZero() {
|
||||
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||||
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||||
}
|
||||
status := p.Status
|
||||
if strings.TrimSpace(status) == "" {
|
||||
status = "OK"
|
||||
}
|
||||
precRows = append(precRows, []string{
|
||||
p.Precision, status,
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
|
||||
eccCorr, eccUncorr,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
// Legacy: show combined-window variance.
|
||||
fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
|
||||
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
|
||||
}
|
||||
|
||||
// ECC summary
|
||||
if !gpu.ECC.IsZero() {
|
||||
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
|
||||
gpu.ECC.Corrected, gpu.ECC.Uncorrected)
|
||||
}
|
||||
|
||||
// Throttle
|
||||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||||
@@ -163,14 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
// Precision results
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
b.WriteString("**Precision results:**\n\n")
|
||||
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
|
||||
var presRows [][]string
|
||||
for _, p := range gpu.PrecisionResults {
|
||||
if p.Supported {
|
||||
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
|
||||
presRows = append(presRows, []string{
|
||||
p.Name,
|
||||
fmt.Sprintf("%.2f", p.TeraOpsPerSec),
|
||||
fmt.Sprintf("×%.3g", p.Weight),
|
||||
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
|
||||
fmt.Sprintf("%d", p.Lanes),
|
||||
fmt.Sprintf("%d", p.Iterations),
|
||||
})
|
||||
} else {
|
||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
|
||||
presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
|
||||
}
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
@@ -192,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||
if result.Interconnect.Supported {
|
||||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Avg", "Max"},
|
||||
[][]string{
|
||||
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
|
||||
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Interconnect.Notes {
|
||||
@@ -205,20 +401,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
}
|
||||
}
|
||||
|
||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||||
// ── Server Power ───────────────────────────────────────────────────────────
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server Power (IPMI)\n\n")
|
||||
title := "## Server Power\n\n"
|
||||
if sp.Source != "" {
|
||||
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
|
||||
}
|
||||
b.WriteString(title)
|
||||
if !sp.Available {
|
||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||
b.WriteString("Server power measurement unavailable.\n\n")
|
||||
} else {
|
||||
b.WriteString("| | Value |\n|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||||
spRows := [][]string{
|
||||
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||
{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
|
||||
{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
|
||||
{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
|
||||
}
|
||||
if sp.ReportingRatio > 0 {
|
||||
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
@@ -229,61 +431,72 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
}
|
||||
}
|
||||
|
||||
// ── Terminal charts (steady-state only) ───────────────────────────────────
|
||||
if len(charts) > 0 {
|
||||
b.WriteString("## Steady-State Charts\n\n")
|
||||
for _, chart := range charts {
|
||||
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
||||
if content == "" {
|
||||
continue
|
||||
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||
if len(result.PSUIssues) > 0 {
|
||||
b.WriteString("## PSU Issues\n\n")
|
||||
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
|
||||
for _, issue := range result.PSUIssues {
|
||||
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||
if cooling := result.Cooling; cooling != nil {
|
||||
b.WriteString("## Cooling\n\n")
|
||||
if cooling.Available {
|
||||
dutyAvg, dutyP95 := "N/A", "N/A"
|
||||
if cooling.FanDutyCycleAvailable {
|
||||
dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
|
||||
dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Value"},
|
||||
[][]string{
|
||||
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
|
||||
{"Average fan duty cycle", dutyAvg},
|
||||
{"P95 fan duty cycle", dutyP95},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||
}
|
||||
for _, note := range cooling.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(cooling.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Methodology ───────────────────────────────────────────────────────────
|
||||
b.WriteString("## Methodology\n\n")
|
||||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
|
||||
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
|
||||
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||||
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
|
||||
// ── Platform Scalability ──────────────────────────────────────────────────
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||
var scalRows [][]string
|
||||
for _, step := range result.PerformanceRampSteps {
|
||||
scalRows = append(scalRows, []string{
|
||||
fmt.Sprintf("%d", step.StepIndex),
|
||||
joinIndexList(step.GPUIndices),
|
||||
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
|
||||
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||
b.WriteString("## Raw Files\n\n")
|
||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
|
||||
b.WriteString("- `gpu-*-warmup.log`\n")
|
||||
b.WriteString("- `gpu-*-steady.log`\n")
|
||||
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
|
||||
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
|
||||
b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
|
||||
if result.Interconnect != nil {
|
||||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
|
||||
// cooldown charts are not useful for human review).
|
||||
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
||||
var charts []benchmarkReportChart
|
||||
for _, idx := range gpuIndices {
|
||||
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil || len(raw) == 0 {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, benchmarkReportChart{
|
||||
Title: fmt.Sprintf("GPU %d — Steady State", idx),
|
||||
Content: string(raw),
|
||||
})
|
||||
}
|
||||
return charts
|
||||
}
|
||||
|
||||
func stripANSIEscapeSequences(raw string) string {
|
||||
return ansiEscapePattern.ReplaceAllString(raw, "")
|
||||
}
|
||||
|
||||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||
// duration is unknown (0), raw seconds are shown instead.
|
||||
@@ -323,6 +536,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64)
|
||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||
|
||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package platform
|
||||
|
||||
import (
	"strings"
	"unicode/utf8"
)
|
||||
|
||||
// fmtMDTable renders a markdown table whose columns are padded to a common
// width, so the table is also readable as plain text without a markdown
// renderer.
//
// headers contains the column header strings; when it is empty the result is
// the empty string. rows contains the data rows: a row with fewer cells than
// headers is padded with empty cells, and cells beyond the last header are
// ignored.
//
// Column widths are measured in runes, not bytes, so multi-byte UTF-8 text
// such as "°C", "—" or "✓ OK" does not skew the padding. Wide glyphs that
// occupy two terminal columns are still counted as one.
func fmtMDTable(headers []string, rows [][]string) string {
	ncols := len(headers)
	if ncols == 0 {
		return ""
	}

	// cellAt returns the i-th cell of row, or "" when the row is too short.
	cellAt := func(row []string, i int) string {
		if i < len(row) {
			return row[i]
		}
		return ""
	}

	// Widest cell (in runes) per column, headers included.
	widths := make([]int, ncols)
	for i, h := range headers {
		widths[i] = utf8.RuneCountInString(h)
	}
	for _, row := range rows {
		for i := range widths {
			if w := utf8.RuneCountInString(cellAt(row, i)); w > widths[i] {
				widths[i] = w
			}
		}
	}

	var b strings.Builder

	// writeRow emits one "| a | b |" line, right-padding each cell with
	// spaces up to its column width.
	writeRow := func(row []string) {
		b.WriteByte('|')
		for i, width := range widths {
			cell := cellAt(row, i)
			b.WriteByte(' ')
			b.WriteString(cell)
			b.WriteString(strings.Repeat(" ", width-utf8.RuneCountInString(cell)))
			b.WriteString(" |")
		}
		b.WriteByte('\n')
	}

	// Header row.
	writeRow(headers)

	// Separator row: dashes span the cell plus its two padding spaces.
	b.WriteByte('|')
	for _, width := range widths {
		b.WriteString(strings.Repeat("-", width+2))
		b.WriteByte('|')
	}
	b.WriteByte('\n')

	// Data rows.
	for _, row := range rows {
		writeRow(row)
	}

	return b.String()
}
|
||||
@@ -1,8 +1,13 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
@@ -16,17 +21,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
{
|
||||
name: "default",
|
||||
profile: "",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "stability",
|
||||
profile: "stability",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "overnight",
|
||||
profile: "overnight",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -41,6 +46,216 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if len(labels) != 5 || len(phases) != 5 {
|
||||
t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
|
||||
}
|
||||
if basePhaseSec != 60 {
|
||||
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 300 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
|
||||
}
|
||||
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
|
||||
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 300 {
|
||||
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 3600 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 3600 {
|
||||
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 14400 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
phases := []benchmarkPlannedPhase{
|
||||
{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
|
||||
{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
|
||||
{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
|
||||
}
|
||||
rows := []GPUMetricRow{
|
||||
{ElapsedSec: 5},
|
||||
{ElapsedSec: 15},
|
||||
{ElapsedSec: 25},
|
||||
{ElapsedSec: 65},
|
||||
}
|
||||
got := splitBenchmarkRowsByPlannedPhase(rows, phases)
|
||||
if len(got["fp8"]) != 1 {
|
||||
t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
|
||||
}
|
||||
if len(got["fp16"]) != 1 {
|
||||
t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
|
||||
}
|
||||
if len(got["mixed"]) != 2 {
|
||||
t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
raw string
|
||||
wantStatus string
|
||||
}{
|
||||
{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
|
||||
{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
|
||||
{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
|
||||
if got != tc.wantStatus {
|
||||
t.Fatalf("status=%q want %q", got, tc.wantStatus)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
before := BenchmarkThrottleCounters{}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldExec := satExecCommand
|
||||
benchmarkGeteuid = func() int { return 1000 }
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
t.Fatalf("unexpected command: %s %v", name, args)
|
||||
return nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
satExecCommand = oldExec
|
||||
})
|
||||
|
||||
var logs []string
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||
logs = append(logs, line)
|
||||
})
|
||||
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||
t.Fatalf("logs=%q want substring %q", got, want)
|
||||
}
|
||||
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||
t.Fatalf("failed=%v want [0 2]", failed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "nvidia-smi")
|
||||
argsLog := filepath.Join(dir, "args.log")
|
||||
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
||||
t.Fatalf("write script: %v", err)
|
||||
}
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldLookPath := satLookPath
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "nvidia-smi" {
|
||||
return script, nil
|
||||
}
|
||||
return exec.LookPath(file)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
satLookPath = oldLookPath
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
||||
if len(failed) != 0 {
|
||||
t.Fatalf("failed=%v want no failures", failed)
|
||||
}
|
||||
raw, err := os.ReadFile(argsLog)
|
||||
if err != nil {
|
||||
t.Fatalf("read args log: %v", err)
|
||||
}
|
||||
got := strings.Fields(string(raw))
|
||||
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
||||
if strings.Join(got, " ") != strings.Join(want, " ") {
|
||||
t.Fatalf("args=%v want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -56,6 +271,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
info benchmarkGPUInfo
|
||||
want int
|
||||
}{
|
||||
{
|
||||
name: "prefers default tdp over current derated limit",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 500,
|
||||
DefaultPowerLimitW: 600,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
want: 600,
|
||||
},
|
||||
{
|
||||
name: "caps default tdp to reported max limit",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 500,
|
||||
DefaultPowerLimitW: 700,
|
||||
MaxPowerLimitW: 650,
|
||||
},
|
||||
want: 650,
|
||||
},
|
||||
{
|
||||
name: "falls back to current limit when default missing",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 525,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
want: 525,
|
||||
},
|
||||
{
|
||||
name: "falls back to max limit when only that is known",
|
||||
info: benchmarkGPUInfo{
|
||||
MaxPowerLimitW: 575,
|
||||
},
|
||||
want: 575,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -65,8 +333,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
"[gpu 0] compute_capability=9.0",
|
||||
"[gpu 0] backend=cublasLt",
|
||||
"[gpu 0] duration_s=10",
|
||||
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
|
||||
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||
"[gpu 0] int8_tensor_iterations=80",
|
||||
"[gpu 0] fp16_tensor_iterations=200",
|
||||
"[gpu 0] fp8_e4m3_iterations=50",
|
||||
"[gpu 0] status=OK",
|
||||
@@ -79,15 +349,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
if got.ComputeCapability != "9.0" {
|
||||
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||
}
|
||||
if len(got.Profiles) != 2 {
|
||||
t.Fatalf("profiles=%d want 2", len(got.Profiles))
|
||||
if len(got.Profiles) != 3 {
|
||||
t.Fatalf("profiles=%d want 3", len(got.Profiles))
|
||||
}
|
||||
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||
}
|
||||
if got.Profiles[0].Category != "fp16_bf16" {
|
||||
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
|
||||
}
|
||||
if got.Profiles[1].Category != "fp8" {
|
||||
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||
}
|
||||
if got.Profiles[2].Category != "int8" {
|
||||
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
|
||||
}
|
||||
if got.Profiles[2].Weight != 0.25 {
|
||||
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
@@ -131,6 +410,13 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
DegradationReasons: []string{"power_capped"},
|
||||
},
|
||||
},
|
||||
Cooling: &BenchmarkCoolingSummary{
|
||||
Available: true,
|
||||
AvgFanRPM: 9200,
|
||||
FanDutyCycleAvailable: true,
|
||||
AvgFanDutyCyclePct: 47.5,
|
||||
P95FanDutyCyclePct: 62.0,
|
||||
},
|
||||
}
|
||||
|
||||
report := renderBenchmarkReport(result)
|
||||
@@ -140,6 +426,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
"1176.00",
|
||||
"fp16_tensor",
|
||||
"700.00",
|
||||
"Cooling",
|
||||
"Average fan duty cycle",
|
||||
"47.5%",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
@@ -147,34 +436,141 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
||||
func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
|
||||
report := renderBenchmarkReport(NvidiaBenchmarkResult{
|
||||
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||
OverallStatus: "OK",
|
||||
SelectedGPUIndices: []int{0},
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "full",
|
||||
},
|
||||
}, []benchmarkReportChart{
|
||||
{
|
||||
Title: "GPU 0 Steady State",
|
||||
Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
|
||||
},
|
||||
})
|
||||
|
||||
for _, needle := range []string{
|
||||
"Steady-State Charts",
|
||||
"GPU 0 Steady State",
|
||||
"GPU 0 chart",
|
||||
"42┤───",
|
||||
"gpu-metrics.csv",
|
||||
"gpu-metrics.html",
|
||||
"gpu-burn.log",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
}
|
||||
}
|
||||
if strings.Contains(report, "\x1b[31m") {
|
||||
t.Fatalf("report should not contain ANSI escapes\n%s", report)
|
||||
}
|
||||
|
||||
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
|
||||
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
|
||||
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
|
||||
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
|
||||
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
PrecisionResults: []BenchmarkPrecisionResult{
|
||||
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
|
||||
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
})
|
||||
|
||||
if score.SyntheticScore != 100 {
|
||||
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
|
||||
}
|
||||
if score.MixedScore != 50 {
|
||||
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnrichGPUInfoWithNvidiaSMIQ feeds enrichGPUInfoWithNvidiaSMIQ a two-GPU
// text fixture in the style of `nvidia-smi -q` output and checks the values
// written back into infoByIndex: max graphics/memory clocks for both GPUs and
// the min/max/default/current power limits for GPU 0.
// NOTE(review): the fixture sections are presumably matched to map entries via
// BusID (the "GPU <bus id>" header lines equal the BusID values) — confirm
// against enrichGPUInfoWithNvidiaSMIQ.
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Min Power Limit : 200.00 W
|
||||
Max Power Limit : 600.00 W
|
||||
Default Power Limit : 575.00 W
|
||||
Current Power Limit : 560.00 W
|
||||
Clocks
|
||||
Graphics : 2422 MHz
|
||||
Memory : 12481 MHz
|
||||
Max Clocks
|
||||
Graphics : 2430 MHz
|
||||
SM : 2430 MHz
|
||||
Memory : 12481 MHz
|
||||
Video : 2107 MHz
|
||||
|
||||
GPU 00000000:4F:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Max Clocks
|
||||
Graphics : 2430 MHz
|
||||
Memory : 12481 MHz
|
||||
`)
|
||||
|
||||
// Entries start with only Index and BusID set; every other field is zero.
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {Index: 0, BusID: "00000000:4E:00.0"},
|
||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||
}
|
||||
|
||||
// The function returns nothing; results are read back from the map.
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||
|
||||
// Expected clocks are the "Max Clocks" values (2430), not the current
// "Clocks" Graphics value (2422) that also appears in the GPU 0 section.
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
|
||||
}
|
||||
if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||
}
|
||||
// Power limits are only present in the GPU 0 section of the fixture.
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].PowerLimitW != 560 {
|
||||
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated checks that
// enrichGPUInfoWithNvidiaSMIQ leaves already-populated fields alone: the
// fixture advertises different values (9999 MHz clocks, 100/900 W limits) than
// the ones pre-set on the map entry.
// NOTE(review): four fields are pre-populated but only MaxGraphicsClockMHz and
// MinPowerLimitW are asserted; MaxMemoryClockMHz and MaxPowerLimitW are not
// checked here.
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Min Power Limit : 100.00 W
|
||||
Max Power Limit : 900.00 W
|
||||
Max Clocks
|
||||
Graphics : 9999 MHz
|
||||
Memory : 9999 MHz
|
||||
`)
|
||||
// Already populated — must not be overwritten by the fixture values above.
|
||||
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {
|
||||
Index: 0,
|
||||
BusID: "00000000:4E:00.0",
|
||||
MaxGraphicsClockMHz: 2430,
|
||||
MaxMemoryClockMHz: 12481,
|
||||
MinPowerLimitW: 200,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||
|
||||
// A value of 9999 here would mean the fixture overwrote the preset clock.
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
// A value of 100 here would mean the fixture overwrote the preset limit.
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,37 +2,192 @@ package platform
|
||||
|
||||
import "time"
|
||||
|
||||
// BenchmarkHostConfig records the host's static CPU and memory configuration
// as captured when a benchmark starts, so results from runs on different
// hardware can be correlated. Every field is omitted from JSON when zero.
type BenchmarkHostConfig struct {
	CPUModel    string  `json:"cpu_model,omitempty"`
	CPUSockets  int     `json:"cpu_sockets,omitempty"`
	CPUCores    int     `json:"cpu_cores,omitempty"`
	CPUThreads  int     `json:"cpu_threads,omitempty"`
	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
}
|
||||
|
||||
// BenchmarkCPULoad summarises host CPU utilisation sampled while the GPU is in
// its steady-state phase. A high or unstable CPU load during a GPU benchmark
// may point at a competing workload or a CPU-bound driver bottleneck.
type BenchmarkCPULoad struct {
	AvgPct  float64 `json:"avg_pct"`
	MaxPct  float64 `json:"max_pct"`
	P95Pct  float64 `json:"p95_pct"`
	Samples int     `json:"samples"`
	// Status is "ok", "high", or "unstable".
	Status string `json:"status"`
	Note   string `json:"note,omitempty"`
}
|
||||
|
||||
// BenchmarkCoolingSummary holds fan telemetry averaged over the whole
// benchmark run. Only Available is always serialised; the remaining fields are
// dropped from JSON when they hold their zero value.
type BenchmarkCoolingSummary struct {
	Available             bool     `json:"available"`
	AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
	FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
	FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
	AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
	P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
	Notes                 []string `json:"notes,omitempty"`
}
|
||||
|
||||
// Identifiers of the NVIDIA benchmark profiles.
const (
	NvidiaBenchmarkProfileStandard  = "standard"
	NvidiaBenchmarkProfileStability = "stability"
	NvidiaBenchmarkProfileOvernight = "overnight"
)
|
||||
|
||||
// Identifiers of the load engines available to the power benchmark.
const (
	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
	BenchmarkPowerEngineTargetedPower  = "targeted_power"
)
|
||||
|
||||
// Estimated wall-clock durations, in seconds, for benchmark runs. The values
// are derived from real _v8 logs.
//
// Rule: whenever the profile phase durations in resolveBenchmarkProfile()
// change, re-measure from actual task logs and update these constants.
//
// Sources:
//   - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
//   - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
//   - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
//   - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
//   - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
const (
	// Performance Benchmark (bee-gpu-burn).
	// The duration covers one full ramp-up run (ramp 1→N) or one parallel run.
	// Sequential per-GPU mode scales approximately linearly.
	BenchmarkEstimatedPerfStandardSec  = 960  // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
	BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
	BenchmarkEstimatedPerfOvernightSec = 8 * 3600

	// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
	// The duration covers the full ramp-up run; individual steps vary with
	// convergence speed.
	BenchmarkEstimatedPowerStandardSec  = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
	BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
	BenchmarkEstimatedPowerOvernightSec = 3 * 3600
)
|
||||
|
||||
// NvidiaBenchmarkOptions carries the caller-supplied settings for an NVIDIA
// benchmark run.
// NOTE(review): Profile presumably takes one of the NvidiaBenchmarkProfile*
// values and ServerPowerSource one of the BenchmarkPowerSource* values —
// confirm against the callers.
type NvidiaBenchmarkOptions struct {
	Profile           string
	SizeMB            int
	GPUIndices        []int
	ExcludeGPUIndices []int
	RunNCCL           bool
	ServerPowerSource string
	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
	RampTotal         int    // total number of ramp-up steps in this run
	RampRunID         string // shared identifier across all steps of the same ramp-up run
}
|
||||
|
||||
// Identifiers of the server power measurement sources.
const (
	BenchmarkPowerSourceDCMI        = "dcmi"
	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
)
|
||||
|
||||
// BenchmarkPowerAutotuneConfig is the JSON form of a power-source autotune
// configuration. Version, UpdatedAt and SelectedSource are always serialised;
// the remaining fields are omitted when zero.
// NOTE(review): SelectedSource presumably holds one of the
// BenchmarkPowerSource* identifiers — confirm against the writer.
type BenchmarkPowerAutotuneConfig struct {
	Version           int       `json:"version"`
	UpdatedAt         time.Time `json:"updated_at"`
	SelectedSource    string    `json:"selected_source"`
	BenchmarkKind     string    `json:"benchmark_kind,omitempty"`
	Profile           string    `json:"profile,omitempty"`
	IdleDurationSec   int       `json:"idle_duration_sec,omitempty"`
	LoadDurationSec   int       `json:"load_duration_sec,omitempty"`
	SampleIntervalSec int       `json:"sample_interval_sec,omitempty"`
	Confidence        float64   `json:"confidence,omitempty"`
	Reason            string    `json:"reason,omitempty"`
}
|
||||
|
||||
// SystemPowerSourceDecision describes which server power source was chosen
// and why.
type SystemPowerSourceDecision struct {
	Configured      bool   `json:"configured"`
	SelectedSource  string `json:"selected_source,omitempty"`
	EffectiveSource string `json:"effective_source,omitempty"`
	Mode            string `json:"mode,omitempty"` // autotuned, fallback, degraded
	Reason          string `json:"reason,omitempty"`
	// NOTE(review): encoding/json's omitempty never omits a struct value, so
	// configured_at is always emitted, including for the zero time.Time. The
	// tag is left unchanged to keep the wire format stable.
	ConfiguredAt time.Time `json:"configured_at,omitempty"`
}
|
||||
|
||||
type BenchmarkPowerAutotuneResult struct {
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||
Profile string `json:"profile,omitempty"`
|
||||
Status string `json:"status"`
|
||||
IdleDurationSec int `json:"idle_duration_sec"`
|
||||
LoadDurationSec int `json:"load_duration_sec"`
|
||||
SampleIntervalSec int `json:"sample_interval_sec"`
|
||||
SelectedSource string `json:"selected_source,omitempty"`
|
||||
IdleValidationError string `json:"idle_validation_error,omitempty"`
|
||||
IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
|
||||
GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"`
|
||||
GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"`
|
||||
Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPowerAutotuneValidation reports whether a sampling window passed
// validation, together with the GPU and CPU usage statistics (average, p95,
// sample counts) it was judged on. Only Valid is always serialised.
type BenchmarkPowerAutotuneValidation struct {
	Valid          bool    `json:"valid"`
	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
	GPUSamples     int     `json:"gpu_samples,omitempty"`
	CPUSamples     int     `json:"cpu_samples,omitempty"`
	Reason         string  `json:"reason,omitempty"`
}
|
||||
|
||||
// BenchmarkPowerAutotuneCandidate holds the measurements and selection outcome
// for one candidate power source. Source and Available are always serialised;
// the other fields are omitted when zero.
type BenchmarkPowerAutotuneCandidate struct {
	Source         string  `json:"source"`
	IdleAvgW       float64 `json:"idle_avg_w,omitempty"`
	LoadAvgW       float64 `json:"load_avg_w,omitempty"`
	DeltaW         float64 `json:"delta_w,omitempty"`
	Samples        int     `json:"samples,omitempty"`
	RelativeError  float64 `json:"relative_error,omitempty"`
	Confidence     float64 `json:"confidence,omitempty"`
	Selected       bool    `json:"selected,omitempty"`
	Available      bool    `json:"available"`
	SelectionNotes string  `json:"selection_notes,omitempty"`
}
|
||||
|
||||
type NvidiaBenchmarkResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalization struct {
|
||||
@@ -52,30 +207,51 @@ type BenchmarkNormalizationGPU struct {
|
||||
}
|
||||
|
||||
type BenchmarkGPUResult struct {
|
||||
Index int `json:"index"`
|
||||
UUID string `json:"uuid,omitempty"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
VBIOS string `json:"vbios,omitempty"`
|
||||
ComputeCapability string `json:"compute_capability,omitempty"`
|
||||
Backend string `json:"backend,omitempty"`
|
||||
Status string `json:"status"`
|
||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
||||
Scores BenchmarkScorecard `json:"scores"`
|
||||
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
Index int `json:"index"`
|
||||
UUID string `json:"uuid,omitempty"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
VBIOS string `json:"vbios,omitempty"`
|
||||
ComputeCapability string `json:"compute_capability,omitempty"`
|
||||
Backend string `json:"backend,omitempty"`
|
||||
Status string `json:"status"`
|
||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||
// Fallback: 80°C.
|
||||
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||
// dcgmi targeted_power calibration run before the main benchmark.
|
||||
// Used as the reference denominator for PowerSustainScore instead of
|
||||
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
|
||||
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
|
||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
|
||||
PrecisionFailures []string `json:"precision_failures,omitempty"`
|
||||
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||
// ECC error delta accumulated over the full benchmark (all phases combined).
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
||||
Scores BenchmarkScorecard `json:"scores"`
|
||||
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning is non-empty when a thermal throttle event occurred with
|
||||
// a clock drop ≥20% while server fans were not at 100% duty cycle.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkTelemetrySummary struct {
|
||||
@@ -105,6 +281,18 @@ type BenchmarkThrottleCounters struct {
|
||||
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
|
||||
}
|
||||
|
||||
// BenchmarkECCCounters holds ECC error counts sampled at a point in time.
//   - Corrected: single-bit errors fixed by ECC (DRAM degradation).
//   - Uncorrected: double-bit errors that could not be corrected (serious fault).
//
// Both counters are volatile (since last driver reset), not persistent.
type BenchmarkECCCounters struct {
	Corrected   uint64 `json:"corrected"`
	Uncorrected uint64 `json:"uncorrected"`
}

// Total returns the sum of corrected and uncorrected error counts.
func (e BenchmarkECCCounters) Total() uint64 {
	return e.Corrected + e.Uncorrected
}

// IsZero reports whether both counters are zero.
func (e BenchmarkECCCounters) IsZero() bool {
	return e == BenchmarkECCCounters{}
}
|
||||
|
||||
type BenchmarkPrecisionResult struct {
|
||||
Name string `json:"name"`
|
||||
Category string `json:"category"`
|
||||
@@ -115,34 +303,124 @@ type BenchmarkPrecisionResult struct {
|
||||
K uint64 `json:"k,omitempty"`
|
||||
Iterations uint64 `json:"iterations,omitempty"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
// Weight is the fp32-equivalence factor for this precision category.
|
||||
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
|
||||
// WeightedTeraOpsPerSec = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||
Weight float64 `json:"weight,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkScorecard struct {
|
||||
ComputeScore float64 `json:"compute_score"`
|
||||
ComputeScore float64 `json:"compute_score"`
|
||||
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
|
||||
// steady phases (each precision ran alone, full GPU dedicated).
|
||||
SyntheticScore float64 `json:"synthetic_score,omitempty"`
|
||||
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
|
||||
// (all precisions competing simultaneously — closer to real workloads).
|
||||
MixedScore float64 `json:"mixed_score,omitempty"`
|
||||
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
|
||||
// sustains throughput under concurrent mixed-precision load.
|
||||
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// StabilityScore is derived from the fraction of steady-state time the GPU spent throttling
|
||||
// (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0.
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
|
||||
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||
|
||||
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||
// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
|
||||
// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
|
||||
TempHeadroomC float64 `json:"temp_headroom_c"`
|
||||
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
// ServerQualityScore (0–100) reflects server infrastructure quality independent
|
||||
// of GPU model. Combines throttle time, power variance, and temp variance.
|
||||
// Use this to compare servers with the same GPU, or to flag a bad server
|
||||
// that throttles an otherwise fast GPU.
|
||||
ServerQualityScore float64 `json:"server_quality_score"`
|
||||
// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
|
||||
// A throttling GPU will score lower here automatically — no quality multiplier.
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||
// Comparable across throttle levels and GPU generations. Low value at normal
|
||||
// clocks indicates silicon degradation.
|
||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
|
||||
// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
|
||||
// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
|
||||
// over-reporting its power consumption.
|
||||
// BenchmarkPSUSlotPower holds the SDR power readings for a single PSU slot
// sampled during the benchmark. Slot keys match audit
// HardwarePowerSupply.Slot (0-based), so benchmark and audit data can be
// correlated by slot. The wattage fields are pointers and are left out of the
// JSON when nil.
type BenchmarkPSUSlotPower struct {
	InputW  *float64 `json:"input_w,omitempty"`  // AC wall input (PSUx_POWER_IN)
	OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
	Status  string   `json:"status,omitempty"`
}
|
||||
|
||||
// BenchmarkServerPower captures server-side power from multiple independent
|
||||
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
|
||||
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
|
||||
// covers only a subset of installed PSUs (partial coverage).
|
||||
//
|
||||
// Source legend:
|
||||
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
|
||||
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||
type BenchmarkServerPower struct {
|
||||
Available bool `json:"available"`
|
||||
IdleW float64 `json:"idle_w,omitempty"`
|
||||
LoadedW float64 `json:"loaded_w,omitempty"`
|
||||
DeltaW float64 `json:"delta_w,omitempty"`
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
Available bool `json:"available"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Mode string `json:"mode,omitempty"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
|
||||
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
|
||||
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
|
||||
|
||||
// PSU DC output sum — power delivered to server internals after conversion.
|
||||
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
|
||||
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
|
||||
|
||||
// Per-slot PSU readings at idle and at peak load.
|
||||
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
|
||||
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
|
||||
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
|
||||
|
||||
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
|
||||
// PCIe slot delivery only (excludes 16-pin connector power).
|
||||
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
|
||||
|
||||
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
|
||||
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
|
||||
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
|
||||
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||
// during a dedicated single-precision steady window. Because only one kernel
|
||||
// type runs at a time the PowerCVPct here is a genuine stability signal.
|
||||
type BenchmarkPrecisionSteadyPhase struct {
|
||||
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
|
||||
Status string `json:"status,omitempty"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
// ECC errors accumulated during this precision phase only.
|
||||
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
|
||||
// Any uncorrected = serious fault triggered by this precision workload.
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkInterconnectResult struct {
|
||||
@@ -156,3 +434,103 @@ type BenchmarkInterconnectResult struct {
|
||||
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||
// this server under full GPU load. Use for rack power planning.
|
||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||
// actual wall-power draw as seen by the server's power supply.
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchGPU struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||
// stably with all other GPUs running simultaneously at their own limits.
|
||||
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||
// additional derating.
|
||||
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// Telemetry holds the aggregated stats from the final converged calibration
|
||||
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||
// Fan state sampled at the end of single-card calibration.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||
NewGPUIndex int `json:"new_gpu_index"`
|
||||
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// PSU slot readings sampled at end of this ramp step.
|
||||
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||
// Fan state at end of this ramp step.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPerformanceRampStep is one step of the scalability ramp-up phase of
// the performance benchmark and carries that step's performance figures.
type NvidiaPerformanceRampStep struct {
	StepIndex  int   `json:"step_index"`
	GPUIndices []int `json:"gpu_indices"`

	// TotalSyntheticTOPS adds up the SyntheticScore of every GPU taking part
	// in this step (fp32-equivalent TOPS from the dedicated single-precision
	// phases).
	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
	TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`

	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
	// A value of 100% means perfect linear scaling; anything below points to
	// thermal, power or interconnect loss.
	ScalabilityPct float64 `json:"scalability_pct"`

	Status string   `json:"status"`
	Notes  []string `json:"notes,omitempty"`
}
|
||||
|
||||
@@ -13,14 +13,21 @@ import (
|
||||
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
//
// Each field is declared exactly once. The superseded stage-less field list
// that duplicated ElapsedSec … MemClockMHz has been dropped, because duplicate
// field names do not compile.
type GPUMetricRow struct {
	// Stage names the benchmark stage the sample belongs to. StageStartSec
	// and StageEndSec bound that stage on the same time axis as ElapsedSec.
	Stage         string  `json:"stage,omitempty"`
	StageStartSec float64 `json:"stage_start_sec,omitempty"`
	StageEndSec   float64 `json:"stage_end_sec,omitempty"`

	// Core per-GPU readings.
	ElapsedSec  float64 `json:"elapsed_sec"`
	GPUIndex    int     `json:"index"`
	TempC       float64 `json:"temp_c"`
	UsagePct    float64 `json:"usage_pct"`
	MemUsagePct float64 `json:"mem_usage_pct"`
	PowerW      float64 `json:"power_w"`
	ClockMHz    float64 `json:"clock_mhz"`
	MemClockMHz float64 `json:"mem_clock_mhz"`

	// Fan readings. The two flags are written to the CSV as 0/1 columns.
	FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
	FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
	FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
	FanDutyCycleEstimated bool    `json:"fan_duty_cycle_estimated,omitempty"`
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
@@ -141,14 +148,28 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||
for _, r := range rows {
|
||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
|
||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
|
||||
dutyAvail := 0
|
||||
if r.FanDutyCycleAvailable {
|
||||
dutyAvail = 1
|
||||
}
|
||||
dutyEstimated := 0
|
||||
if r.FanDutyCycleEstimated {
|
||||
dutyEstimated = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||
}
|
||||
return os.WriteFile(path, b.Bytes(), 0644)
|
||||
}
|
||||
|
||||
// gpuMetricStageSpan is a named interval [Start, End] on the elapsed-seconds
// axis of the GPU charts. Spans are derived from the Stage* fields of the
// metric rows and drawn as shaded chart backgrounds.
type gpuMetricStageSpan struct {
	Name       string
	Start, End float64
}
|
||||
|
||||
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
// Group by GPU index preserving order.
|
||||
@@ -163,9 +184,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
stageSpans := buildGPUMetricStageSpans(rows)
|
||||
stageColorByName := make(map[string]string, len(stageSpans))
|
||||
for i, span := range stageSpans {
|
||||
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
|
||||
}
|
||||
|
||||
var legend strings.Builder
|
||||
if len(stageSpans) > 0 {
|
||||
legend.WriteString(`<div class="stage-legend">`)
|
||||
for _, span := range stageSpans {
|
||||
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
|
||||
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
|
||||
}
|
||||
legend.WriteString(`</div>`)
|
||||
}
|
||||
|
||||
var svgs strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
|
||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
|
||||
svgs.WriteString("\n")
|
||||
}
|
||||
|
||||
@@ -175,21 +212,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
<meta charset="utf-8">
|
||||
<title>GPU Stress Test Metrics</title>
|
||||
<style>
|
||||
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
|
||||
h1 { text-align: center; color: #333; margin: 0 0 8px; }
|
||||
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
|
||||
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
|
||||
*{box-sizing:border-box}
|
||||
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
|
||||
.page{padding:24px}
|
||||
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
|
||||
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
|
||||
.card-body{padding:16px}
|
||||
h1{font-size:22px;margin:0 0 6px}
|
||||
p{color:var(--muted);font-size:13px;margin:0 0 16px}
|
||||
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
|
||||
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
|
||||
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
|
||||
.chart-block{margin-top:16px}
|
||||
</style>
|
||||
</head><body>
|
||||
<div class="page">
|
||||
<div class="card">
|
||||
<div class="card-head">GPU Stress Test Metrics</div>
|
||||
<div class="card-body">
|
||||
<h1>GPU Stress Test Metrics</h1>
|
||||
<p>Generated %s</p>
|
||||
%s
|
||||
</body></html>`, ts, svgs.String())
|
||||
<div class="chart-block">%s</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>`, ts, legend.String(), svgs.String())
|
||||
|
||||
return os.WriteFile(path, []byte(html), 0644)
|
||||
}
|
||||
|
||||
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
|
||||
// Layout
|
||||
const W, H = 960, 520
|
||||
const plotX1 = 120 // usage axis / chart left border
|
||||
@@ -284,6 +339,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
|
||||
// Stage backgrounds
|
||||
for _, span := range stageSpans {
|
||||
x1 := xv(span.Start)
|
||||
x2 := xv(span.End)
|
||||
if x2 < x1 {
|
||||
x1, x2 = x2, x1
|
||||
}
|
||||
if x2-x1 < 1 {
|
||||
x2 = x1 + 1
|
||||
}
|
||||
color := stageColorByName[span.Name]
|
||||
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
|
||||
x1, plotY1, x2-x1, PH, color)
|
||||
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
|
||||
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
|
||||
}
|
||||
|
||||
// Chart border
|
||||
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
||||
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
||||
@@ -382,221 +454,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// Terminal chart settings: the ANSI escape sequences that colour the plot and
// the plot-area size handed to renderLineChart.
const (
	ansiAmber = "\033[38;5;214m"
	ansiReset = "\033[0m"

	termChartWidth  = 70
	termChartHeight = 12
)
|
||||
|
||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
||||
// Used in SAT stress-test logs.
|
||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
gpuMap := make(map[int][]GPUMetricRow)
|
||||
for _, r := range rows {
|
||||
if !seen[r.GPUIndex] {
|
||||
seen[r.GPUIndex] = true
|
||||
order = append(order, r.GPUIndex)
|
||||
}
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
type seriesDef struct {
|
||||
caption string
|
||||
color string
|
||||
fn func(GPUMetricRow) float64
|
||||
}
|
||||
defs := []seriesDef{
|
||||
{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
gr := gpuMap[gpuIdx]
|
||||
if len(gr) == 0 {
|
||||
continue
|
||||
}
|
||||
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
|
||||
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
|
||||
for _, d := range defs {
|
||||
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
|
||||
termChartHeight, termChartWidth))
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
}
|
||||
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
|
||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
||||
if len(vals) == 0 {
|
||||
return caption + "\n"
|
||||
}
|
||||
|
||||
mn, mx := gpuMinMax(vals)
|
||||
if mn == mx {
|
||||
mx = mn + 1
|
||||
}
|
||||
|
||||
// Use the smaller of width or len(vals) to avoid stretching sparse data.
|
||||
w := width
|
||||
if len(vals) < w {
|
||||
w = len(vals)
|
||||
}
|
||||
data := gpuDownsample(vals, w)
|
||||
|
||||
// row[i] = display row index: 0 = top = max value, height = bottom = min value.
|
||||
row := make([]int, w)
|
||||
for i, v := range data {
|
||||
r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
|
||||
if r < 0 {
|
||||
r = 0
|
||||
}
|
||||
if r > height {
|
||||
r = height
|
||||
}
|
||||
row[i] = r
|
||||
}
|
||||
|
||||
// Fill the character grid.
|
||||
grid := make([][]rune, height+1)
|
||||
for i := range grid {
|
||||
grid[i] = make([]rune, w)
|
||||
for j := range grid[i] {
|
||||
grid[i][j] = ' '
|
||||
}
|
||||
}
|
||||
for x := 0; x < w; x++ {
|
||||
r := row[x]
|
||||
if x == 0 {
|
||||
grid[r][0] = '─'
|
||||
continue
|
||||
}
|
||||
p := row[x-1]
|
||||
switch {
|
||||
case r == p:
|
||||
grid[r][x] = '─'
|
||||
case r < p: // value went up (row index decreased toward top)
|
||||
grid[r][x] = '╭'
|
||||
grid[p][x] = '╯'
|
||||
for y := r + 1; y < p; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
default: // r > p, value went down
|
||||
grid[p][x] = '╮'
|
||||
grid[r][x] = '╰'
|
||||
for y := p + 1; y < r; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Y axis tick labels.
|
||||
ticks := gpuNiceTicks(mn, mx, height/2)
|
||||
tickAtRow := make(map[int]string)
|
||||
labelWidth := 4
|
||||
for _, t := range ticks {
|
||||
r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
|
||||
if r < 0 || r > height {
|
||||
continue
|
||||
}
|
||||
s := gpuFormatTick(t)
|
||||
tickAtRow[r] = s
|
||||
if len(s) > labelWidth {
|
||||
labelWidth = len(s)
|
||||
}
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for r := 0; r <= height; r++ {
|
||||
label := tickAtRow[r]
|
||||
fmt.Fprintf(&b, "%*s", labelWidth, label)
|
||||
switch {
|
||||
case label != "":
|
||||
b.WriteRune('┤')
|
||||
case r == height:
|
||||
b.WriteRune('┼')
|
||||
default:
|
||||
b.WriteRune('│')
|
||||
}
|
||||
b.WriteString(color)
|
||||
b.WriteString(string(grid[r]))
|
||||
b.WriteString(ansiReset)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
// Bottom axis.
|
||||
b.WriteString(strings.Repeat(" ", labelWidth))
|
||||
b.WriteRune('└')
|
||||
b.WriteString(strings.Repeat("─", w))
|
||||
b.WriteRune('\n')
|
||||
|
||||
// Caption centered under the chart.
|
||||
if caption != "" {
|
||||
total := labelWidth + 1 + w
|
||||
if pad := (total - len(caption)) / 2; pad > 0 {
|
||||
b.WriteString(strings.Repeat(" ", pad))
|
||||
}
|
||||
b.WriteString(caption)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
|
||||
v := make([]float64, len(rows))
|
||||
for i, r := range rows {
|
||||
v[i] = fn(r)
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// gpuDownsample resamples vals to exactly w values.
//
// With len(vals) >= w the input is split into w consecutive buckets and each
// output value is the mean of its bucket. With len(vals) < w the input is
// stretched by nearest-neighbour lookup. An empty input yields w zeros.
//
// A non-positive w yields an empty slice. Previously that case panicked with
// an out-of-range index (or inside make for a negative w).
func gpuDownsample(vals []float64, w int) []float64 {
	if w <= 0 {
		return []float64{}
	}
	n := len(vals)
	result := make([]float64, w)
	if n == 0 {
		return result
	}

	if n < w {
		// Nearest-neighbour upsample. n >= 1 and w > n imply w >= 2, so the
		// divisor w-1 is never zero.
		for i := range result {
			src := i * (n - 1) / (w - 1)
			if src >= n {
				src = n - 1
			}
			result[i] = vals[src]
		}
		return result
	}

	// Bucket average: sum each bucket, then divide by its population.
	counts := make([]int, w)
	for i, v := range vals {
		bucket := i * w / n
		if bucket >= w {
			bucket = w - 1
		}
		result[bucket] += v
		counts[bucket]++
	}
	for i := range result {
		if counts[i] > 0 {
			result[i] /= float64(counts[i])
		}
	}
	return result
}
|
||||
|
||||
func gpuMinMax(vals []float64) (float64, float64) {
|
||||
if len(vals) == 0 {
|
||||
return 0, 1
|
||||
@@ -641,3 +498,57 @@ func gpuFormatTick(v float64) string {
|
||||
}
|
||||
return strconv.FormatFloat(v, 'f', 1, 64)
|
||||
}
|
||||
|
||||
// gpuMetricStagePalette holds the colours assigned to benchmark stages in the
// HTML report. Callers index it modulo its length, so colours repeat once
// there are more stages than entries.
var gpuMetricStagePalette = []string{
	"#d95c5c", "#2185d0", "#21ba45", "#f2c037",
	"#6435c9", "#00b5ad", "#a5673f",
}
|
||||
|
||||
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
|
||||
var spans []gpuMetricStageSpan
|
||||
for _, row := range rows {
|
||||
name := strings.TrimSpace(row.Stage)
|
||||
if name == "" {
|
||||
name = "run"
|
||||
}
|
||||
start := row.StageStartSec
|
||||
end := row.StageEndSec
|
||||
if end <= start {
|
||||
start = row.ElapsedSec
|
||||
end = row.ElapsedSec
|
||||
}
|
||||
if len(spans) == 0 || spans[len(spans)-1].Name != name {
|
||||
spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
|
||||
continue
|
||||
}
|
||||
if start < spans[len(spans)-1].Start {
|
||||
spans[len(spans)-1].Start = start
|
||||
}
|
||||
if end > spans[len(spans)-1].End {
|
||||
spans[len(spans)-1].End = end
|
||||
}
|
||||
}
|
||||
for i := range spans {
|
||||
if spans[i].End <= spans[i].Start {
|
||||
spans[i].End = spans[i].Start + 1
|
||||
}
|
||||
}
|
||||
return spans
|
||||
}
|
||||
|
||||
// gpuHTMLReplacer maps the five characters that are significant in HTML/SVG
// text and attribute values to their entity form. strings.Replacer never
// rescans its own output, so the "&" of an emitted entity is not escaped a
// second time.
var gpuHTMLReplacer = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	`"`, "&quot;",
	"'", "&#39;",
)

// gpuHTMLEscape returns s with &, <, >, " and ' replaced by HTML entities so
// that stage names can be embedded safely in the generated HTML and SVG.
func gpuHTMLEscape(s string) string {
	return gpuHTMLReplacer.Replace(s)
}
|
||||
|
||||
65
audit/internal/platform/gpu_metrics_test.go
Normal file
65
audit/internal/platform/gpu_metrics_test.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "gpu-metrics.csv")
|
||||
rows := []GPUMetricRow{
|
||||
{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
|
||||
}
|
||||
if err := WriteGPUMetricsCSV(path, rows); err != nil {
|
||||
t.Fatalf("WriteGPUMetricsCSV: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
for _, needle := range []string{
|
||||
"stage,elapsed_sec,gpu_index",
|
||||
`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("csv missing %q\n%s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "gpu-metrics.html")
|
||||
rows := []GPUMetricRow{
|
||||
{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
|
||||
{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
|
||||
{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
|
||||
{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
|
||||
}
|
||||
if err := WriteGPUMetricsHTML(path, rows); err != nil {
|
||||
t.Fatalf("WriteGPUMetricsHTML: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
for _, needle := range []string{
|
||||
"stage-legend",
|
||||
"baseline",
|
||||
"steady-fp16",
|
||||
"GPU Stress Test Metrics",
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("html missing %q\n%s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11,12 +11,11 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
	// installToRAMDir is the directory that receives the RAM copy of the
	// live medium; LiveMediaRAMState looks for *.squashfs files in it.
	installToRAMDir = "/dev/shm/bee-live"

	// copyProgressLogStep is the number of bytes (100 MiB) that must be
	// copied between two consecutive progress log lines.
	copyProgressLogStep int64 = 100 * 1024 * 1024
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
if fsType == "" {
|
||||
return toramActive()
|
||||
}
|
||||
return strings.EqualFold(fsType, "tmpfs")
|
||||
return s.LiveMediaRAMState().InRAM
|
||||
}
|
||||
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
@@ -48,42 +47,164 @@ func (s *System) LiveBootSource() LiveBootSource {
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
|
||||
return evaluateLiveMediaRAMState(
|
||||
s.LiveBootSource(),
|
||||
toramActive(),
|
||||
globPaths("/run/live/medium/live/*.squashfs"),
|
||||
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
|
||||
)
|
||||
}
|
||||
|
||||
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
|
||||
state := LiveMediaRAMState{
|
||||
LiveBootSource: status,
|
||||
ToramActive: toram,
|
||||
CopyPresent: len(copiedSquashfs) > 0,
|
||||
}
|
||||
if status.InRAM {
|
||||
state.State = "in_ram"
|
||||
state.Status = "ok"
|
||||
state.CopyComplete = true
|
||||
state.Message = "Running from RAM — installation media can be safely disconnected."
|
||||
return state
|
||||
}
|
||||
|
||||
expected := pathBaseSet(sourceSquashfs)
|
||||
copied := pathBaseSet(copiedSquashfs)
|
||||
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
|
||||
|
||||
switch {
|
||||
case state.CopyComplete:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
|
||||
case state.CopyPresent:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
|
||||
case toram:
|
||||
state.State = "toram_failed"
|
||||
state.Status = "failed"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
|
||||
default:
|
||||
state.State = "not_in_ram"
|
||||
state.Status = "warning"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
|
||||
}
|
||||
return state
|
||||
}
|
||||
|
||||
// globPaths returns the paths matching pattern. The error from filepath.Glob
// is deliberately discarded and whatever match list it returned is passed
// through unchanged.
func globPaths(pattern string) []string {
	found, _ := filepath.Glob(pattern) // best effort: the glob error is ignored
	return found
}
|
||||
|
||||
// pathBaseSet returns the set of whitespace-trimmed base names of paths. Base
// names that are empty after trimming are left out. The result is never nil.
func pathBaseSet(paths []string) map[string]struct{} {
	names := make(map[string]struct{}, len(paths))
	for i := range paths {
		name := strings.TrimSpace(filepath.Base(paths[i]))
		if name == "" {
			continue
		}
		names[name] = struct{}{}
	}
	return names
}
|
||||
|
||||
// setContainsAll reports whether every key of want is also a key of have. An
// empty want never counts as contained, so the result is false in that case.
func setContainsAll(have, want map[string]struct{}) bool {
	for key := range want {
		_, present := have[key]
		if !present {
			return false
		}
	}
	// Reached with nothing missing; only a non-empty want is a real match.
	return len(want) > 0
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
|
||||
log := func(msg string) {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
}
|
||||
|
||||
if s.IsLiveMediaInRAM() {
|
||||
state := s.LiveMediaRAMState()
|
||||
if state.InRAM {
|
||||
log("Already running from RAM — installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
if err != nil || len(squashfsFiles) == 0 {
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||
}
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
dstDir := installToRAMDir
|
||||
|
||||
// If the source medium is unavailable, check whether a previous run already
|
||||
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||
// directly to the loop-rebind / bind-mount steps.
|
||||
if !sourceAvailable {
|
||||
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(copiedFiles) > 0 {
|
||||
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||
// Proceed to rebind with the already-copied files.
|
||||
for _, dst := range copiedFiles {
|
||||
base := filepath.Base(dst)
|
||||
// Re-associate the loop device that was originally backed by the
|
||||
// source file (now gone); find it by the old source path pattern.
|
||||
srcGuess := "/run/live/medium/live/" + base
|
||||
loopDev, lerr := findLoopForFile(srcGuess)
|
||||
if lerr != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||
continue
|
||||
}
|
||||
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
goto bindMedium
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||
}
|
||||
|
||||
dstDir := "/dev/shm/bee-live"
|
||||
{
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
}
|
||||
|
||||
if state.CopyPresent {
|
||||
log("Removing stale partial RAM copy before retry...")
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr == nil {
|
||||
return
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
log("Removed incomplete RAM copy.")
|
||||
}()
|
||||
|
||||
for _, sf := range squashfsFiles {
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -109,6 +230,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
||||
}
|
||||
}
|
||||
|
||||
bindMedium:
|
||||
log("Copying remaining medium files...")
|
||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||
@@ -198,6 +320,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
defer out.Close()
|
||||
total := fi.Size()
|
||||
var copied int64
|
||||
var lastLogged int64
|
||||
buf := make([]byte, 4*1024*1024)
|
||||
for {
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -209,7 +332,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
return werr
|
||||
}
|
||||
copied += int64(n)
|
||||
if logFunc != nil && total > 0 {
|
||||
if shouldLogCopyProgress(copied, total, lastLogged) {
|
||||
lastLogged = copied
|
||||
pct := int(float64(copied) / float64(total) * 100)
|
||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||
}
|
||||
@@ -224,6 +348,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
return out.Sync()
|
||||
}
|
||||
|
||||
func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
|
||||
if total <= 0 || copied <= 0 {
|
||||
return false
|
||||
}
|
||||
if copied >= total {
|
||||
return copied > lastLogged
|
||||
}
|
||||
if copied < copyProgressLogStep {
|
||||
return false
|
||||
}
|
||||
return copied-lastLogged >= copyProgressLogStep
|
||||
}
|
||||
|
||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||
if ctx.Err() != nil {
|
||||
|
||||
@@ -58,3 +58,69 @@ func TestDescribeLiveBootSource(t *testing.T) {
|
||||
t.Fatalf("got %q want /run/live/medium", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("in_ram", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
|
||||
false,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("partial_copy_after_cancel", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
false,
|
||||
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
|
||||
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
|
||||
)
|
||||
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
if state.CopyComplete {
|
||||
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("toram_failed", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
true,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestShouldLogCopyProgress(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
total := int64(250 * 1024 * 1024)
|
||||
step := int64(100 * 1024 * 1024)
|
||||
|
||||
if shouldLogCopyProgress(step-1, total, 0) {
|
||||
t.Fatal("progress logged too early")
|
||||
}
|
||||
if !shouldLogCopyProgress(step, total, 0) {
|
||||
t.Fatal("expected log at first 100MB boundary")
|
||||
}
|
||||
if shouldLogCopyProgress(step+16*1024*1024, total, step) {
|
||||
t.Fatal("progress logged again before next 100MB")
|
||||
}
|
||||
if !shouldLogCopyProgress(2*step, total, step) {
|
||||
t.Fatal("expected log at second 100MB boundary")
|
||||
}
|
||||
if !shouldLogCopyProgress(total, total, 2*step) {
|
||||
t.Fatal("expected final completion log")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||
@@ -30,7 +33,12 @@ type KilledProcess struct {
|
||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||
// SIGKILL to each one found. It returns a list of killed processes.
|
||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||
// The scan runs under a 5-second deadline to avoid blocking if the process
|
||||
// table is very large (e.g. after a stress test with thousands of children).
|
||||
func KillTestWorkers() []KilledProcess {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
entries, err := os.ReadDir("/proc")
|
||||
if err != nil {
|
||||
return nil
|
||||
@@ -38,6 +46,13 @@ func KillTestWorkers() []KilledProcess {
|
||||
|
||||
var killed []KilledProcess
|
||||
for _, e := range entries {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
|
||||
return killed
|
||||
default:
|
||||
}
|
||||
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bee/audit/internal/collector"
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sort"
|
||||
@@ -14,13 +16,24 @@ import (
|
||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||
// collected for the web UI metrics page.
|
||||
type LiveMetricSample struct {
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
PowerSource string `json:"power_source,omitempty"`
|
||||
PowerMode string `json:"power_mode,omitempty"`
|
||||
PowerReason string `json:"power_reason,omitempty"`
|
||||
PSUs []PSUReading `json:"psus,omitempty"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
}
|
||||
|
||||
// PSUReading is a per-slot power supply power reading.
type PSUReading struct {
	// Slot is the slot number reported for the power supply.
	Slot int `json:"slot"`
	// Name is the display label for the power supply.
	Name string `json:"name"`
	// PowerW is the measured power in watts.
	PowerW float64 `json:"power_w"`
}
|
||||
|
||||
// TempReading is a named temperature sensor value.
|
||||
@@ -54,8 +67,17 @@ func SampleLiveMetrics() LiveMetricSample {
|
||||
}
|
||||
}
|
||||
|
||||
// System power — returns 0 if unavailable
|
||||
s.PowerW = sampleSystemPower()
|
||||
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||
s.PSUs = samplePSUPower()
|
||||
|
||||
// System power: use the global autotune-selected source when configured,
|
||||
// otherwise fall back to the historical heuristic and mark the mode.
|
||||
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
|
||||
s.PowerW = powerW
|
||||
s.PowerSource = decision.EffectiveSource
|
||||
s.PowerMode = decision.Mode
|
||||
s.PowerReason = decision.Reason
|
||||
}
|
||||
|
||||
// CPU load — from /proc/stat
|
||||
s.CPULoadPct = sampleCPULoadPct()
|
||||
@@ -326,3 +348,46 @@ func compactAmbientTempName(chip, name string) string {
|
||||
}
|
||||
return chip + " / " + name
|
||||
}
|
||||
|
||||
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
|
||||
// vendors where PSU sensors may not carry entity ID "10.N".
|
||||
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||
func samplePSUPower() []PSUReading {
|
||||
out, err := exec.Command("ipmitool", "sdr").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
slots := collector.PSUSlotsFromSDR(string(out))
|
||||
if len(slots) == 0 {
|
||||
return nil
|
||||
}
|
||||
// Collect slot keys and sort for stable output.
|
||||
keys := make([]int, 0, len(slots))
|
||||
for k := range slots {
|
||||
n, err := strconv.Atoi(k)
|
||||
if err == nil {
|
||||
keys = append(keys, n)
|
||||
}
|
||||
}
|
||||
sort.Ints(keys)
|
||||
psus := make([]PSUReading, 0, len(keys))
|
||||
for _, k := range keys {
|
||||
entry := slots[strconv.Itoa(k)]
|
||||
// Prefer AC input power; fall back to DC output power.
|
||||
var w float64
|
||||
if entry.InputW != nil && *entry.InputW > 0 {
|
||||
w = *entry.InputW
|
||||
} else if entry.OutputW != nil && *entry.OutputW > 0 {
|
||||
w = *entry.OutputW
|
||||
}
|
||||
if w <= 0 {
|
||||
continue
|
||||
}
|
||||
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
|
||||
}
|
||||
if len(psus) == 0 {
|
||||
return nil
|
||||
}
|
||||
return psus
|
||||
}
|
||||
|
||||
30
audit/internal/platform/nvidia_recover.go
Normal file
30
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"time"
|
||||
)
|
||||
|
||||
// nvidiaRecoverHelper is the absolute path of the privileged recovery helper.
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"

// runNvidiaRecover runs the recovery helper through sudo with the given
// arguments and returns its combined stdout/stderr together with the run error.
//
// When systemd-run is found on PATH, the helper is launched as a transient
// oneshot unit with a unique, timestamp-based name; otherwise it is executed
// directly under sudo.
func runNvidiaRecover(args ...string) (string, error) {
	// 8 systemd-run arguments + helper path + caller arguments.
	sudoArgs := make([]string, 0, len(args)+9)

	if _, lookErr := exec.LookPath("systemd-run"); lookErr == nil {
		unitName := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
		sudoArgs = append(sudoArgs,
			"systemd-run",
			"--quiet",
			"--pipe",
			"--wait",
			"--collect",
			"--service-type=oneshot",
			"--unit", unitName,
		)
	}
	sudoArgs = append(sudoArgs, nvidiaRecoverHelper)
	sudoArgs = append(sudoArgs, args...)

	output, err := exec.Command("sudo", sudoArgs...).CombinedOutput()
	return string(output), err
}
|
||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
|
||||
@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
|
||||
}
|
||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||
|
||||
// Pack tar.gz
|
||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||
return "", fmt.Errorf("pack archive: %w", err)
|
||||
}
|
||||
_ = os.RemoveAll(runDir)
|
||||
return archivePath, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
// collectPhase samples live metrics every second until ctx is done.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
@@ -27,6 +28,8 @@ var runtimeTrackedServices = []string{
|
||||
"bee-audit",
|
||||
"bee-web",
|
||||
"bee-sshsetup",
|
||||
"nvidia-dcgm",
|
||||
"nvidia-fabricmanager",
|
||||
}
|
||||
|
||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||
@@ -114,6 +117,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
}
|
||||
|
||||
s.collectGPURuntimeHealth(vendor, &health)
|
||||
s.collectToRAMHealth(&health)
|
||||
s.collectUSBExportHealth(&health)
|
||||
|
||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||
health.Status = "PARTIAL"
|
||||
@@ -168,6 +173,99 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||
return ToolStatus{Name: display}
|
||||
}
|
||||
|
||||
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
|
||||
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
|
||||
// incomplete RAM copy exists but runtime still depends on the boot medium,
|
||||
// "failed" = toram was requested but medium is not in RAM.
|
||||
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||
state := s.LiveMediaRAMState()
|
||||
health.ToRAMStatus = state.Status
|
||||
switch state.Status {
|
||||
case "ok":
|
||||
return
|
||||
case "failed":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_failed",
|
||||
Severity: "warning",
|
||||
Description: state.Message,
|
||||
})
|
||||
case "partial":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_partial",
|
||||
Severity: "warning",
|
||||
Description: state.Message,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
|
||||
// suitable for log export. Sets USBExportPath to the first match found.
|
||||
func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
|
||||
health.USBExportPath = findUSBExportMount()
|
||||
}
|
||||
|
||||
// findUSBExportMount returns the mount point of the first writable USB filesystem
|
||||
// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs) whose backing block device
|
||||
// has USB transport. Returns "" if none found.
|
||||
func findUSBExportMount() string {
|
||||
f, err := os.Open("/proc/mounts")
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// fs types that are expected on USB export drives
|
||||
exportFSTypes := map[string]bool{
|
||||
"vfat": true,
|
||||
"exfat": true,
|
||||
"ext2": true,
|
||||
"ext3": true,
|
||||
"ext4": true,
|
||||
"ntfs": true,
|
||||
"ntfs3": true,
|
||||
"fuseblk": true,
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
// fields: device mountpoint fstype options dump pass
|
||||
fields := strings.Fields(scanner.Text())
|
||||
if len(fields) < 4 {
|
||||
continue
|
||||
}
|
||||
device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||
if !exportFSTypes[strings.ToLower(fsType)] {
|
||||
continue
|
||||
}
|
||||
// Skip read-only mounts
|
||||
opts := strings.Split(options, ",")
|
||||
readOnly := false
|
||||
for _, o := range opts {
|
||||
if strings.TrimSpace(o) == "ro" {
|
||||
readOnly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if readOnly {
|
||||
continue
|
||||
}
|
||||
// Check USB transport via lsblk on the device (or its parent disk for partitions).
|
||||
if !strings.HasPrefix(device, "/dev/") {
|
||||
continue
|
||||
}
|
||||
checkDev := device
|
||||
// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
|
||||
// Strip trailing partition digits to get the parent disk name.
|
||||
if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
|
||||
checkDev = trimmed
|
||||
}
|
||||
if blockDeviceTransport(checkDev) == "usb" {
|
||||
return mountPoint
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||
lsmodText := commandText("lsmod")
|
||||
|
||||
|
||||
@@ -20,6 +20,54 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// Estimated wall-clock durations for each SAT/validate test, derived from real
// production logs in _benchmark/_v8/.
//
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
// the corresponding Run*Pack function change, re-measure the wall-clock duration
// from actual task logs and update the matching constant here.
//
// Sources:
//   - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
//   - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
//   - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
//   - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
//   - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
//   - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
//   - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4926 s / 8 GPU (all simultaneous)
//   - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
//   - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2664–2688 s / 8 GPU (all simultaneous)

// CPU and memory estimates, in seconds.
const (
	// CPU validate: stress-ng 60 s + lscpu/sensors overhead.
	SATEstimatedCPUValidateSec = 65
	// CPU stress: stress-ng 1800 s (stress mode default).
	SATEstimatedCPUStressSec = 1800

	// RAM validate: memtester 256 MB / 1 pass.
	SATEstimatedMemoryValidateSec = 70
	// RAM stress: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
	SATEstimatedMemoryStressSec = 140
)

// NVIDIA GPU estimates, in seconds.
const (
	// dcgmi diag Level 2 (medium), all GPUs simultaneously.
	SATEstimatedNvidiaGPUValidateSec = 85
	// dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
	SATEstimatedNvidiaGPUStressSec = 450

	// dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
	SATEstimatedNvidiaTargetedStressSec = 350
	// dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
	SATEstimatedNvidiaTargetedPowerSec = 350

	// dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
	SATEstimatedNvidiaPulseTestSec = 5000

	// NCCL all_reduce_perf, all GPUs simultaneously.
	SATEstimatedNvidiaInterconnectSec = 300
	// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
	// without a user-configurable time limit; duration is determined by nvbandwidth itself.
	SATEstimatedNvidiaBandwidthSec = 2700
)
|
||||
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
@@ -108,15 +156,15 @@ type nvidiaGPUHealth struct {
|
||||
}
|
||||
|
||||
// nvidiaGPUStatusFile holds the per-GPU status written out after an NVIDIA
// acceptance run. Each field is declared exactly once.
type nvidiaGPUStatusFile struct {
	// Index is the GPU index (written as gpu_index).
	Index int
	// Name is the GPU name; writeNvidiaGPUStatusFiles substitutes a placeholder
	// when it is empty.
	Name string
	// RunStatus and Reason describe the outcome of the run for this GPU.
	RunStatus string
	Reason    string
	// Health is the health verdict; writeNvidiaGPUStatusFiles assigns
	// "UNKNOWN" as a fallback. HealthRaw presumably keeps the unparsed health
	// text — confirm.
	Health    string
	HealthRaw string
	// Observed and Selected presumably mark whether the GPU was seen by the
	// tooling and whether it was part of the requested set — confirm.
	Observed bool
	Selected bool
	// FailingJob presumably names the job that failed for this GPU, if any.
	FailingJob string
}
|
||||
|
||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||
@@ -359,19 +407,21 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
||||
if len(raw) == 0 && err == nil {
|
||||
raw = []byte("GPU reset completed.\n")
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return string(raw), err
|
||||
return out, err
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
gpuCount := len(selected)
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
@@ -380,18 +430,40 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
var (
|
||||
profCmd []string
|
||||
profEnv []string
|
||||
)
|
||||
if len(selected) > 1 {
|
||||
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||
// of CUDA_VISIBLE_DEVICES.
|
||||
stagger := staggerSec
|
||||
if stagger < 0 {
|
||||
stagger = 0
|
||||
}
|
||||
profCmd = []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||
"--stagger-seconds", strconv.Itoa(stagger),
|
||||
"--devices", joinIndexList(selected),
|
||||
}
|
||||
} else {
|
||||
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
@@ -399,7 +471,7 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: nvidiaVisibleDevicesEnv(selected),
|
||||
env: profEnv,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
@@ -412,6 +484,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -429,6 +508,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -446,6 +532,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -538,9 +631,19 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
|
||||
if passes <= 0 {
|
||||
passes = 1
|
||||
}
|
||||
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||||
// intentionally conservative enough for healthy systems while avoiding the
|
||||
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||||
timeoutSec := sizeMB*passes*20/100 + 60
|
||||
if timeoutSec < 180 {
|
||||
timeoutSec = 180
|
||||
}
|
||||
if timeoutSec > 900 {
|
||||
timeoutSec = 900
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
@@ -648,11 +751,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
type satJob struct {
|
||||
@@ -838,11 +937,7 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
}
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||
@@ -905,7 +1000,7 @@ func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPU
|
||||
entry.Health = "UNKNOWN"
|
||||
}
|
||||
if entry.Name == "" {
|
||||
entry.Name = "unknown"
|
||||
entry.Name = "Unknown GPU"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||
@@ -1376,8 +1471,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
||||
if len(metricRows) > 0 {
|
||||
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
||||
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
||||
chart := RenderGPUTerminalChart(metricRows)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
|
||||
}
|
||||
|
||||
return out, err
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -42,27 +43,56 @@ type GPUStressMetric struct {
|
||||
|
||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||
type FanStressRow struct {
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64
|
||||
SysPowerSource string
|
||||
SysPowerMode string
|
||||
}
|
||||
|
||||
// cachedPowerReading is a system power value together with its provenance and
// the time it was recorded.
type cachedPowerReading struct {
	// Value is the power reading; presumably watts — confirm at the call site.
	Value float64
	// Source, Mode and Reason describe how the reading was obtained.
	Source string
	Mode   string
	Reason string
	// UpdatedAt is when the reading was stored.
	UpdatedAt time.Time
}
|
||||
|
||||
// fanObservationState is the persisted record of the highest RPM observed per
// fan, keyed by fan name. It is stored as JSON under the "max_rpm" key.
type fanObservationState struct {
	MaxRPM map[string]float64 `json:"max_rpm"`
}
|
||||
|
||||
// fanPeakCandidate is a not-yet-accepted new maximum for one fan: the highest
// RPM seen since the candidate was opened and the time it was first seen.
type fanPeakCandidate struct {
	// FirstSeen is when a reading above the known maximum first appeared.
	FirstSeen time.Time
	// RPM is the highest reading recorded for this candidate so far.
	RPM float64
}
|
||||
|
||||
var (
|
||||
systemPowerCacheMu sync.Mutex
|
||||
systemPowerCache cachedPowerReading
|
||||
fanObservationMu sync.Mutex
|
||||
fanObservation fanObservationState
|
||||
fanObservationInit bool
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
)
|
||||
|
||||
const (
	// systemPowerHoldTTL is presumably how long a cached system power reading
	// stays usable — confirm at the call site.
	systemPowerHoldTTL = 15 * time.Second

	// fanObservationMinPeakHold is how long a reading above the known maximum
	// must have been pending before updateFanObservation accepts a new maximum.
	fanObservationMinPeakHold = time.Second
)

// fanObservationStatePath is the JSON file the per-fan maximum RPM values are
// loaded from and saved to. It is a variable rather than a constant,
// presumably so tests can redirect it.
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
|
||||
|
||||
// normalizeObservedFanMaxRPM rounds a positive RPM value up to the next
// multiple of 1000. Zero and negative inputs yield 0.
func normalizeObservedFanMaxRPM(rpm float64) float64 {
	const rpmBucket = 1000.0
	if rpm <= 0 {
		return 0
	}
	buckets := math.Ceil(rpm / rpmBucket)
	return buckets * rpmBucket
}
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
@@ -223,11 +253,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
@@ -257,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
|
||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||
row.Fans, _ = sampleFanSpeeds()
|
||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||
row.SysPowerW = sampleSystemPower()
|
||||
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
|
||||
return row
|
||||
}
|
||||
|
||||
@@ -314,11 +340,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err == nil {
|
||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
}
|
||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||
if len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
if err != nil {
|
||||
@@ -327,6 +355,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
return nil, sensorsErr
|
||||
}
|
||||
|
||||
func loadFanObservationLocked() {
|
||||
if fanObservationInit {
|
||||
return
|
||||
}
|
||||
fanObservationInit = true
|
||||
fanObservation.MaxRPM = make(map[string]float64)
|
||||
raw, err := os.ReadFile(fanObservationStatePath)
|
||||
if err != nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
var persisted fanObservationState
|
||||
if json.Unmarshal(raw, &persisted) != nil {
|
||||
return
|
||||
}
|
||||
for name, rpm := range persisted.MaxRPM {
|
||||
name = strings.TrimSpace(name)
|
||||
if name == "" || rpm <= 0 {
|
||||
continue
|
||||
}
|
||||
fanObservation.MaxRPM[name] = rpm
|
||||
}
|
||||
}
|
||||
|
||||
func saveFanObservationLocked() {
|
||||
if len(fanObservation.MaxRPM) == 0 {
|
||||
return
|
||||
}
|
||||
dir := filepath.Dir(fanObservationStatePath)
|
||||
if dir == "" || dir == "." {
|
||||
dir = "/var/log/bee-sat"
|
||||
}
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return
|
||||
}
|
||||
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||
}
|
||||
|
||||
func updateFanObservation(fans []FanReading, now time.Time) {
|
||||
if len(fans) == 0 {
|
||||
return
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
changed := false
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
currentMax := fanObservation.MaxRPM[name]
|
||||
if fan.RPM <= currentMax {
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if cand, ok := fanPeakCandidates[name]; ok {
|
||||
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
|
||||
newMax := math.Max(cand.RPM, fan.RPM)
|
||||
if newMax > currentMax {
|
||||
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
|
||||
changed = true
|
||||
}
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if fan.RPM > cand.RPM {
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
|
||||
}
|
||||
continue
|
||||
}
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
|
||||
}
|
||||
if changed {
|
||||
saveFanObservationLocked()
|
||||
}
|
||||
}
|
||||
|
||||
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
var samples []float64
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
maxRPM := fanObservation.MaxRPM[name]
|
||||
if maxRPM <= 0 {
|
||||
continue
|
||||
}
|
||||
pct := fan.RPM / maxRPM * 100.0
|
||||
if pct > 100 {
|
||||
pct = 100
|
||||
}
|
||||
if pct < 0 {
|
||||
pct = 0
|
||||
}
|
||||
samples = append(samples, pct)
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
//
|
||||
@@ -430,6 +571,116 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||
return fans, nil
|
||||
}
|
||||
|
||||
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||
// Returns the average duty cycle across all exposed PWM controls.
|
||||
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
fans, fanErr := sampleFanSpeeds()
|
||||
if fanErr != nil {
|
||||
return 0, false, false
|
||||
}
|
||||
return sampleFanDutyCyclePctFromFans(fans)
|
||||
}
|
||||
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||
return pct, ok, false
|
||||
}
|
||||
|
||||
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false, false
|
||||
}
|
||||
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||
return pct, true, true
|
||||
}
|
||||
return 0, false, false
|
||||
}
|
||||
|
||||
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||
var doc map[string]map[string]any
|
||||
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||
return 0, false
|
||||
}
|
||||
var samples []float64
|
||||
for _, features := range doc {
|
||||
for name, feature := range features {
|
||||
if strings.EqualFold(name, "Adapter") {
|
||||
continue
|
||||
}
|
||||
featureMap, ok := feature.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if duty, ok := firstFanDutyValue(name, featureMap); ok {
|
||||
samples = append(samples, duty)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
|
||||
featureName = strings.ToLower(strings.TrimSpace(featureName))
|
||||
if strings.Contains(featureName, "enable") || strings.Contains(featureName, "mode") || strings.Contains(featureName, "alarm") {
|
||||
return 0, false
|
||||
}
|
||||
if strings.Contains(featureName, "pwm") {
|
||||
for _, key := range []string{"input", "value", "current"} {
|
||||
if value, ok := feature[key]; ok {
|
||||
if duty, parsed := parseFanDutyValue(value); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if !strings.Contains(lower, "pwm") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(lower, "enable") || strings.Contains(lower, "mode") || strings.Contains(lower, "alarm") {
|
||||
continue
|
||||
}
|
||||
if duty, parsed := parseFanDutyValue(feature[key]); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func parseFanDutyValue(value any) (float64, bool) {
|
||||
switch v := value.(type) {
|
||||
case float64:
|
||||
return normalizePWMAsDutyPct(v)
|
||||
case string:
|
||||
if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil {
|
||||
return normalizePWMAsDutyPct(f)
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// normalizePWMAsDutyPct maps a raw PWM reading onto a 0–100 percentage.
//
//   - raw < 0           -> rejected
//   - 0 <= raw <= 100   -> returned unchanged (treated as a percentage)
//   - 100 < raw <= 255  -> scaled as raw/255*100 (treated as an 8-bit value)
//   - anything else     -> rejected
//
// The boolean is false for rejected inputs.
func normalizePWMAsDutyPct(raw float64) (float64, bool) {
	switch {
	case raw < 0:
		return 0, false
	case raw <= 100:
		return raw, true
	case raw <= 255:
		return raw / 255.0 * 100.0, true
	default:
		return 0, false
	}
}
|
||||
|
||||
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
@@ -517,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
// sampleSystemPowerResolved reads system power via the global autotune source,
|
||||
// falling back to the historical heuristic before autotune or when degraded.
|
||||
func sampleSystemPowerResolved() (float64, string, string) {
|
||||
now := time.Now()
|
||||
current := 0.0
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err == nil {
|
||||
current = parseDCMIPowerReading(string(out))
|
||||
}
|
||||
current, decision, err := SampleSystemPowerResolved("")
|
||||
systemPowerCacheMu.Lock()
|
||||
defer systemPowerCacheMu.Unlock()
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||
if err != nil {
|
||||
current = 0
|
||||
}
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
|
||||
systemPowerCache = updated
|
||||
return value
|
||||
return value, updated.Source, updated.Mode
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||
@@ -552,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
|
||||
if current > 0 {
|
||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
|
||||
return current, cache
|
||||
}
|
||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -29,6 +30,74 @@ func TestFirstFanInputValue(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||
raw := []byte(`{
|
||||
"chip0": {
|
||||
"fan1": {"input": 9000},
|
||||
"pwm1": {"input": 128},
|
||||
"pwm1_enable": {"input": 1}
|
||||
},
|
||||
"chip1": {
|
||||
"pwm2": {"input": 64}
|
||||
}
|
||||
}`)
|
||||
|
||||
got, ok := parseFanDutyCyclePctSensorsJSON(raw)
|
||||
if !ok {
|
||||
t.Fatalf("expected duty cycle telemetry to be parsed")
|
||||
}
|
||||
if got < 57 || got > 58 {
|
||||
t.Fatalf("got=%v want ~57.1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldPath := fanObservationStatePath
|
||||
oldState := fanObservation
|
||||
oldInit := fanObservationInit
|
||||
oldCandidates := fanPeakCandidates
|
||||
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
t.Cleanup(func() {
|
||||
fanObservationStatePath = oldPath
|
||||
fanObservation = oldState
|
||||
fanObservationInit = oldInit
|
||||
fanPeakCandidates = oldCandidates
|
||||
})
|
||||
|
||||
start := time.Unix(100, 0)
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||
t.Fatalf("single-sample spike should not establish observed max")
|
||||
}
|
||||
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||
|
||||
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("got=%v want ~43.3", got)
|
||||
}
|
||||
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDCMIPowerReading(t *testing.T) {
|
||||
raw := `
|
||||
Instantaneous power reading: 512 Watts
|
||||
@@ -43,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
now := time.Now()
|
||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
|
||||
if got != 480 {
|
||||
t.Fatalf("got=%v want cached 480", got)
|
||||
}
|
||||
@@ -51,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
|
||||
if got != 530 {
|
||||
t.Fatalf("got=%v want 530", got)
|
||||
}
|
||||
@@ -60,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
}
|
||||
|
||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
|
||||
if got != 0 {
|
||||
t.Fatalf("expired cache returned %v want 0", got)
|
||||
}
|
||||
|
||||
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||
if len(env) != 2 {
|
||||
|
||||
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
|
||||
}
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
if name == "bee-nvidia" && action == ServiceRestart {
|
||||
return runNvidiaRecover("restart-drivers")
|
||||
}
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||
|
||||
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
||||
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
||||
{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
|
||||
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
|
||||
@@ -9,6 +9,17 @@ type LiveBootSource struct {
|
||||
Device string `json:"device,omitempty"`
|
||||
}
|
||||
|
||||
type LiveMediaRAMState struct {
|
||||
LiveBootSource
|
||||
State string `json:"state"`
|
||||
Status string `json:"status"`
|
||||
ToramActive bool `json:"toram_active,omitempty"`
|
||||
CopyPresent bool `json:"copy_present,omitempty"`
|
||||
CopyComplete bool `json:"copy_complete,omitempty"`
|
||||
CanStartCopy bool `json:"can_start_copy,omitempty"`
|
||||
Message string `json:"message,omitempty"`
|
||||
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
@@ -70,6 +81,7 @@ type NvidiaStressOptions struct {
|
||||
Loader string
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
StaggerSeconds int
|
||||
}
|
||||
|
||||
func New() *System {
|
||||
|
||||
@@ -15,13 +15,17 @@ type HardwareIngestRequest struct {
|
||||
}
|
||||
|
||||
type RuntimeHealth struct {
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
|
||||
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||
@@ -183,6 +187,13 @@ type HardwarePCIeDevice struct {
|
||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
@@ -35,6 +36,16 @@ var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, err
|
||||
return a.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
// Default queue priorities per task family, as assigned by
// defaultTaskPriority.
//
// NOTE(review): whether a larger value means "runs earlier" or "runs later"
// is decided by the task queue, which is not shown here — confirm before
// changing the ordering.
const (
	taskPriorityBenchmark      = 10 // NVIDIA benchmark and autotune tasks
	taskPriorityBurn           = 20 // burn / stress tasks
	taskPriorityValidateStress = 30 // validation tasks run in stress mode
	taskPriorityValidate       = 40 // regular validation tasks
	taskPriorityAudit          = 50 // hardware audit
	taskPriorityInstallToRAM   = 60 // copy live media to RAM
	taskPriorityInstall        = 70 // install to disk
)
|
||||
|
||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||
|
||||
var jobCounter atomic.Uint64
|
||||
@@ -99,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
||||
|
||||
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
|
||||
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
|
||||
"nvidia-bandwidth", "nvidia-stress":
|
||||
return true
|
||||
@@ -108,6 +119,30 @@ func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func defaultTaskPriority(target string, params taskParams) int {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "install":
|
||||
return taskPriorityInstall
|
||||
case "install-to-ram":
|
||||
return taskPriorityInstallToRAM
|
||||
case "audit":
|
||||
return taskPriorityAudit
|
||||
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||
return taskPriorityBenchmark
|
||||
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
||||
return taskPriorityBurn
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
|
||||
"amd", "amd-mem", "amd-bandwidth":
|
||||
if params.StressMode {
|
||||
return taskPriorityValidateStress
|
||||
}
|
||||
return taskPriorityValidate
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||
if len(gpus) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||
@@ -209,6 +244,14 @@ func joinTaskIndices(indices []int) string {
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
|
||||
// formatGPUIndexList renders GPU indices as a comma-separated list in the
// order given, e.g. []int{2, 0} -> "2,0". An empty or nil slice yields "".
func formatGPUIndexList(indices []int) string {
	var sb strings.Builder
	for pos, idx := range indices {
		if pos > 0 {
			sb.WriteByte(',')
		}
		sb.WriteString(strconv.Itoa(idx))
	}
	return sb.String()
}
|
||||
|
||||
func formatSplitTaskName(baseName, selectionLabel string) string {
|
||||
baseName = strings.TrimSpace(baseName)
|
||||
selectionLabel = strings.TrimSpace(selectionLabel)
|
||||
@@ -449,6 +492,7 @@ func (h *handler) handleAPIAuditRun(w http.ResponseWriter, _ *http.Request) {
|
||||
ID: newJobID("audit"),
|
||||
Name: "Audit",
|
||||
Target: "audit",
|
||||
Priority: defaultTaskPriority("audit", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
@@ -487,6 +531,8 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||
ParallelGPUs bool `json:"parallel_gpus"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
@@ -508,12 +554,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
StaggerGPUStart: body.StaggerGPUStart,
|
||||
ParallelGPUs: body.ParallelGPUs,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
}
|
||||
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
@@ -525,57 +573,208 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
RunNCCL *bool `json:"run_nccl"`
|
||||
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||
RampUp *bool `json:"ramp_up"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
runNCCL := true
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
parallelGPUs := false
|
||||
if body.ParallelGPUs != nil {
|
||||
parallelGPUs = *body.ParallelGPUs
|
||||
}
|
||||
rampUp := false
|
||||
if body.RampUp != nil {
|
||||
rampUp = *body.RampUp
|
||||
}
|
||||
// Build a descriptive base name that includes profile and mode so the task
|
||||
// list is self-explanatory without opening individual task detail pages.
|
||||
profile := strings.TrimSpace(body.Profile)
|
||||
if profile == "" {
|
||||
profile = "standard"
|
||||
}
|
||||
name := taskDisplayName(target, "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
// Append profile tag.
|
||||
name = fmt.Sprintf("%s · %s", name, profile)
|
||||
|
||||
if target == "nvidia-bench-power" && parallelGPUs {
|
||||
writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
|
||||
return
|
||||
}
|
||||
|
||||
if rampUp && len(body.GPUIndices) > 1 {
|
||||
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
|
||||
// in Phase 2 (one additional GPU per step). A single task with all
|
||||
// selected GPUs is sufficient — spawning N tasks with growing subsets
|
||||
// would repeat all earlier steps redundantly.
|
||||
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
if len(resolved) < 2 {
|
||||
// Fall through to normal single-task path.
|
||||
rampUp = false
|
||||
} else {
|
||||
now := time.Now()
|
||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
|
||||
t := &Task{
|
||||
ID: newJobID("bee-bench-nvidia"),
|
||||
Name: taskName,
|
||||
Target: target,
|
||||
Priority: defaultTaskPriority(target, taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: now,
|
||||
params: taskParams{
|
||||
GPUIndices: append([]int(nil), resolved...),
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: true,
|
||||
RampTotal: len(resolved),
|
||||
RampRunID: rampRunID,
|
||||
DisplayName: taskName,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeTaskRunResponse(w, []*Task{t})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// For non-ramp tasks append mode tag.
|
||||
if parallelGPUs {
|
||||
name = fmt.Sprintf("%s · parallel", name)
|
||||
} else {
|
||||
name = fmt.Sprintf("%s · sequential", name)
|
||||
}
|
||||
|
||||
params := taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: parallelGPUs,
|
||||
DisplayName: body.DisplayName,
|
||||
}
|
||||
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
BenchmarkKind string `json:"benchmark_kind"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
profile := strings.TrimSpace(body.Profile)
|
||||
if profile == "" {
|
||||
profile = "standard"
|
||||
}
|
||||
benchmarkKind := strings.TrimSpace(body.BenchmarkKind)
|
||||
if benchmarkKind == "" {
|
||||
benchmarkKind = "power-fit"
|
||||
}
|
||||
now := time.Now()
|
||||
taskName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, benchmarkKind)
|
||||
t := &Task{
|
||||
ID: newJobID("bee-bench-autotune"),
|
||||
Name: taskName,
|
||||
Target: "nvidia-bench-autotune",
|
||||
Priority: defaultTaskPriority("nvidia-bench-autotune", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: now,
|
||||
params: taskParams{
|
||||
BenchmarkProfile: profile,
|
||||
BenchmarkKind: benchmarkKind,
|
||||
SizeMB: body.SizeMB,
|
||||
DisplayName: taskName,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeTaskRunResponse(w, []*Task{t})
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
RunNCCL *bool `json:"run_nccl"`
|
||||
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
writeJSON(w, map[string]any{
|
||||
"configured": false,
|
||||
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
runNCCL := true
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
parallelGPUs := false
|
||||
if body.ParallelGPUs != nil {
|
||||
parallelGPUs = *body.ParallelGPUs
|
||||
}
|
||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: parallelGPUs,
|
||||
DisplayName: body.DisplayName,
|
||||
}, name, h.opts.App, "benchmark-nvidia")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
writeJSON(w, map[string]any{
|
||||
"configured": true,
|
||||
"config": cfg,
|
||||
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -610,6 +809,9 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
@@ -950,25 +1152,62 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
status := h.opts.App.LiveBootSource()
|
||||
status := h.currentRAMStatus()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(status)
|
||||
}
|
||||
|
||||
type ramStatusResponse struct {
|
||||
platform.LiveMediaRAMState
|
||||
InstallTaskActive bool `json:"install_task_active,omitempty"`
|
||||
CopyTaskActive bool `json:"copy_task_active,omitempty"`
|
||||
CanStartTask bool `json:"can_start_task,omitempty"`
|
||||
BlockedReason string `json:"blocked_reason,omitempty"`
|
||||
}
|
||||
|
||||
func (h *handler) currentRAMStatus() ramStatusResponse {
|
||||
state := h.opts.App.LiveMediaRAMState()
|
||||
resp := ramStatusResponse{LiveMediaRAMState: state}
|
||||
if globalQueue.hasActiveTarget("install") {
|
||||
resp.InstallTaskActive = true
|
||||
resp.BlockedReason = "install to disk is already running"
|
||||
return resp
|
||||
}
|
||||
if globalQueue.hasActiveTarget("install-to-ram") {
|
||||
resp.CopyTaskActive = true
|
||||
resp.BlockedReason = "install to RAM task is already pending or running"
|
||||
return resp
|
||||
}
|
||||
if state.InRAM {
|
||||
resp.BlockedReason = "system is already running from RAM"
|
||||
return resp
|
||||
}
|
||||
resp.CanStartTask = state.CanStartCopy
|
||||
if !resp.CanStartTask && resp.BlockedReason == "" {
|
||||
resp.BlockedReason = state.Message
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
if globalQueue.hasActiveTarget("install") {
|
||||
writeError(w, http.StatusConflict, "install to disk is already running")
|
||||
status := h.currentRAMStatus()
|
||||
if !status.CanStartTask {
|
||||
msg := strings.TrimSpace(status.BlockedReason)
|
||||
if msg == "" {
|
||||
msg = "install to RAM is not available"
|
||||
}
|
||||
writeError(w, http.StatusConflict, msg)
|
||||
return
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("install-to-ram"),
|
||||
Name: "Install to RAM",
|
||||
Target: "install-to-ram",
|
||||
Priority: 10,
|
||||
Priority: defaultTaskPriority("install-to-ram", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
@@ -1083,7 +1322,7 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
|
||||
ID: newJobID("install"),
|
||||
Name: "Install to Disk",
|
||||
Target: "install",
|
||||
Priority: 20,
|
||||
Priority: defaultTaskPriority("install", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
@@ -1359,6 +1598,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
|
||||
writeJSON(w, map[string]string{"status": "rolled back"})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||
}
|
||||
|
||||
func (h *handler) rollbackPendingNetworkChange() error {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
@@ -1375,4 +1619,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -39,6 +39,9 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||
t.Fatalf("burn profile=%q want smoke", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
@@ -61,7 +64,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
@@ -75,8 +78,8 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-benchmark" {
|
||||
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
|
||||
if task.Target != "nvidia-bench-perf" {
|
||||
t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
|
||||
}
|
||||
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||
@@ -84,6 +87,9 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
if task.params.RunNCCL {
|
||||
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
@@ -107,7 +113,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
@@ -133,6 +139,94 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
|
||||
// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-power" {
|
||||
t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
if task.params.RampTotal != 3 {
|
||||
t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-autotune" {
|
||||
t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
|
||||
}
|
||||
if task.params.BenchmarkKind != "power-fit" {
|
||||
t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
@@ -175,6 +269,41 @@ func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultTaskPriorityOrder(t *testing.T) {
|
||||
got := []int{
|
||||
defaultTaskPriority("install-to-ram", taskParams{}),
|
||||
defaultTaskPriority("audit", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{StressMode: true}),
|
||||
defaultTaskPriority("nvidia-stress", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-perf", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-power", taskParams{}),
|
||||
}
|
||||
want := []int{
|
||||
taskPriorityInstallToRAM,
|
||||
taskPriorityAudit,
|
||||
taskPriorityValidate,
|
||||
taskPriorityValidateStress,
|
||||
taskPriorityBurn,
|
||||
taskPriorityBenchmark,
|
||||
taskPriorityBenchmark,
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
|
||||
t.Fatalf("priority order=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
|
||||
@@ -83,6 +83,10 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points (one per pixel) before building SVG.
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
statsLabel := chartStatsLabel(datasets)
|
||||
|
||||
legendItems := []metricChartSeries{}
|
||||
@@ -196,6 +200,19 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points before building SVG.
|
||||
{
|
||||
datasets := make([][]float64, len(series))
|
||||
for i := range series {
|
||||
datasets[i] = series[i].Values
|
||||
}
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
for i := range series {
|
||||
series[i].Values = datasets[i]
|
||||
}
|
||||
}
|
||||
|
||||
scales := make([]chartScale, len(series))
|
||||
for i := range series {
|
||||
min, max := chartSeriesBounds(series[i].Values)
|
||||
@@ -445,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
||||
return out
|
||||
}
|
||||
|
||||
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||
// power charts where the filled area of each PSU shows its individual
|
||||
// contribution and the total height equals the combined draw.
|
||||
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range datasets {
|
||||
if len(datasets[i]) == 0 {
|
||||
datasets[i] = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
// Build cumulative sums per time point.
|
||||
cumulative := make([][]float64, len(datasets)+1)
|
||||
for i := range cumulative {
|
||||
cumulative[i] = make([]float64, pointCount)
|
||||
}
|
||||
for i, ds := range datasets {
|
||||
for j, v := range ds {
|
||||
cumulative[i+1][j] = cumulative[i][j] + v
|
||||
}
|
||||
}
|
||||
|
||||
// Scale is based on the total (top cumulative row).
|
||||
total := cumulative[len(cumulative)-1]
|
||||
yMin := floatPtr(0)
|
||||
if yMax == nil {
|
||||
yMax = autoMax120(total)
|
||||
}
|
||||
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||
|
||||
legendItems := make([]metricChartSeries, len(datasets))
|
||||
for i, name := range names {
|
||||
color := metricChartPalette[i%len(metricChartPalette)]
|
||||
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||
}
|
||||
|
||||
// Stats label from totals.
|
||||
statsLabel := chartStatsLabel([][]float64{total})
|
||||
|
||||
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scale)
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
writeSingleAxisY(&b, layout, scale)
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
|
||||
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
// Draw border polylines on top.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
|
||||
writeLegend(&b, layout, legendItems)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
// writeStackedArea draws a filled polygon between two cumulative value arrays
|
||||
// (baseline and top), using the given color at 55% opacity.
|
||||
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
|
||||
n := len(top)
|
||||
if n == 0 {
|
||||
return
|
||||
}
|
||||
if len(baseline) < n {
|
||||
baseline = make([]float64, n)
|
||||
}
|
||||
|
||||
// Forward path along top values, then backward along baseline values.
|
||||
var points strings.Builder
|
||||
for i := 0; i < n; i++ {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
if i > 0 {
|
||||
points.WriteByte(' ')
|
||||
}
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
for i := n - 1; i >= 0; i-- {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
points.WriteByte(' ')
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
|
||||
}
|
||||
|
||||
// writeSVGOpen writes the opening <svg> tag, using width and height both for
// the element size and for the viewBox, followed by a newline.
func writeSVGOpen(b *strings.Builder, width, height int) {
	openTag := fmt.Sprintf(`<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
	b.WriteString(openTag)
}
|
||||
@@ -626,6 +764,87 @@ func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
// downsampleTimeSeries thins a time series using min-max bucketing. The index
// range is split into maxPts/2 buckets (at least one), and each bucket keeps
// the positions of its minimum and maximum value in ascending index order.
// The first dataset whose length equals len(times) is the reference used to
// find those extremes; when no dataset qualifies, the first and last index of
// each bucket are kept instead.
//
// Every dataset with the same length as times is sampled at the selected
// indices so the series stay aligned. Datasets of any other length are
// returned as-is. When len(times) <= maxPts or maxPts <= 0 the inputs are
// returned unchanged.
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
	total := len(times)
	if maxPts <= 0 || total <= maxPts {
		return times, datasets
	}

	bucketCount := maxPts / 2
	if bucketCount < 1 {
		bucketCount = 1
	}

	// Pick the reference series: the first dataset aligned with times.
	var reference []float64
	for i := range datasets {
		if len(datasets[i]) == total {
			reference = datasets[i]
			break
		}
	}

	keep := make([]int, 0, maxPts)
	width := float64(total) / float64(bucketCount)
	for bucket := 0; bucket < bucketCount; bucket++ {
		first := int(math.Round(float64(bucket) * width))
		limit := int(math.Round(float64(bucket+1) * width))
		if limit > total {
			limit = total
		}
		if first >= limit {
			continue
		}

		// Without a reference keep the bucket's edges; with one keep the
		// positions of its extremes.
		early, late := first, limit-1
		if reference != nil {
			lowest, highest := first, first
			for i := first + 1; i < limit; i++ {
				if reference[i] < reference[lowest] {
					lowest = i
				}
				if reference[i] > reference[highest] {
					highest = i
				}
			}
			early, late = lowest, highest
			if early > late {
				early, late = late, early
			}
		}

		keep = append(keep, early)
		if late != early {
			keep = append(keep, late)
		}
	}

	sampledTimes := make([]time.Time, len(keep))
	for pos, idx := range keep {
		sampledTimes[pos] = times[idx]
	}

	sampledData := make([][]float64, len(datasets))
	for d := range datasets {
		series := datasets[d]
		if len(series) != total {
			// Not aligned with times: pass through untouched.
			sampledData[d] = series
			continue
		}
		picked := make([]float64, len(keep))
		for pos, idx := range keep {
			picked[pos] = series[idx]
		}
		sampledData[d] = picked
	}
	return sampledTimes, sampledData
}
|
||||
|
||||
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
|
||||
if !end.After(start) {
|
||||
return float64(left+right) / 2
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -17,6 +20,25 @@ type jobState struct {
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
serialPrefix string
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logBuf *bufio.Writer
|
||||
}
|
||||
|
||||
// readTaskLogFile returns the full contents of the task log at path. Files
// larger than 50 MB are rejected with an error instead of being loaded; open
// and read errors are returned unchanged.
func readTaskLogFile(path string) ([]byte, error) {
	const maxTaskLogBytes = 50 << 20

	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// Read at most one byte past the limit: enough to tell "exactly at the
	// limit" from "too large" without loading an oversized file.
	content, err := io.ReadAll(io.LimitReader(file, maxTaskLogBytes+1))
	switch {
	case err != nil:
		return nil, err
	case len(content) > maxTaskLogBytes:
		return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
	}
	return content, nil
}
|
||||
|
||||
// abort cancels the job if it has a cancel function and is not yet done.
|
||||
@@ -35,7 +57,7 @@ func (j *jobState) append(line string) {
|
||||
defer j.mu.Unlock()
|
||||
j.lines = append(j.lines, line)
|
||||
if j.logPath != "" {
|
||||
appendJobLog(j.logPath, line)
|
||||
j.writeLogLineLocked(line)
|
||||
}
|
||||
if j.serialPrefix != "" {
|
||||
taskSerialWriteLine(j.serialPrefix + line)
|
||||
@@ -48,6 +70,35 @@ func (j *jobState) append(line string) {
|
||||
}
|
||||
}
|
||||
|
||||
// writeLogLineLocked writes a line to the persistent log file, opening it lazily.
|
||||
// Must be called with j.mu held. Uses a buffered writer kept open for the task
|
||||
// lifetime — avoids thousands of open/close syscalls during high-frequency logs.
|
||||
func (j *jobState) writeLogLineLocked(line string) {
|
||||
if j.logFile == nil {
|
||||
f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
j.logFile = f
|
||||
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||
}
|
||||
_, _ = j.logBuf.WriteString(line + "\n")
|
||||
}
|
||||
|
||||
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||
func (j *jobState) closeLog() {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
if j.logBuf != nil {
|
||||
_ = j.logBuf.Flush()
|
||||
}
|
||||
if j.logFile != nil {
|
||||
_ = j.logFile.Close()
|
||||
j.logFile = nil
|
||||
j.logBuf = nil
|
||||
}
|
||||
}
|
||||
|
||||
func (j *jobState) finish(errMsg string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
@@ -119,7 +170,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||
if logPath == "" {
|
||||
return j
|
||||
}
|
||||
data, err := os.ReadFile(logPath)
|
||||
data, err := readTaskLogFile(logPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return j
|
||||
}
|
||||
|
||||
@@ -232,7 +232,7 @@ func truncate(s string, max int) string {
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||
func isSATTarget(target string) bool {
|
||||
switch target {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||
"platform-stress":
|
||||
|
||||
137
audit/internal/webui/layout.go
Normal file
137
audit/internal/webui/layout.go
Normal file
@@ -0,0 +1,137 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// layoutHead returns the start of an HTML page: doctype, <head> with the
// HTML-escaped title and the embedded stylesheet, and the opening <body> tag.
func layoutHead(title string) string {
	// Everything up to and including the opening <title> tag.
	const beforeTitle = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>`

	// Everything from the closing </title> tag through the opening <body> tag.
	const afterTitle = `</title>
<style>
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
*{box-sizing:border-box;margin:0;padding:0}
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
a{color:var(--accent);text-decoration:none}
/* Sidebar */
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
.nav{flex:1}
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
/* Content */
.main{flex:1;display:flex;flex-direction:column;overflow:auto}
.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
.content{padding:24px;flex:1}
/* Cards */
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
.card-head-actions{justify-content:space-between}
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
.card-body{padding:16px}
/* Buttons */
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
.btn-sm{padding:5px 10px;font-size:12px}
/* Tables */
table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
td{padding:9px 14px;border-top:1px solid var(--border-lite)}
tr:first-child td{border-top:0}
tbody tr:hover td{background:rgba(0,0,0,.03)}
/* Status badges */
.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
/* Component chips — one small square per device */
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
/* Output terminal */
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
/* Forms */
.form-row{margin-bottom:14px}
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
/* Grid */
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
/* iframe viewer */
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
/* Alerts */
.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
</style>
</head>
<body>
`

	safeTitle := html.EscapeString(title)
	return beforeTitle + safeTitle + afterTitle
}
|
||||
|
||||
// layoutNav renders the sidebar: logo, version line, an optional NVIDIA GSP
// badge and the navigation links. The link whose id equals active gets the
// "active" class. A blank buildLabel is shown as "dev". The badge is driven by
// the contents of /run/bee-nvidia-mode; when that file cannot be read no
// badge is rendered.
func layoutNav(active string, buildLabel string) string {
	type navEntry struct{ id, label, href, onclick string }
	entries := []navEntry{
		{"dashboard", "Dashboard", "/", ""},
		{"audit", "Audit", "/audit", ""},
		{"validate", "Validate", "/validate", ""},
		{"burn", "Burn", "/burn", ""},
		{"benchmark", "Benchmark", "/benchmark", ""},
		{"tasks", "Tasks", "/tasks", ""},
		{"tools", "Tools", "/tools", ""},
	}

	if strings.TrimSpace(buildLabel) == "" {
		buildLabel = "dev"
	}

	var out strings.Builder
	out.WriteString(`<aside class="sidebar">`)
	out.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
	out.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)

	if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
		if mode := strings.TrimSpace(string(raw)); mode == "gsp-off" {
			out.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
		} else if mode == "gsp-stuck" {
			out.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
		}
	}

	out.WriteString(`<nav class="nav">`)
	for _, entry := range entries {
		class := "nav-item"
		if entry.id == active {
			class += " active"
		}
		if entry.onclick == "" {
			fmt.Fprintf(&out, `<a class="%s" href="%s">%s</a>`, class, entry.href, entry.label)
			continue
		}
		fmt.Fprintf(&out, `<a class="%s" href="%s" onclick="%s">%s</a>`,
			class, entry.href, entry.onclick, entry.label)
	}
	out.WriteString(`</nav>`)
	out.WriteString(`</aside>`)
	return out.String()
}
|
||||
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
|
||||
cpu_load_pct REAL,
|
||||
mem_load_pct REAL,
|
||||
power_w REAL,
|
||||
power_source TEXT,
|
||||
power_mode TEXT,
|
||||
power_reason TEXT,
|
||||
PRIMARY KEY (ts)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
|
||||
return err
|
||||
}
|
||||
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
|
||||
}
|
||||
|
||||
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
||||
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -161,14 +173,64 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||
// the overall shape of every chart.
|
||||
//
|
||||
// Called hourly by the metrics collector background goroutine.
|
||||
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
start := deleteOlderThan.Unix()
|
||||
end := downsampleBefore.Unix()
|
||||
if end <= start {
|
||||
return nil
|
||||
}
|
||||
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
_, err := m.db.Exec(`
|
||||
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||
AND ts NOT IN (
|
||||
SELECT MIN(ts) FROM `+table+`
|
||||
WHERE ts >= ? AND ts < ?
|
||||
GROUP BY ts / 60
|
||||
)`, start, end, start, end)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||
func (m *MetricsDB) Prune(before time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
cutTS := before.Unix()
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
// The inner SELECT takes the n newest sys_metrics rows (ORDER BY ts DESC
// LIMIT ?) and the outer SELECT re-sorts them ascending by ts before the
// query is handed to loadSamples. In the query that selects the power_*
// columns, NULL power_source / power_mode / power_reason values are read
// back as empty strings via IFNULL.
// NOTE(review): two return statements follow because this listing is a diff
// whose +/- markers were lost; the first one (without the power_* columns)
// looks like the removed line — confirm against the real file.
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
// The query reads every sys_metrics row ordered by ts; in the query that
// selects the power_* columns, NULL power_source / power_mode / power_reason
// values are read back as empty strings via IFNULL.
// NOTE(review): loadSamples is variadic (args ...any), so the literal nil
// below is passed as ONE argument (args == []any{nil}), not as an empty
// argument list, although the query has no placeholders — confirm that
// loadSamples / the SQL driver tolerates this, or drop the nil.
// NOTE(review): two return statements follow because this listing is a diff
// whose +/- markers were lost; the first one (without the power_* columns)
// looks like the removed line — confirm against the real file.
|
||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// LoadBetween returns samples in chronological order within the given time window.
|
||||
@@ -183,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
|
||||
start, end = end, start
|
||||
}
|
||||
return m.loadSamples(
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
start.Unix(), end.Unix(),
|
||||
)
|
||||
}
|
||||
@@ -199,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
type sysRow struct {
|
||||
ts int64
|
||||
cpu, mem, pwr float64
|
||||
powerSource string
|
||||
powerMode string
|
||||
powerReason string
|
||||
}
|
||||
var sysRows []sysRow
|
||||
for rows.Next() {
|
||||
var r sysRow
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
|
||||
continue
|
||||
}
|
||||
sysRows = append(sysRows, r)
|
||||
@@ -313,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
s := platform.LiveMetricSample{
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
PowerSource: r.powerSource,
|
||||
PowerMode: r.powerMode,
|
||||
PowerReason: r.powerReason,
|
||||
}
|
||||
for _, idx := range gpuIndices {
|
||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||
|
||||
613
audit/internal/webui/page_benchmark.go
Normal file
613
audit/internal/webui/page_benchmark.go
Normal file
@@ -0,0 +1,613 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// benchmarkHistoryRun is one saved performance-benchmark run, reduced to the
// fields the results table needs.
type benchmarkHistoryRun struct {
	// generatedAt is the run's GeneratedAt timestamp; runs are sorted on it.
	generatedAt time.Time
	// displayTime is generatedAt rendered in local time as "2006-01-02 15:04:05".
	displayTime string

	// gpuScores maps GPU index to that GPU's composite score.
	gpuScores map[int]float64
	// gpuStatuses maps GPU index to that GPU's status string
	// (the renderer special-cases "", "OK", "FAILED", "WARNING" and "PARTIAL").
	gpuStatuses map[int]string

	// overallStatus is the run-level status; the renderer treats "" as "OK".
	overallStatus string
}
|
||||
|
||||
func renderBenchmark(opts HandlerOptions) string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="grid2">
|
||||
<div class="card">
|
||||
<div class="card-head">Benchmark Setup</div>
|
||||
<div class="card-body">
|
||||
<div class="form-row">
|
||||
<label>Profile</label>
|
||||
<select id="benchmark-profile">
|
||||
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||||
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||||
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<label>GPU Selection</label>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
|
||||
</div>
|
||||
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
</div>
|
||||
<label class="benchmark-cb-row">
|
||||
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Sequential — one GPU at a time</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row" id="benchmark-parallel-label">
|
||||
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Parallel — all selected GPUs simultaneously</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row" id="benchmark-ramp-label">
|
||||
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
|
||||
</label>
|
||||
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
|
||||
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
|
||||
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
|
||||
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
|
||||
</div>
|
||||
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||||
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
||||
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Method Split</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||||
<table>
|
||||
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||||
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||||
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||||
</table>
|
||||
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||
|
||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
|
||||
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
let benchmarkES = null;
|
||||
function benchmarkTaskIDs(payload) {
|
||||
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||
if (payload && payload.task_id) return [payload.task_id];
|
||||
return [];
|
||||
}
|
||||
function benchmarkSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function benchmarkMode() {
|
||||
const el = document.querySelector('input[name="benchmark-mode"]:checked');
|
||||
return el ? el.value : 'sequential';
|
||||
}
|
||||
function benchmarkUpdateSelectionNote() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const perfBtn = document.getElementById('benchmark-run-performance-btn');
|
||||
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
|
||||
const note = document.getElementById('benchmark-selection-note');
|
||||
if (!selected.length) {
|
||||
perfBtn.disabled = true;
|
||||
fitBtn.disabled = true;
|
||||
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||||
return;
|
||||
}
|
||||
perfBtn.disabled = false;
|
||||
fitBtn.disabled = false;
|
||||
const mode = benchmarkMode();
|
||||
if (mode === 'ramp-up') {
|
||||
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
|
||||
} else if (mode === 'parallel') {
|
||||
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
|
||||
} else {
|
||||
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
|
||||
}
|
||||
}
|
||||
function benchmarkRenderGPUList(gpus) {
|
||||
const root = document.getElementById('benchmark-gpu-list');
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="benchmark-gpu-row">'
|
||||
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
benchmarkApplyMultiGPUState(gpus.length);
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkApplyMultiGPUState(gpuCount) {
|
||||
var multiValues = ['parallel', 'ramp-up'];
|
||||
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
|
||||
radios.forEach(function(el) {
|
||||
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||||
if (gpuCount < 2 && isMulti) {
|
||||
el.disabled = true;
|
||||
if (el.checked) {
|
||||
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
|
||||
if (seq) seq.checked = true;
|
||||
}
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '0.4';
|
||||
} else {
|
||||
el.disabled = false;
|
||||
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '';
|
||||
}
|
||||
});
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkLoadGPUs() {
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
status.textContent = '';
|
||||
fetch('/api/gpu/nvidia').then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
}).then(function(gpus) {
|
||||
benchmarkRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
});
|
||||
}
|
||||
function benchmarkSelectAll() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkSelectNone() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function runNvidiaBenchmark(kind) {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
if (!selected.length) {
|
||||
status.textContent = 'Select at least one GPU.';
|
||||
return;
|
||||
}
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
const mode = benchmarkMode();
|
||||
const rampUp = mode === 'ramp-up' && selected.length > 1;
|
||||
const parallelGPUs = mode === 'parallel' && kind === 'performance';
|
||||
if (kind === 'power-fit' && mode === 'parallel') {
|
||||
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
|
||||
return;
|
||||
}
|
||||
const body = {
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
gpu_indices: selected,
|
||||
run_nccl: kind === 'performance' && selected.length > 1,
|
||||
parallel_gpus: parallelGPUs,
|
||||
ramp_up: rampUp,
|
||||
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
|
||||
};
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
|
||||
status.textContent = 'Queueing...';
|
||||
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
|
||||
fetch(endpoint, {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(body)
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
const taskIds = benchmarkTaskIDs(d);
|
||||
if (!taskIds.length) throw new Error('No benchmark task was queued.');
|
||||
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
|
||||
const streamNext = function(idx, failures) {
|
||||
if (idx >= taskIds.length) {
|
||||
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
|
||||
return;
|
||||
}
|
||||
const taskId = taskIds[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
|
||||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
if (e.data) failures += 1;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
const isLast = (idx + 1 >= taskIds.length);
|
||||
streamNext(idx + 1, failures);
|
||||
if (isLast) { benchmarkRefreshResults(); }
|
||||
});
|
||||
benchmarkES.onerror = function() {
|
||||
if (benchmarkES) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
streamNext(idx + 1, failures + 1);
|
||||
};
|
||||
};
|
||||
streamNext(0, 0);
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
});
|
||||
}
|
||||
function benchmarkRenderAutotuneStatus(payload) {
|
||||
const el = document.getElementById('benchmark-autotune-status');
|
||||
if (!el) return;
|
||||
if (!payload || !payload.configured || !payload.config) {
|
||||
el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
|
||||
return;
|
||||
}
|
||||
const cfg = payload.config || {};
|
||||
const decision = payload.decision || {};
|
||||
const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
|
||||
const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
|
||||
const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
|
||||
const mode = decision.mode ? (' · mode ' + decision.mode) : '';
|
||||
el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
|
||||
}
|
||||
function loadBenchmarkAutotuneStatus() {
|
||||
fetch('/api/bee-bench/nvidia/autotune/status')
|
||||
.then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
})
|
||||
.then(function(body) { benchmarkRenderAutotuneStatus(body); })
|
||||
.catch(function(err) {
|
||||
const el = document.getElementById('benchmark-autotune-status');
|
||||
if (el) el.textContent = 'Autotune status error: ' + err.message;
|
||||
});
|
||||
}
|
||||
function runBenchmarkAutotune() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
|
||||
term.textContent = 'Enqueuing benchmark autotune...\n';
|
||||
status.textContent = 'Queueing autotune...';
|
||||
fetch('/api/bee-bench/nvidia/autotune/run', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify({
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
|
||||
gpu_indices: selected
|
||||
})
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
const taskIds = benchmarkTaskIDs(d);
|
||||
if (!taskIds.length) throw new Error('No autotune task was queued.');
|
||||
const taskId = taskIds[0];
|
||||
status.textContent = 'Autotune queued: ' + taskId;
|
||||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
if (benchmarkES) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
}
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
|
||||
loadBenchmarkAutotuneStatus();
|
||||
});
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Autotune error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
});
|
||||
}
|
||||
benchmarkLoadGPUs();
|
||||
loadBenchmarkAutotuneStatus();
|
||||
function benchmarkRefreshResults() {
|
||||
fetch('/api/benchmark/results')
|
||||
.then(function(r) { return r.text(); })
|
||||
.then(function(html) {
|
||||
const el = document.getElementById('benchmark-results-section');
|
||||
if (el) el.innerHTML = html;
|
||||
})
|
||||
.catch(function() {});
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCard(exportDir string) string {
|
||||
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||
perf := renderBenchmarkResultsCardFromRuns(
|
||||
"Perf Results",
|
||||
"Composite score by saved benchmark run and GPU.",
|
||||
"No saved performance benchmark runs yet.",
|
||||
maxIdx,
|
||||
runs,
|
||||
)
|
||||
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||
return perf + "\n" + power
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
||||
if len(runs) == 0 {
|
||||
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
|
||||
if strings.TrimSpace(description) != "" {
|
||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<div style="overflow-x:auto">`)
|
||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||||
for i := 0; i <= maxGPUIndex; i++ {
|
||||
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||||
}
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
overallColor := "var(--ok)"
|
||||
overallLabel := run.overallStatus
|
||||
if overallLabel == "" {
|
||||
overallLabel = "OK"
|
||||
}
|
||||
if overallLabel == "FAILED" {
|
||||
overallColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if overallLabel != "OK" {
|
||||
overallColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||||
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||||
score, ok := run.gpuScores[idx]
|
||||
if !ok {
|
||||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||
continue
|
||||
}
|
||||
gpuStatus := run.gpuStatuses[idx]
|
||||
scoreColor := ""
|
||||
switch gpuStatus {
|
||||
case "FAILED":
|
||||
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||||
case "WARNING", "PARTIAL":
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
case "", "OK":
|
||||
default:
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
}
|
||||
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||
}
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||||
baseDir := app.DefaultBeeBenchPerfDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return -1, nil
|
||||
}
|
||||
sort.Strings(paths)
|
||||
return loadBenchmarkHistoryFromPaths(paths)
|
||||
}
|
||||
|
||||
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
|
||||
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||
maxGPUIndex := -1
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var result platform.NvidiaBenchmarkResult
|
||||
if err := json.Unmarshal(raw, &result); err != nil {
|
||||
continue
|
||||
}
|
||||
run := benchmarkHistoryRun{
|
||||
generatedAt: result.GeneratedAt,
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
gpuScores: make(map[int]float64),
|
||||
gpuStatuses: make(map[int]string),
|
||||
overallStatus: result.OverallStatus,
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||
if gpu.Index > maxGPUIndex {
|
||||
maxGPUIndex = gpu.Index
|
||||
}
|
||||
}
|
||||
runs = append(runs, run)
|
||||
}
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
return maxGPUIndex, runs
|
||||
}
|
||||
|
||||
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
baseDir := app.DefaultBeeBenchPowerDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||
}
|
||||
sort.Strings(paths)
|
||||
|
||||
type powerRun struct {
|
||||
generatedAt time.Time
|
||||
displayTime string
|
||||
result platform.NvidiaPowerBenchResult
|
||||
}
|
||||
var runs []powerRun
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var r platform.NvidiaPowerBenchResult
|
||||
if err := json.Unmarshal(raw, &r); err != nil {
|
||||
continue
|
||||
}
|
||||
runs = append(runs, powerRun{
|
||||
generatedAt: r.GeneratedAt,
|
||||
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
result: r,
|
||||
})
|
||||
}
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||
|
||||
latest := runs[0].result
|
||||
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||
if latest.Hostname != "" {
|
||||
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||
}
|
||||
if latest.OverallStatus != "" {
|
||||
statusColor := "var(--ok)"
|
||||
if latest.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||
}
|
||||
b.WriteString(`</p>`)
|
||||
|
||||
if len(latest.GPUs) > 0 {
|
||||
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for _, gpu := range latest.GPUs {
|
||||
finalLimitW := gpu.StablePowerLimitW
|
||||
if finalLimitW <= 0 {
|
||||
finalLimitW = gpu.AppliedPowerLimitW
|
||||
}
|
||||
derated := gpu.Derated ||
|
||||
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||
rowStyle := ""
|
||||
finalStyle := ""
|
||||
if derated {
|
||||
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||
}
|
||||
statusLabel := gpu.Status
|
||||
if statusLabel == "" {
|
||||
statusLabel = "OK"
|
||||
}
|
||||
statusColor := "var(--ok)"
|
||||
if statusLabel == "FAILED" {
|
||||
statusColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if statusLabel != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
nominalStr := "-"
|
||||
if gpu.DefaultPowerLimitW > 0 {
|
||||
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||
}
|
||||
singleStr := "-"
|
||||
if gpu.AppliedPowerLimitW > 0 {
|
||||
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||
}
|
||||
multiStr := "-"
|
||||
if gpu.StablePowerLimitW > 0 {
|
||||
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||
}
|
||||
p95Str := "-"
|
||||
if gpu.MaxObservedPowerW > 0 {
|
||||
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||
}
|
||||
b.WriteString(`<tr` + rowStyle + `>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div>`)
|
||||
}
|
||||
|
||||
if len(runs) > 1 {
|
||||
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
statusColor := "var(--ok)"
|
||||
if run.result.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></details>`)
|
||||
}
|
||||
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
383
audit/internal/webui/page_burn.go
Normal file
383
audit/internal/webui/page_burn.go
Normal file
@@ -0,0 +1,383 @@
|
||||
package webui
|
||||
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Burn Profile</div>
|
||||
<div class="card-body burn-profile-body">
|
||||
<div class="burn-profile-col">
|
||||
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
|
||||
</div>
|
||||
<div class="burn-profile-col burn-profile-action">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||
<p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
|
||||
</div>
|
||||
<div class="burn-profile-col burn-profile-action">
|
||||
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
|
||||
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
|
||||
</div>
|
||||
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||
<div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
|
||||
<label class="cb-row">
|
||||
<input type="radio" name="burn-nvidia-mode" value="sequential" checked>
|
||||
<span>Sequential — selected GPUs one at a time</span>
|
||||
</label>
|
||||
<label class="cb-row" id="burn-parallel-label">
|
||||
<input type="radio" name="burn-nvidia-mode" value="parallel">
|
||||
<span>Parallel — all selected GPUs simultaneously</span>
|
||||
</label>
|
||||
<label class="cb-row" id="burn-ramp-label">
|
||||
<input type="radio" name="burn-nvidia-mode" value="ramp-up">
|
||||
<span>Ramp-up — add one GPU at a time</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="burn-section">Core Burn Paths</div>
|
||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||
<div class="card burn-card">
|
||||
<div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
|
||||
<div class="card-body burn-card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card burn-card">
|
||||
<div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
|
||||
<div class="card-body burn-card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
|
||||
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
|
||||
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
|
||||
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.burn-profile-col { min-width:0; }
|
||||
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
|
||||
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
|
||||
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
|
||||
.burn-grid { align-items:stretch; }
|
||||
.burn-card { height:100%; display:flex; flex-direction:column; }
|
||||
.burn-card-body { flex:1; display:flex; flex-direction:column; }
|
||||
.card-head-actions { justify-content:space-between; }
|
||||
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
|
||||
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
let biES = null;
|
||||
// Normalises an enqueue response into an array of task IDs: a non-empty
// task_ids array wins, then a single task_id, otherwise an empty array.
function burnTaskIDs(payload) {
  if (!payload) return [];
  var many = payload.task_ids;
  if (Array.isArray(many) && many.length) return many;
  return payload.task_id ? [payload.task_id] : [];
}
|
||||
// Returns the value of the checked "burn-profile" radio, or 'smoke' when
// none is checked.
function burnProfile() {
  var checked = document.querySelector('input[name="burn-profile"]:checked');
  if (!checked) return 'smoke';
  return checked.value;
}
|
||||
// Returns the numeric values of all checked, enabled GPU checkboxes in
// ascending order. Values that do not parse as integers are dropped.
function burnSelectedGPUIndices() {
  var indices = [];
  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(box) {
    if (!box.checked || box.disabled) return;
    var idx = parseInt(box.value, 10);
    if (!Number.isNaN(idx)) indices.push(idx);
  });
  indices.sort(function(a, b) { return a - b; });
  return indices;
}
|
||||
// Returns the value of the checked "burn-nvidia-mode" radio, or
// 'sequential' when none is checked.
function burnNvidiaMode() {
  var checked = document.querySelector('input[name="burn-nvidia-mode"]:checked');
  if (!checked) return 'sequential';
  return checked.value;
}
|
||||
// Enables or disables the multi-GPU run modes ('parallel', 'ramp-up')
// depending on gpuCount. With fewer than two GPUs those radios are disabled
// and dimmed, and a checked one falls back to 'sequential'. All other radios
// are enabled and un-dimmed.
function burnApplyMultiGPUState(gpuCount) {
  var tooFewGPUs = gpuCount < 2;
  document.querySelectorAll('input[name="burn-nvidia-mode"]').forEach(function(radio) {
    var needsSeveral = radio.value === 'parallel' || radio.value === 'ramp-up';
    var lock = tooFewGPUs && needsSeveral;
    radio.disabled = lock;
    if (lock && radio.checked) {
      var fallback = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
      if (fallback) fallback.checked = true;
    }
    var wrapper = radio.closest('label');
    if (wrapper) wrapper.style.opacity = lock ? '0.4' : '';
  });
}
|
||||
// Refreshes the note under the GPU list: either a prompt to select a GPU or
// the list of currently selected GPU indices.
function burnUpdateSelectionNote() {
  var noteEl = document.getElementById('burn-selection-note');
  var picked = burnSelectedGPUIndices();
  noteEl.textContent = picked.length
    ? 'Selected NVIDIA GPUs: ' + picked.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.'
    : 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
}
|
||||
// Renders one pre-checked checkbox row per GPU into #burn-gpu-list, then
// updates the multi-GPU mode radios and the selection note. With no GPUs an
// empty-state message is shown instead.
//
// gpus: array of objects read as {index, name, memory_mb}.
// Values taken from the response are HTML-escaped before being placed into
// innerHTML so that an unexpected name cannot inject markup.
function burnRenderGPUList(gpus) {
  const root = document.getElementById('burn-gpu-list');
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    burnUpdateSelectionNote();
    return;
  }
  const esc = function(v) {
    return String(v)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  };
  root.innerHTML = gpus.map(function(gpu) {
    // Memory size is appended only when a positive value is reported.
    const mem = gpu.memory_mb > 0 ? ' · ' + esc(gpu.memory_mb) + ' MiB' : '';
    return '<label class="burn-gpu-row">'
      + '<input class="burn-gpu-checkbox" type="checkbox" value="' + esc(gpu.index) + '" checked onchange="burnUpdateSelectionNote()">'
      + '<span><strong>GPU ' + esc(gpu.index) + '</strong> — ' + esc(gpu.name) + mem + '</span>'
      + '</label>';
  }).join('');
  burnApplyMultiGPUState(gpus.length);
  burnUpdateSelectionNote();
}
|
||||
// Checks every GPU checkbox and refreshes the selection note.
function burnSelectAll() {
  var boxes = document.querySelectorAll('.burn-gpu-checkbox');
  for (var i = 0; i < boxes.length; i++) {
    boxes[i].checked = true;
  }
  burnUpdateSelectionNote();
}
|
||||
// Unchecks every GPU checkbox and refreshes the selection note.
function burnSelectNone() {
  var boxes = document.querySelectorAll('.burn-gpu-checkbox');
  for (var i = 0; i < boxes.length; i++) {
    boxes[i].checked = false;
  }
  burnUpdateSelectionNote();
}
|
||||
// Fetches the NVIDIA GPU list from /api/gpu/nvidia and renders it. On a
// non-OK response the body's "error" field (or the HTTP status) becomes the
// error message. Any failure is shown inside #burn-gpu-list.
function burnLoadGPUs() {
  fetch('/api/gpu/nvidia').then(function(r) {
    return r.json().then(function(body) {
      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
      return body;
    });
  }).then(function(gpus) {
    burnRenderGPUList(gpus);
  }).catch(function(err) {
    // Build the message node with textContent so that text coming from the
    // server or the network stack is never parsed as HTML.
    const root = document.getElementById('burn-gpu-list');
    const msg = document.createElement('p');
    msg.style.cssText = 'color:var(--crit-fg);font-size:13px';
    msg.textContent = 'Error: ' + err.message;
    root.innerHTML = '';
    root.appendChild(msg);
    burnUpdateSelectionNote();
  });
}
|
||||
// POSTs a run request to /api/sat/<target>/run and resolves with the parsed
// JSON payload; rejects with the payload's "error" field (or the HTTP
// status) on a non-OK response.
//
// The request carries the current profile, the display name and any fields
// from extra. When useSelectedNvidia is truthy the selected GPU indices are
// added (rejecting if none are selected), plus stagger_gpu_start or
// parallel_gpus when the matching mode is chosen and more than one GPU is
// selected.
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
  var request = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
  if (useSelectedNvidia) {
    var gpuIndices = burnSelectedGPUIndices();
    if (gpuIndices.length === 0) {
      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
    }
    request.gpu_indices = gpuIndices;
    var several = gpuIndices.length > 1;
    switch (burnNvidiaMode()) {
      case 'ramp-up':
        if (several) request.stagger_gpu_start = true;
        break;
      case 'parallel':
        if (several) request.parallel_gpus = true;
        break;
    }
  }
  var options = {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(request)
  };
  return fetch('/api/sat/' + target + '/run', options).then(function(resp) {
    return resp.json().then(function(payload) {
      if (resp.ok) return payload;
      throw new Error(payload.error || ('HTTP ' + resp.status));
    });
  });
}
|
||||
// Streams the output of a single task into the output terminal, replacing
// any stream that is currently open.
//
// The stream ends on a 'done' event (its data, if any, is shown as an error)
// or on a connection error. Handlers are bound to their own EventSource so a
// late event can never close a newer stream stored in biES.
function streamTask(taskId, label) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const term = document.getElementById('bi-terminal');
  term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';

  const es = new EventSource('/api/tasks/' + taskId + '/stream');
  biES = es;
  const finish = function(text) {
    es.close();
    if (biES === es) biES = null;
    term.textContent += text + '\n';
    term.scrollTop = term.scrollHeight;
  };
  es.onmessage = function(e) {
    term.textContent += e.data + '\n';
    term.scrollTop = term.scrollHeight;
  };
  es.addEventListener('done', function(e) {
    finish(e.data ? '\nERROR: ' + e.data : '\nCompleted.');
  });
  // Without this handler the browser keeps reconnecting after a dropped
  // stream; close it and report the disconnect, as streamBurnTaskSet does.
  es.onerror = function() {
    finish('\nERROR: stream disconnected.');
  };
}
|
||||
// Single-task convenience wrapper around streamBurnTaskSet.
function streamBurnTask(taskId, label, resetTerminal) {
  var single = [taskId];
  return streamBurnTaskSet(single, label, resetTerminal);
}
|
||||
// Streams the given tasks into the output terminal one after another.
//
// Resolves with {ok, error}: ok is true when every task finished without
// error data and without a stream disconnect. An empty or non-array taskIds
// resolves immediately with ok:false. The terminal is cleared first only
// when resetTerminal is truthy.
function streamBurnTaskSet(taskIds, label, resetTerminal) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  var out = document.getElementById('bi-terminal');
  if (resetTerminal) out.textContent = '';

  var write = function(text) { out.textContent += text; };
  var toBottom = function() { out.scrollTop = out.scrollHeight; };

  if (!Array.isArray(taskIds) || !taskIds.length) {
    write('ERROR: no tasks queued.\n');
    return Promise.resolve({ok:false, error:'no tasks queued'});
  }

  // Streams one task and resolves with the updated failure count.
  function streamOne(pos, failedSoFar) {
    var id = taskIds[pos];
    write('[' + (pos + 1) + '/' + taskIds.length + '] Task ' + id + ' queued. Streaming...\n');
    return new Promise(function(settle) {
      biES = new EventSource('/api/tasks/' + id + '/stream');
      biES.onmessage = function(ev) {
        write(ev.data + '\n');
        toBottom();
      };
      biES.addEventListener('done', function(ev) {
        biES.close();
        biES = null;
        write((ev.data ? '\nERROR: ' + ev.data : '\nCompleted.') + '\n');
        toBottom();
        settle(failedSoFar + (ev.data ? 1 : 0));
      });
      biES.onerror = function() {
        if (biES) {
          biES.close();
          biES = null;
        }
        write('\nERROR: stream disconnected.\n');
        toBottom();
        settle(failedSoFar + 1);
      };
    });
  }

  // Walks the task list sequentially, threading the failure count through.
  function step(pos, failedSoFar) {
    if (pos >= taskIds.length) {
      return Promise.resolve({
        ok: failedSoFar === 0,
        error: failedSoFar ? (failedSoFar + ' task(s) failed') : ''
      });
    }
    return streamOne(pos, failedSoFar).then(function(nowFailed) {
      return step(pos + 1, nowFailed);
    });
  }

  return step(0, 0);
}
|
||||
// Runs the checked, enabled entries of tasks one after another: each is
// enqueued, then its output is streamed before the next one starts.
//
// tasks: array of {id, target, label, extra?, nvidia?}; id is the checkbox
//   element id that decides whether the entry runs.
// statusElId: optional id of an element that receives progress text.
//
// Returns undefined when nothing is selected, otherwise a promise that
// settles when the sequence ends. A failure stops the sequence and is
// reported exactly once, in the status element and in the terminal.
function runBurnTaskSet(tasks, statusElId) {
  const enabled = tasks.filter(function(t) {
    const el = document.getElementById(t.id);
    return el && el.checked && !el.disabled;
  });
  const status = statusElId ? document.getElementById(statusElId) : null;
  if (status) status.textContent = '';
  if (!enabled.length) {
    if (status) status.textContent = 'No tasks selected.';
    return;
  }
  const term = document.getElementById('bi-terminal');
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
  term.textContent = '';

  const runNext = function(idx) {
    if (idx >= enabled.length) {
      if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
      return Promise.resolve();
    }
    const t = enabled[idx];
    term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
    if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
    return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
      .then(function(d) {
        return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
      })
      .then(function() {
        return runNext(idx + 1);
      });
  };

  // The handler sits on the outermost promise only. Attaching it inside
  // runNext would make a failure at step k bubble through k+1 handlers and
  // print the same error k+1 times; re-rejecting would additionally surface
  // as an unhandled rejection because callers do not observe the promise.
  return runNext(0).catch(function(err) {
    const message = err && err.message ? err.message : String(err);
    if (status) status.textContent = 'Error: ' + message;
    document.getElementById('bi-output').style.display = 'block';
    term.textContent += 'ERROR: ' + message + '\n';
  });
}
|
||||
// Starts the "Platform Thermal Cycling" task for the component groups that
// have at least one checked, enabled test: 'cpu' for the Compute Stress
// checkboxes and 'gpu' for the GPU Max Load checkboxes.
//
// Progress and errors are written to #burn-all-status; a failed enqueue is
// reported there instead of being silently dropped.
function runPlatformStress() {
  const status = document.getElementById('burn-all-status');
  const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
  const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
  const hasChecked = function(ids) {
    return ids.some(function(id) {
      const el = document.getElementById(id);
      return el && el.checked && !el.disabled;
    });
  };

  const comps = [];
  if (hasChecked(computeIDs)) comps.push('cpu');
  if (hasChecked(gpuIDs)) comps.push('gpu');
  if (!comps.length) {
    if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
    return;
  }
  // Clear any message left over from an earlier attempt.
  if (status) status.textContent = '';

  enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', {platform_components: comps}, false)
    .then(function(d) {
      // Accept both response shapes (task_id or task_ids).
      const ids = burnTaskIDs(d);
      if (!ids.length) throw new Error('no tasks queued');
      streamTask(ids[0], 'Platform Thermal Cycling');
    })
    .catch(function(err) {
      if (status) status.textContent = 'Error: ' + (err && err.message ? err.message : String(err));
    });
}
|
||||
// Handler for "Burn one by one": passes every known burn task to
// runBurnTaskSet, which runs the checked ones sequentially and reports
// progress in #burn-all-status.
function runAllBurnTasks() {
  var statusEl = document.getElementById('burn-all-status');
  var gpuTasks = [
    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
    {id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
    {id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}
  ];
  var computeTasks = [
    {id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
    {id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
    {id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}
  ];
  statusEl.textContent = 'Enqueuing...';
  runBurnTaskSet(gpuTasks.concat(computeTasks), 'burn-all-status');
}
|
||||
// Page start-up, part 1: query which GPU stress tools are available. The
// checkbox of an available tool is enabled; for an unavailable one the
// reason is shown in its note element. Failures of this probe are ignored.
fetch('/api/gpu/tools').then(function(resp) {
  return resp.json();
}).then(function(tools) {
  var byTool = {
    'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
    'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
    'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
    'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'}
  };
  tools.forEach(function(tool) {
    var entry = byTool[tool.id];
    if (!entry) return;
    var box = document.getElementById(entry.cb);
    var noteEl = document.getElementById(entry.note);
    if (!box) return;
    if (tool.available) {
      box.disabled = false;
      return;
    }
    if (noteEl) noteEl.textContent = '— ' + entry.reason;
  });
}).catch(function() {});

// Page start-up, part 2: load the NVIDIA GPU list.
burnLoadGPUs();
|
||||
</script>`
|
||||
}
|
||||
434
audit/internal/webui/page_export_tools.go
Normal file
434
audit/internal/webui/page_export_tools.go
Normal file
@@ -0,0 +1,434 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func renderExport(exportDir string) string {
|
||||
entries, _ := listExportFiles(exportDir)
|
||||
var rows strings.Builder
|
||||
for _, e := range entries {
|
||||
rows.WriteString(fmt.Sprintf(`<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
|
||||
url.QueryEscape(e), html.EscapeString(e)))
|
||||
}
|
||||
if len(entries) == 0 {
|
||||
rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
|
||||
}
|
||||
return `<div class="grid2">
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||
</div></div>
|
||||
</div>
|
||||
|
||||
` + renderUSBExportCard()
|
||||
}
|
||||
|
||||
// listExportFiles returns the paths of all non-directory entries below
// exportDir, relative to exportDir and sorted lexically.
//
// Leading and trailing whitespace in exportDir is ignored. A directory that
// does not exist yields an empty result and a nil error; any other walk error
// is returned with a nil slice.
func listExportFiles(exportDir string) ([]string, error) {
	// Trim once and use the same root for walking and for computing relative
	// paths; mixing the trimmed and untrimmed values makes filepath.Rel fail
	// for every file when exportDir carries surrounding whitespace.
	root := strings.TrimSpace(exportDir)

	var entries []string
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		rel, err := filepath.Rel(root, path)
		if err != nil {
			return err
		}
		entries = append(entries, rel)
		return nil
	})
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}
	sort.Strings(entries)
	return entries, nil
}
|
||||
|
||||
// renderSupportBundleInline returns the HTML and script for the "Download
// Support Bundle" button.
//
// The script fetches /export/support.tar.gz, takes the file name from the
// Content-Disposition header when present (default "bee-support.tar.gz"),
// and triggers a browser download of the received blob. While the request is
// running the button is disabled; afterwards it is re-enabled and its label
// is restored to the initial "↓ Download Support Bundle" (U+2193).
func renderSupportBundleInline() string {
	return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
<script>
window.supportBundleDownload = function() {
  var btn = document.getElementById('support-bundle-btn');
  var status = document.getElementById('support-bundle-status');
  btn.disabled = true;
  btn.textContent = 'Building...';
  status.textContent = 'Collecting logs and export data\u2026';
  status.style.color = 'var(--muted)';
  var filename = 'bee-support.tar.gz';
  fetch('/export/support.tar.gz')
    .then(function(r) {
      if (!r.ok) throw new Error('HTTP ' + r.status);
      var cd = r.headers.get('Content-Disposition') || '';
      var m = cd.match(/filename="?([^";]+)"?/);
      if (m) filename = m[1];
      return r.blob();
    })
    .then(function(blob) {
      var url = URL.createObjectURL(blob);
      var a = document.createElement('a');
      a.href = url;
      a.download = filename;
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);
      URL.revokeObjectURL(url);
      status.textContent = 'Download started.';
      status.style.color = 'var(--ok-fg)';
    })
    .catch(function(e) {
      status.textContent = 'Error: ' + e.message;
      status.style.color = 'var(--crit-fg)';
    })
    .finally(function() {
      btn.disabled = false;
      btn.textContent = '\u2193 Download Support Bundle';
    });
};
</script>`
}
|
||||
|
||||
func renderUSBExportCard() string {
|
||||
return `<div class="card" style="margin-top:16px">
|
||||
<div class="card-head">Export to USB
|
||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
</div>
|
||||
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||
</div>`
|
||||
}
|
||||
|
||||
// renderUSBExportInline returns the HTML fragment and inline script for the
// "Export to USB" widget.
//
// The script lists targets from GET /api/export/usb, renders one table row per
// target and, on button click, POSTs the selected target object as JSON to
// /api/export/usb/<type> where <type> is "audit" or "bundle". It publishes
// window.usbRefresh and window.usbExport for use by onclick handlers and keeps
// the last listing in window._usbTargets.
//
// Every device-supplied field is HTML-escaped before it is written through
// innerHTML: values such as a filesystem label are read from removable media
// and must not be interpreted as markup.
func renderUSBExportInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
<div id="usb-targets" style="margin-top:12px"></div>
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
<script>
(function(){
  // Escape a value for safe inclusion in HTML text or a quoted attribute.
  function esc(v) {
    return String(v == null ? '' : v)
      .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
      .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
  }
  // Avoid "Error: Error: ..." when the rejection value is an Error object.
  function errText(e) {
    return (e && e.message) ? e.message : String(e);
  }
  function usbRefresh() {
    const st = document.getElementById('usb-status');
    const ct = document.getElementById('usb-targets');
    st.textContent = 'Scanning...';
    ct.innerHTML = '';
    document.getElementById('usb-msg').textContent = '';
    fetch('/api/export/usb').then(r => {
      if (!r.ok) throw new Error('HTTP ' + r.status);
      return r.json();
    }).then(data => {
      // Anything that is not an array is treated as "no devices" instead of
      // blowing up in map() below.
      const targets = Array.isArray(data) ? data : [];
      window._usbTargets = targets;
      if (targets.length === 0) {
        st.textContent = 'No removable USB devices found.';
        return;
      }
      st.textContent = targets.length + ' device(s) found:';
      ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
        targets.map((t, idx) => {
          t = t || {};
          return '<tr>' +
            '<td style="font-family:monospace">'+esc(t.device)+'</td>' +
            '<td>'+esc(t.fs_type)+'</td>' +
            '<td>'+esc(t.size)+'</td>' +
            '<td>'+esc(t.label)+'</td>' +
            '<td style="font-size:12px;color:var(--muted)">'+esc(t.model)+'</td>' +
            '<td style="white-space:nowrap">' +
            '<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
            '<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
            '<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
            '</td></tr>';
        }).join('') + '</table>';
    }).catch(e => {
      st.textContent = 'Error: ' + errText(e);
    });
  }
  window.usbExport = function(type, targetIndex, btn) {
    const msg = document.getElementById('usb-msg');
    const target = (window._usbTargets || [])[targetIndex];
    if (!target) {
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: USB target not found. Refresh and try again.';
      return;
    }
    const row = btn ? btn.closest('td') : null;
    const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
    const originalText = btn ? btn.textContent : '';
    if (btn) {
      btn.disabled = true;
      btn.textContent = 'Exporting...';
    }
    if (rowMsg) {
      rowMsg.style.color = 'var(--muted)';
      rowMsg.textContent = 'Working...';
    }
    msg.style.color = 'var(--muted)';
    msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
    fetch('/api/export/usb/'+type, {
      method: 'POST',
      headers: {'Content-Type':'application/json'},
      body: JSON.stringify(target)
    }).then(async r => {
      let d;
      try {
        d = await r.json();
      } catch (err) {
        // A non-JSON error body must not hide the HTTP status.
        if (r.ok) throw err;
        d = {};
      }
      if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
      return d;
    }).then(d => {
      const text = d.message || 'Done.';
      msg.style.color = 'var(--ok,green)';
      msg.textContent = text;
      if (rowMsg) {
        rowMsg.style.color = 'var(--ok,green)';
        rowMsg.textContent = text;
      }
    }).catch(e => {
      const text = 'Error: ' + errText(e);
      msg.style.color = 'var(--err,red)';
      msg.textContent = text;
      if (rowMsg) {
        rowMsg.style.color = 'var(--err,red)';
        rowMsg.textContent = text;
      }
    }).finally(() => {
      if (btn) {
        btn.disabled = false;
        btn.textContent = originalText;
      }
    });
  };
  window.usbRefresh = usbRefresh;
  usbRefresh();
})();
</script>`
}
|
||||
|
||||
// renderNvidiaSelfHealInline returns the HTML fragment and inline script for
// the "NVIDIA Self Heal" widget.
//
// The script defines four page-global functions used by onclick handlers:
//
//   - loadNvidiaSelfHeal fetches GET /api/gpu/nvidia-status and renders one
//     table row per GPU with a "Reset GPU" button.
//   - nvidiaRestartDrivers POSTs {name:"bee-nvidia", action:"restart"} to
//     /api/services/action.
//   - nvidiaResetGPU POSTs {index: <index>} to /api/gpu/nvidia-reset.
//   - nvidiaSelfHealShowResult shows a label, a status marker and the command
//     output in the terminal pane. Status "running" is shown as in progress,
//     "ok" as done, and anything else as failed.
//
// Values taken from the status response are HTML-escaped before being written
// through innerHTML.
func renderNvidiaSelfHealInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
</div>
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
</div>
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
// Escape a value for safe inclusion in HTML text or a quoted attribute.
function nvidiaEsc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}
function nvidiaSelfHealShowResult(label, status, output) {
  var out = document.getElementById('nvidia-self-heal-out');
  var term = document.getElementById('nvidia-self-heal-terminal');
  var statusEl = document.getElementById('nvidia-self-heal-out-status');
  var labelEl = document.getElementById('nvidia-self-heal-out-label');
  out.style.display = 'block';
  labelEl.textContent = label;
  term.textContent = output || '(no output)';
  term.scrollTop = term.scrollHeight;
  if (status === 'running') {
    // In-flight request: do not claim success before the server answered.
    statusEl.textContent = '… running';
    statusEl.style.color = 'var(--muted)';
  } else if (status === 'ok') {
    statusEl.textContent = '✓ done';
    statusEl.style.color = 'var(--ok-fg, #2c662d)';
  } else {
    statusEl.textContent = '✗ failed';
    statusEl.style.color = 'var(--crit-fg, #9f3a38)';
  }
}
function nvidiaRestartDrivers() {
  var btn = document.getElementById('nvidia-restart-btn');
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Restarting...';
  nvidiaSelfHealShowResult('restart bee-nvidia', 'running', 'Running...');
  fetch('/api/services/action', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({name:'bee-nvidia', action:'restart'})
  }).then(r=>r.json()).then(d => {
    nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(function() {
      // loadServices belongs to the services widget, which may not be on this
      // page; its absence must not prevent the GPU table refresh.
      if (typeof loadServices === 'function') loadServices();
      loadNvidiaSelfHeal();
    }, 800);
  }).catch(e => {
    nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
function nvidiaResetGPU(index, btn) {
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Resetting...';
  nvidiaSelfHealShowResult('reset gpu ' + index, 'running', 'Running...');
  fetch('/api/gpu/nvidia-reset', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({index:index})
  }).then(r=>r.json()).then(d => {
    nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(loadNvidiaSelfHeal, 1000);
  }).catch(e => {
    nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
function loadNvidiaSelfHeal() {
  var status = document.getElementById('nvidia-self-heal-status');
  var table = document.getElementById('nvidia-self-heal-table');
  status.textContent = 'Loading NVIDIA GPU status...';
  status.style.color = 'var(--muted)';
  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
  fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
    if (!Array.isArray(gpus) || gpus.length === 0) {
      status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
      table.innerHTML = '';
      return;
    }
    status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
    const rows = gpus.map(g => {
      const serial = g.serial || '';
      const bdf = g.bdf || '';
      const id = serial || bdf || ('gpu-' + g.index);
      const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
      const details = [];
      if (serial) details.push('serial ' + serial);
      if (bdf) details.push('bdf ' + bdf);
      if (g.parse_failure && g.raw_line) details.push(g.raw_line);
      // JSON.stringify keeps a numeric index as-is and quotes anything else,
      // so the generated handler is always a valid JS call.
      const indexArg = nvidiaEsc(JSON.stringify(g.index === undefined ? null : g.index));
      return '<tr>'
        + '<td style="white-space:nowrap">' + nvidiaEsc(g.index) + '</td>'
        + '<td>' + nvidiaEsc(g.name || 'unknown') + '</td>'
        + '<td style="font-family:monospace">' + nvidiaEsc(id) + '</td>'
        + '<td><span class="badge ' + badge + '">' + nvidiaEsc(g.status || 'UNKNOWN') + '</span>'
        + (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + nvidiaEsc(details.join(' | ')) + '</div>' : '')
        + '</td>'
        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + indexArg + ', this)">Reset GPU</button></td>'
        + '</tr>';
    }).join('');
    table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
  }).catch(e => {
    status.textContent = 'Error loading NVIDIA GPU status: ' + e;
    status.style.color = 'var(--crit-fg, #9f3a38)';
    table.innerHTML = '';
  });
}
loadNvidiaSelfHeal();
</script>`
}
|
||||
|
||||
func renderTools() string {
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let kind = d.kind || 'unknown';
|
||||
let label = source;
|
||||
if (kind === 'ram') label = 'RAM';
|
||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.message || 'Checking...';
|
||||
if (d.status === 'ok' || d.in_ram) {
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
} else if (d.status === 'failed') {
|
||||
txt.style.color = 'var(--err, #b91c1c)';
|
||||
} else {
|
||||
txt.style.color = 'var(--muted)';
|
||||
}
|
||||
if (d.can_start_task) {
|
||||
btn.style.display = '';
|
||||
btn.disabled = false;
|
||||
} else {
|
||||
btn.style.display = 'none';
|
||||
}
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML =
|
||||
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
});
|
||||
}
|
||||
checkTools();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderExportIndex(exportDir string) (string, error) {
|
||||
entries, err := listExportFiles(exportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var body strings.Builder
|
||||
body.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
|
||||
body.WriteString(`<h1>Bee Export Files</h1><ul>`)
|
||||
for _, entry := range entries {
|
||||
body.WriteString(`<li><a href="/export/file?path=` + url.QueryEscape(entry) + `">` + html.EscapeString(entry) + `</a></li>`)
|
||||
}
|
||||
if len(entries) == 0 {
|
||||
body.WriteString(`<li>No export files found.</li>`)
|
||||
}
|
||||
body.WriteString(`</ul></body></html>`)
|
||||
return body.String(), nil
|
||||
}
|
||||
314
audit/internal/webui/page_install_tasks.go
Normal file
314
audit/internal/webui/page_install_tasks.go
Normal file
@@ -0,0 +1,314 @@
|
||||
package webui
|
||||
|
||||
// renderInstallInline returns the HTML, CSS and inline script of the
// "Install to Disk" workflow.
//
// The script works in three steps:
//
//  1. installRefreshDisks lists candidate disks from GET /api/install/disks
//     and renders one selectable table row per disk, with its warnings shown
//     as badges.
//  2. installSelectDisk shows a confirmation box; the start button is enabled
//     by installCheckConfirm only once the typed text equals the selected
//     device name.
//  3. installStart POSTs {device: <device>} to /api/install/run and, given a
//     task id, installStreamLog follows /api/tasks/<id>/stream with an
//     EventSource. A "done" event with empty data means success and offers a
//     reboot button; non-empty data is shown as the failure reason.
//
// Disk-supplied strings (device, model, size, warnings) are HTML-escaped
// before being written through innerHTML, including inside the title
// attribute of warning badges.
func renderInstallInline() string {
	return `
<div class="alert alert-warn" style="margin-bottom:16px">
<strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
disk and write the live system onto it. All existing data on the target disk will be lost.
This operation cannot be undone.
</div>
<div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
<div id="install-disk-section" style="display:none">
<div class="card" style="margin-bottom:0">
<table id="install-disk-table">
<thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
<tbody id="install-disk-tbody"></tbody>
</table>
</div>
<div style="margin-top:12px">
<button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
</div>
</div>
<div id="install-confirm-section" style="display:none;margin-top:20px">
<div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
<div class="form-row" style="max-width:360px">
<label>Type the device name to confirm (e.g. /dev/sda)</label>
<input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
</div>
<button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
<button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
</div>
<div id="install-progress-section" style="display:none;margin-top:20px">
<div class="card-head" style="margin-bottom:8px">Installation Progress</div>
<div id="install-terminal" class="terminal" style="max-height:500px"></div>
<div id="install-status" style="margin-top:12px;font-size:13px"></div>
</div>

<style>
#install-disk-tbody tr{cursor:pointer}
#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
</style>

<script>
var _installSelected = null;

// Escape a value for safe inclusion in HTML text or a quoted attribute.
function installEsc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}

function installRefreshDisks() {
  document.getElementById('install-loading').style.display = '';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-confirm-section').style.display = 'none';
  _installSelected = null;
  fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
    document.getElementById('install-loading').style.display = 'none';
    var tbody = document.getElementById('install-disk-tbody');
    tbody.innerHTML = '';
    if (!disks || disks.length === 0) {
      tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
    } else {
      disks.forEach(function(d) {
        var warnings = (d.warnings || []);
        var statusHtml;
        if (warnings.length === 0) {
          statusHtml = '<span class="badge badge-ok">OK</span>';
        } else {
          var hasSmall = warnings.some(function(w){ return String(w).indexOf('too small') >= 0; });
          statusHtml = warnings.map(function(w){
            var text = String(w);
            var cls = hasSmall ? 'badge-err' : 'badge-warn';
            return '<span class="badge ' + cls + '" title="' + installEsc(text) + '">' +
              installEsc(text.length > 40 ? text.substring(0,38)+'…' : text) + '</span>';
          }).join(' ');
        }
        var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
          ? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
        var tr = document.createElement('tr');
        tr.dataset.device = d.device;
        tr.dataset.model = d.model || 'Unknown';
        tr.dataset.size = d.size;
        tr.dataset.warnings = JSON.stringify(warnings);
        tr.innerHTML =
          '<td><input type="radio" name="install-disk" value="' + installEsc(d.device) + '"></td>' +
          '<td><code>' + installEsc(d.device) + '</code>' + mountedNote + '</td>' +
          '<td>' + installEsc(d.model || '—') + '</td>' +
          '<td>' + installEsc(d.size) + '</td>' +
          '<td>' + statusHtml + '</td>';
        tr.addEventListener('click', function(){ installSelectDisk(this); });
        tbody.appendChild(tr);
      });
    }
    document.getElementById('install-disk-section').style.display = '';
  }).catch(function(e){
    document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
  });
}

function installSelectDisk(tr) {
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  tr.classList.add('selected');
  var radio = tr.querySelector('input[type=radio]');
  if (radio) radio.checked = true;
  _installSelected = {
    device: tr.dataset.device,
    model: tr.dataset.model,
    size: tr.dataset.size,
    warnings: JSON.parse(tr.dataset.warnings || '[]')
  };
  var warnBox = document.getElementById('install-confirm-warn');
  var warnLines = '<strong>⚠ DANGER:</strong> ' + installEsc(_installSelected.device) +
    ' (' + installEsc(_installSelected.model) + ', ' + installEsc(_installSelected.size) + ')' +
    ' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
  if (_installSelected.warnings.length > 0) {
    warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + installEsc(w); }).join('<br>');
  }
  warnBox.innerHTML = warnLines;
  document.getElementById('install-confirm-input').value = '';
  document.getElementById('install-start-btn').disabled = true;
  document.getElementById('install-confirm-section').style.display = '';
  document.getElementById('install-progress-section').style.display = 'none';
}

function installDeselect() {
  _installSelected = null;
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
  document.getElementById('install-confirm-section').style.display = 'none';
}

function installCheckConfirm() {
  var val = document.getElementById('install-confirm-input').value.trim();
  var ok = _installSelected && val === _installSelected.device;
  document.getElementById('install-start-btn').disabled = !ok;
}

function installStart() {
  if (!_installSelected) return;
  document.getElementById('install-confirm-section').style.display = 'none';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-loading').style.display = 'none';
  var prog = document.getElementById('install-progress-section');
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  prog.style.display = '';
  term.textContent = '';
  status.textContent = 'Starting installation…';
  status.style.color = 'var(--muted)';

  fetch('/api/install/run', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({device: _installSelected.device})
  }).then(function(r){
    return r.json().then(function(j){
      if (!r.ok) throw new Error(j.error || r.statusText);
      return j;
    });
  }).then(function(j){
    if (!j.task_id) throw new Error('missing task id');
    installStreamLog(j.task_id);
  }).catch(function(e){
    status.textContent = 'Error: ' + e;
    status.style.color = 'var(--crit-fg)';
  });
}

function installStreamLog(taskId) {
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  var es = new EventSource('/api/tasks/' + taskId + '/stream');
  es.onmessage = function(e) {
    term.textContent += e.data + '\n';
    term.scrollTop = term.scrollHeight;
  };
  es.addEventListener('done', function(e) {
    es.close();
    if (!e.data) {
      status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
      var rebootBtn = document.createElement('button');
      rebootBtn.className = 'btn btn-primary btn-sm';
      rebootBtn.style.marginLeft = '12px';
      rebootBtn.textContent = 'Reboot now';
      rebootBtn.onclick = function(){
        fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
          body: JSON.stringify({name:'', action:'reboot'})});
      };
      status.appendChild(rebootBtn);
    } else {
      status.textContent = '✗ Installation failed: ' + e.data;
      status.style.color = 'var(--crit-fg)';
    }
  });
  es.onerror = function() {
    es.close();
    status.textContent = '✗ Stream disconnected.';
    status.style.color = 'var(--crit-fg)';
  };
}

installRefreshDisks();
</script>
`
}
|
||||
|
||||
func renderInstall() string {
|
||||
return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
|
||||
renderInstallInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
// renderTasks returns the body of the Tasks page: a toolbar with "Cancel All"
// and "Kill Workers" buttons, and a paginated task table (50 rows per page)
// that the inline script reloads from GET /api/tasks every 2 seconds.
//
// Per-row actions call POST /api/tasks/<id>/cancel and
// POST /api/tasks/<id>/priority with a JSON {delta: ±1}; the toolbar calls
// POST /api/tasks/cancel-all and POST /api/tasks/kill-workers.
//
// All task fields are HTML-escaped before being written through innerHTML, and
// task ids are JSON-quoted inside onclick handlers and URL-encoded inside
// request paths, so an unusual id or name cannot break the generated markup.
func renderTasks() string {
	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
</div>
<div class="card">
<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
</div>
<script>
var _taskRefreshTimer = null;
var _tasksAll = [];
var _taskPage = 1;
var _taskPageSize = 50;

function loadTasks() {
  fetch('/api/tasks').then(r=>r.json()).then(tasks => {
    _tasksAll = Array.isArray(tasks) ? tasks : [];
    if (_tasksAll.length === 0) {
      _taskPage = 1;
      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
      return;
    }
    const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
    if (_taskPage > totalPages) _taskPage = totalPages;
    if (_taskPage < 1) _taskPage = 1;
    const start = (_taskPage - 1) * _taskPageSize;
    const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
    const rows = pageTasks.map(t => {
      const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
      const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
      const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||escHtml(t.status);
      const href = '/tasks/'+encodeURIComponent(t.id);
      // JSON-quote the id as a JS string literal, then escape it for the
      // double-quoted onclick attribute.
      const idArg = escHtml(JSON.stringify(String(t.id)));
      let actions = '<a class="btn btn-sm btn-secondary" href="'+href+'">Open</a>';
      if (t.status === 'running' || t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-danger" onclick="cancelTask('+idArg+')">Cancel</button>';
      }
      if (t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority('+idArg+',1)" title="Increase priority">⇧</button>';
        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority('+idArg+',-1)" title="Decrease priority">⇩</button>';
      }
      return '<tr><td><a href="'+href+'">'+escHtml(t.name)+'</a></td>' +
        '<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
        '<td style="font-size:12px;color:var(--muted)">'+escHtml(fmtTime(t.created_at))+'</td>' +
        '<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
        '<td>'+escHtml(t.priority)+'</td>' +
        '<td>'+actions+'</td></tr>';
    }).join('');
    const showingFrom = start + 1;
    const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
    const pager =
      '<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
      '<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
      '<div style="display:flex;align-items:center;gap:8px">' +
      '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
      '<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
      '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
      '</div>' +
      '</div>';
    document.getElementById('tasks-table').innerHTML =
      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
  }).catch(e => {
    // Polled every 2 seconds: report the failure instead of raising an
    // unhandled rejection on each tick. The next successful poll redraws.
    document.getElementById('tasks-table').innerHTML =
      '<p style="color:var(--muted);font-size:13px;padding:16px">Failed to load tasks: '+escHtml(e && e.message ? e.message : e)+'</p>';
  });
}

function escHtml(s) {
  return String(s == null ? '' : s)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
}
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
function formatDurSec(sec) {
  sec = Math.max(0, Math.round(sec||0));
  if (sec < 60) return sec+'s';
  const m = Math.floor(sec/60), ss = sec%60;
  return m+'m '+ss+'s';
}
function setTaskPage(page) {
  const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
  _taskPage = Math.min(totalPages, Math.max(1, page));
  loadTasks();
}

function cancelTask(id) {
  fetch('/api/tasks/'+encodeURIComponent(id)+'/cancel',{method:'POST'}).then(()=>loadTasks());
}
function cancelAll() {
  fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
}
function killWorkers() {
  if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
  var toast = document.getElementById('kill-toast');
  function showToast(text) {
    toast.textContent = text;
    toast.style.display = '';
    setTimeout(()=>{ toast.style.display='none'; }, 5000);
  }
  fetch('/api/tasks/kill-workers',{method:'POST'})
    .then(r=>r.json())
    .then(d=>{
      loadTasks();
      var parts = [];
      if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
      if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
      showToast(parts.length ? parts.join(', ')+'.' : 'No processes found.');
    })
    .catch(e=>{
      showToast('Kill request failed: '+(e && e.message ? e.message : e));
    });
}
function setPriority(id, delta) {
  fetch('/api/tasks/'+encodeURIComponent(id)+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
    .then(()=>loadTasks());
}

loadTasks();
_taskRefreshTimer = setInterval(loadTasks, 2000);
</script>`
}
|
||||
238
audit/internal/webui/page_metrics.go
Normal file
238
audit/internal/webui/page_metrics.go
Normal file
@@ -0,0 +1,238 @@
|
||||
package webui
|
||||
|
||||
func renderMetrics() string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Server — Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Temperature — CPU</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Temperature — Ambient Sensors</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Server — Power</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
|
||||
<div class="card-head">Server — Fan RPM</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
|
||||
<div>
|
||||
<div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
|
||||
<div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
|
||||
</div>
|
||||
<label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
|
||||
<input id="gpu-chart-toggle" type="checkbox">
|
||||
<span>One chart per GPU</span>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<div id="gpu-metrics-by-metric">
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Compute Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Memory Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Core Clock</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Power</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Temperature</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="gpu-metrics-by-gpu" style="display:none"></div>
|
||||
</section>
|
||||
|
||||
<script>
|
||||
let gpuChartKey = '';
|
||||
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
|
||||
let metricsNvidiaGPUsPromise = null;
|
||||
|
||||
// loadMetricsNvidiaGPUs fetches /api/gpu/nvidia once and shares the resulting
// promise between callers. The promise always resolves to an array; a failed
// request resolves to [].
function loadMetricsNvidiaGPUs() {
  if (!metricsNvidiaGPUsPromise) {
    const pending = fetch('/api/gpu/nvidia')
      .then(function(r) {
        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
        return r.json();
      })
      .then(function(list) { return Array.isArray(list) ? list : []; })
      .catch(function() {
        // Drop the cached promise on failure so a later call retries;
        // otherwise one transient error would hide GPU names until reload.
        if (metricsNvidiaGPUsPromise === pending) metricsNvidiaGPUsPromise = null;
        return [];
      });
    metricsNvidiaGPUsPromise = pending;
  }
  return metricsNvidiaGPUsPromise;
}
|
||||
|
||||
// metricsGPUNameMap builds an {index: name} lookup from a GPU list.
// Entries whose index is not a finite number or whose name is empty are skipped.
function metricsGPUNameMap(list) {
  return (list || []).reduce(function(names, gpu) {
    const idx = Number(gpu.index);
    if (Number.isFinite(idx) && gpu.name) {
      names[idx] = gpu.name;
    }
    return names;
  }, {});
}
|
||||
|
||||
// metricsGPUDisplayLabel returns "GPU <idx>", with " — <name>" appended when
// the names lookup has a non-empty entry for idx.
function metricsGPUDisplayLabel(idx, names) {
  const base = 'GPU ' + idx;
  const name = names ? names[idx] : names;
  if (!name) return base;
  return base + ' — ' + name;
}
|
||||
|
||||
// loadGPUChartModePreference reports whether the stored chart mode is
// 'per-gpu'. Any error while reading sessionStorage yields false.
function loadGPUChartModePreference() {
  let stored = null;
  try {
    stored = sessionStorage.getItem(gpuChartModeStorageKey);
  } catch (_) {
    return false;
  }
  return stored === 'per-gpu';
}
|
||||
|
||||
// saveGPUChartModePreference stores 'per-gpu' or 'per-metric' in
// sessionStorage. Storage errors are ignored: the preference is then simply
// not persisted.
function saveGPUChartModePreference(perGPU) {
  const mode = perGPU ? 'per-gpu' : 'per-metric';
  try {
    sessionStorage.setItem(gpuChartModeStorageKey, mode);
  } catch (_) {
    // best effort only
  }
}
|
||||
|
||||
// refreshChartImage reloads one chart <img> with a cache-busting "?t=" query.
// The new URL is first loaded into a detached Image and only swapped into the
// visible element once it has loaded, so a failed fetch leaves the old chart.
// data-loading guards against overlapping refreshes of the same element.
function refreshChartImage(el) {
  if (!el) return;
  if (el.dataset.loading === '1') return;
  // Skip elements without an offsetParent (e.g. inside a display:none card).
  if (el.offsetParent === null) return;

  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
  const nextSrc = baseSrc + '?t=' + Date.now();
  el.dataset.baseSrc = baseSrc;
  el.dataset.loading = '1';

  const finish = function() { el.dataset.loading = '0'; };
  const probe = new Image();
  probe.onload = function() {
    el.src = nextSrc;
    finish();
  };
  probe.onerror = function() { finish(); };
  probe.src = nextSrc;
}
|
||||
|
||||
// refreshCharts refreshes every <img> marked with data-chart-refresh="1".
function refreshCharts() {
  const charts = document.querySelectorAll('img[data-chart-refresh="1"]');
  charts.forEach(function(img, i, all) { refreshChartImage(img, i, all); });
}
|
||||
|
||||
// gpuIndices extracts the distinct finite numeric "index" values from rows
// and returns them sorted ascending. A missing rows argument yields [].
function gpuIndices(rows) {
  const seen = {};
  return (rows || [])
    .map(function(row) { return Number(row.index); })
    .filter(function(idx) {
      if (!Number.isFinite(idx) || seen[idx]) return false;
      seen[idx] = true;
      return true;
    })
    .sort(function(a, b) { return a - b; });
}
|
||||
|
||||
// renderGPUOverviewCards replaces the contents of #gpu-metrics-by-gpu with one
// overview chart card per GPU index. The label comes from
// metricsGPUDisplayLabel(idx, names) and is HTML-escaped, because the GPU name
// is API-provided text that ends up in innerHTML and in an attribute value.
function renderGPUOverviewCards(indices, names) {
  const host = document.getElementById('gpu-metrics-by-gpu');
  if (!host) return;

  function esc(v) {
    return String(v == null ? '' : v)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  host.innerHTML = (indices || []).map(function(idx) {
    const label = esc(metricsGPUDisplayLabel(idx, names));
    const id = esc(idx);
    return '<div class="card" style="margin-bottom:16px">' +
      '<div class="card-head">' + label + ' — Overview</div>' +
      '<div class="card-body" style="padding:8px">' +
      '<img id="chart-gpu-' + id + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + id + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
      '</div></div>';
  }).join('');
}
|
||||
|
||||
// applyGPUChartMode shows either the per-metric or the per-GPU chart container
// depending on the state of the #gpu-chart-toggle checkbox. A missing toggle
// counts as unchecked (per-metric view).
function applyGPUChartMode() {
  const toggle = document.getElementById('gpu-chart-toggle');
  const showPerGPU = Boolean(toggle && toggle.checked);

  const byMetric = document.getElementById('gpu-metrics-by-metric');
  if (byMetric) {
    byMetric.style.display = showPerGPU ? 'none' : '';
  }

  const byGPU = document.getElementById('gpu-metrics-by-gpu');
  if (byGPU) {
    byGPU.style.display = showPerGPU ? '' : 'none';
  }
}
|
||||
|
||||
// syncMetricsLayout adapts the page to the latest metrics sample d:
//  - shows the fan card only when d.fans is non-empty;
//  - shows the GPU section and summary text according to the GPU indices in d.gpus;
//  - rebuilds the per-GPU cards only when the set of indices/names changed
//    (tracked in gpuChartKey), then re-applies the chart mode.
// GPU names are resolved asynchronously via loadMetricsNvidiaGPUs().
function syncMetricsLayout(d) {
  const fanCard = document.getElementById('card-server-fans');
  if (fanCard) {
    const hasFans = d.fans && d.fans.length > 0;
    fanCard.style.display = hasFans ? '' : 'none';
  }

  const section = document.getElementById('gpu-metrics-section');
  const summary = document.getElementById('gpu-metrics-summary');
  const indices = gpuIndices(d.gpus);
  const hasGPUs = indices.length > 0;

  loadMetricsNvidiaGPUs().then(function(gpus) {
    const names = metricsGPUNameMap(gpus);

    if (section) {
      section.style.display = hasGPUs ? '' : 'none';
    }
    if (summary) {
      if (hasGPUs) {
        const labels = indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); });
        summary.textContent = 'Detected GPUs: ' + labels.join(', ');
      } else {
        summary.textContent = 'No GPUs detected in live metrics.';
      }
    }

    // The key changes whenever an index or a resolved name changes.
    const nameKey = indices.map(function(idx) { return names[idx] || ''; }).join(',');
    const nextKey = indices.join(',') + '|' + nameKey;
    if (nextKey !== gpuChartKey) {
      renderGPUOverviewCards(indices, names);
      gpuChartKey = nextKey;
    }
    applyGPUChartMode();
  });
}
|
||||
|
||||
// loadMetricsLayout fetches the latest metrics sample and hands it to
// syncMetricsLayout. Failures are ignored; the caller polls again later.
function loadMetricsLayout() {
  fetch('/api/metrics/latest')
    .then(function(resp) { return resp.json(); })
    .then(function(sample) { return syncMetricsLayout(sample); })
    .catch(function() {});
}
|
||||
|
||||
// Page start-up: restore the saved chart mode into the toggle, persist and
// apply later changes, then start the periodic refreshes.
// NOTE(review): the page intro text says "updated every 2 seconds" while the
// charts are refreshed every 3000 ms here — confirm which one is intended.
const gpuChartToggle = document.getElementById('gpu-chart-toggle');
if (gpuChartToggle) {
  gpuChartToggle.checked = loadGPUChartModePreference();
  gpuChartToggle.addEventListener('change', function() {
    saveGPUChartModePreference(!!gpuChartToggle.checked);
    applyGPUChartMode();
    refreshCharts();
  });
}
applyGPUChartMode();

loadMetricsLayout();
setInterval(refreshCharts, 3000);
setInterval(loadMetricsLayout, 5000);
|
||||
</script>`
|
||||
}
|
||||
213
audit/internal/webui/page_network_services.go
Normal file
213
audit/internal/webui/page_network_services.go
Normal file
@@ -0,0 +1,213 @@
|
||||
package webui
|
||||
|
||||
import "html"
|
||||
|
||||
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
|
||||
func renderNetworkInline() string {
|
||||
return `<div id="net-pending" style="display:none" class="alert alert-warn">
|
||||
<strong>⚠ Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
|
||||
<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
|
||||
<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
|
||||
</div>
|
||||
<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div class="grid2" style="margin-top:16px">
|
||||
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
|
||||
<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
|
||||
<button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
|
||||
<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||
</div>
|
||||
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
|
||||
<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
|
||||
<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
|
||||
<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
|
||||
<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
|
||||
<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
|
||||
<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
|
||||
<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
var _netCountdownTimer = null;
|
||||
var _netRefreshTimer = null;
|
||||
const NET_ROLLBACK_SECS = 60;
|
||||
// loadNetwork fetches /api/network and renders the interface table, the
// default route line and the pending-change banner.
// Interface names, states, addresses and the route are escaped before they
// reach innerHTML. The click handlers read their arguments from data-
// attributes, so a name with quotes cannot break out of the inline handler.
// Fetch or parse failures are ignored; the periodic refresh retries.
function loadNetwork() {
  function esc(v) {
    return String(v == null ? '' : v)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  fetch('/api/network').then(function(r) { return r.json(); }).then(function(d) {
    const rows = (d.interfaces || []).map(function(i) {
      const name = esc(i.Name);
      const state = esc(i.State);
      const badge = i.State === 'up' ? 'badge-ok' : 'badge-warn';
      return '<tr><td style="cursor:pointer" data-iface="' + name + '" onclick="selectIface(this.dataset.iface)" title="Use this interface in the forms below"><span style="text-decoration:underline">' + name + '</span></td>' +
        '<td style="cursor:pointer" data-iface="' + name + '" data-state="' + state + '" onclick="toggleIface(this.dataset.iface,this.dataset.state)" title="Click to toggle"><span class="badge ' + badge + '">' + state + '</span></td>' +
        '<td>' + esc((i.IPv4 || []).join(', ')) + '</td></tr>';
    }).join('');

    document.getElementById('iface-table').innerHTML =
      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>' + rows + '</table>' +
      (d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: ' + esc(d.default_route) + '</p>' : '');

    if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    else hideNetPending();
  }).catch(function() {});
}
|
||||
// selectIface copies an interface name into both the DHCP and the static
// IPv4 form fields.
function selectIface(iface) {
  const dhcpInput = document.getElementById('dhcp-iface');
  dhcpInput.value = iface;
  const staticInput = document.getElementById('st-iface');
  staticInput.value = iface;
}
|
||||
// toggleIface posts a toggle request for one interface. The pending banner is
// shown immediately, then re-synced from the response (rollback_in) or hidden
// on a reported error. currentState is accepted but not used by this function.
// If the request itself fails, the table is reloaded after 1.5 s.
function toggleIface(iface, currentState) {
  showNetPending(NET_ROLLBACK_SECS);
  const request = {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({iface: iface})
  };
  fetch('/api/network/toggle', request)
    .then(function(r) { return r.json(); })
    .then(function(d) {
      if (d.error) {
        hideNetPending();
        alert('Error: ' + d.error);
        return;
      }
      loadNetwork();
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    })
    .catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
|
||||
// hideNetPending stops the rollback countdown timer (if any) and hides the
// pending-change banner.
function hideNetPending() {
  if (_netCountdownTimer) {
    clearInterval(_netCountdownTimer);
  }
  _netCountdownTimer = null;
  document.getElementById('net-pending').style.display = 'none';
}
|
||||
// showNetPending shows the pending-change banner and (re)starts a one-second
// countdown from secs. When the countdown reaches zero the banner is hidden
// and the network table reloaded. A missing or sub-1 secs hides the banner.
function showNetPending(secs) {
  if (!secs || secs < 1) {
    hideNetPending();
    return;
  }

  document.getElementById('net-pending').style.display = 'block';
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);

  let remaining = secs;
  function paint() {
    document.getElementById('net-countdown').textContent = remaining;
  }

  paint();
  _netCountdownTimer = setInterval(function() {
    remaining -= 1;
    paint();
    if (remaining <= 0) {
      hideNetPending();
      loadNetwork();
    }
  }, 1000);
}
|
||||
// confirmNetChange hides the pending banner, posts the confirmation and
// reloads the network table. Request failures are ignored.
function confirmNetChange() {
  hideNetPending();
  fetch('/api/network/confirm', {method: 'POST'})
    .then(function() { return loadNetwork(); })
    .catch(function() {});
}
|
||||
// rollbackNetChange hides the pending banner, posts the rollback request and
// reloads the network table. Request failures are ignored.
function rollbackNetChange() {
  hideNetPending();
  fetch('/api/network/rollback', {method: 'POST'})
    .then(function() { return loadNetwork(); })
    .catch(function() {});
}
|
||||
// runDHCP posts a DHCP request for the interface typed into #dhcp-iface
// ('all' when the field is empty), writes the server output or error into
// #dhcp-out and keeps the pending banner in sync with the response.
// If the request itself fails, the table is reloaded after 1.5 s.
function runDHCP() {
  const iface = document.getElementById('dhcp-iface').value.trim();
  showNetPending(NET_ROLLBACK_SECS);

  const request = {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({interface: iface || 'all'})
  };
  fetch('/api/network/dhcp', request)
    .then(function(r) { return r.json(); })
    .then(function(d) {
      document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
      if (d.error) {
        hideNetPending();
        return;
      }
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
      loadNetwork();
    })
    .catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
|
||||
// setStatic posts the static IPv4 form (interface, address, prefix, gateway,
// DNS list) to /api/network/static, writes the server output or error into
// #static-out and keeps the pending banner in sync with the response.
// The DNS field is split on commas; blank entries are dropped.
// If the request itself fails, the table is reloaded after 1.5 s.
function setStatic() {
  function field(id) {
    return document.getElementById(id).value;
  }

  const dns = field('st-dns')
    .split(',')
    .map(function(s) { return s.trim(); })
    .filter(Boolean);
  showNetPending(NET_ROLLBACK_SECS);

  const payload = {
    interface: field('st-iface'),
    address: field('st-addr'),
    prefix: field('st-prefix'),
    gateway: field('st-gw'),
    dns: dns
  };
  fetch('/api/network/static', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(payload)
  })
    .then(function(r) { return r.json(); })
    .then(function(d) {
      document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
      if (d.error) {
        hideNetPending();
        return;
      }
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
      loadNetwork();
    })
    .catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
|
||||
loadNetwork();
|
||||
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
|
||||
_netRefreshTimer = setInterval(loadNetwork, 5000);
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNetwork() string {
|
||||
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
|
||||
renderNetworkInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
func renderServicesInline() string {
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div id="svc-out" style="display:none;margin-top:12px">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||
<span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||
<span id="svc-out-status" style="font-size:12px"></span>
|
||||
</div>
|
||||
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||
</div>
|
||||
<script>
|
||||
// loadServices fetches /api/services and renders one table row per unit:
// name, a state badge that toggles the unit's status text, and
// Start/Stop/Restart buttons wired to svcAction.
// All server-provided text (name, state, body) is HTML-escaped, and the
// buttons pass the unit name through a data- attribute, so a name with quotes
// cannot break out of the inline handler.
// A non-array response renders an empty table; fetch or parse failures are
// ignored and leave the previous table in place.
function loadServices() {
  function esc(v) {
    return String(v == null ? '' : v)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  fetch('/api/services').then(function(r) { return r.json(); }).then(function(svcs) {
    const rows = (Array.isArray(svcs) ? svcs : []).map(function(s) {
      const rawName = String(s.name == null ? '' : s.name);
      const name = esc(rawName);
      const st = s.state || 'unknown';
      const badge = st === 'active' ? 'badge-ok' : st === 'failed' ? 'badge-err' : 'badge-warn';
      // The element id keeps only [a-z0-9]; everything else becomes '-'.
      const id = 'svc-body-' + rawName.replace(/[^a-z0-9]/g, '-');
      const body = esc(s.body || '');

      function actionButton(action, label) {
        return '<button class="btn btn-sm btn-secondary" id="btn-' + name + '-' + action + '" data-svc="' + name + '" onclick="svcAction(this,this.dataset.svc,\'' + action + '\')">' + label + '</button>';
      }

      return '<tr>' +
        '<td style="white-space:nowrap">' + name + '</td>' +
        '<td style="white-space:nowrap"><span class="badge ' + badge + '" style="cursor:pointer" onclick="toggleBody(\'' + id + '\')">' + esc(st) + ' ▾</span>' +
        '<div id="' + id + '" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">' + body + '</pre></div>' +
        '</td>' +
        '<td style="white-space:nowrap">' +
        actionButton('start', 'Start') + ' ' +
        actionButton('stop', 'Stop') + ' ' +
        actionButton('restart', 'Restart') +
        '</td></tr>';
    }).join('');

    document.getElementById('svc-table').innerHTML =
      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>' + rows + '</table>';
  }).catch(function() {});
}
|
||||
// toggleBody flips the element with the given id between hidden and
// display:block. An unknown id is ignored.
function toggleBody(id) {
  const panel = document.getElementById(id);
  if (!panel) return;
  const hidden = panel.style.display === 'none';
  panel.style.display = hidden ? 'block' : 'none';
}
|
||||
// svcAction runs a service action (start/stop/restart) for the named unit.
// While the request is in flight the clicked button is disabled and shows
// '...'; the output panel shows the action, the server output or error, and a
// done/failed/error status. After a completed request the service table is
// reloaded 800 ms later.
function svcAction(btn, name, action) {
  var originalLabel = btn.textContent;
  btn.disabled = true;
  btn.textContent = '...';

  var panel = document.getElementById('svc-out');
  var term = document.getElementById('svc-terminal');
  var statusEl = document.getElementById('svc-out-status');
  var labelEl = document.getElementById('svc-out-label');

  function setStatus(text, color) {
    statusEl.textContent = text;
    statusEl.style.color = color;
  }
  function restoreButton() {
    btn.textContent = originalLabel;
    btn.disabled = false;
  }

  panel.style.display = 'block';
  labelEl.textContent = action + ' ' + name;
  term.textContent = 'Running...';
  setStatus('', '');

  fetch('/api/services/action', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({name: name, action: action})
  })
    .then(function(r) { return r.json(); })
    .then(function(d) {
      term.textContent = d.output || d.error || '(no output)';
      term.scrollTop = term.scrollHeight;
      if (d.status === 'ok') {
        setStatus('✓ done', 'var(--ok-fg, #2c662d)');
      } else {
        setStatus('✗ failed', 'var(--crit-fg, #9f3a38)');
      }
      restoreButton();
      setTimeout(loadServices, 800);
    })
    .catch(function(e) {
      term.textContent = 'Request failed: ' + e;
      setStatus('✗ error', 'var(--crit-fg, #9f3a38)');
      restoreButton();
    });
}
|
||||
loadServices();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderServices() string {
|
||||
return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
|
||||
renderServicesInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
663
audit/internal/webui/page_validate.go
Normal file
663
audit/internal/webui/page_validate.go
Normal file
@@ -0,0 +1,663 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// validateInventory carries the per-component summaries and GPU counts used
// to render the Validate page.
type validateInventory struct {
	// Display summaries, one per component card.
	// NOTE(review): NVIDIA is written into the page without escaping —
	// confirm the producer already returns HTML-safe text.
	CPU     string
	Memory  string
	Storage string
	NVIDIA  string
	AMD     string

	// NvidiaGPUCount feeds the duration estimates shown on the page.
	NvidiaGPUCount int
	// AMDGPUCount: presumably the number of detected AMD GPUs — confirm at the producer.
	AMDGPUCount int
}
|
||||
|
||||
// validateFmtDur formats an estimated duration for display.
// Below two minutes it is shown in seconds ("~90 s"); from two minutes up it
// is shown in minutes rounded to the nearest minute, with an exact
// half-minute rounding down ("~2 min" for 150 s, "~3 min" for 151 s).
func validateFmtDur(secs int) string {
	const minutesFrom = 120 // first value reported in minutes
	if secs >= minutesFrom {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||
|
||||
func validateTotalValidateSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUValidateSec +
|
||||
platform.SATEstimatedMemoryValidateSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
if n > 0 {
|
||||
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func validateTotalStressSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUStressSec +
|
||||
platform.SATEstimatedMemoryStressSec +
|
||||
platform.SATEstimatedNvidiaPulseTestSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
if n > 0 {
|
||||
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func renderValidate(opts HandlerOptions) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||
gpuNote := ""
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
<div style="margin-top:12px">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
</div>
|
||||
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-interconnect">` +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||
)) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satStressMode() {
|
||||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||
}
|
||||
function satModeChanged() {
|
||||
const stress = satStressMode();
|
||||
[
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
card.style.opacity = stress ? '1' : '0.5';
|
||||
const hint = document.getElementById(item.hint);
|
||||
if (hint) hint.style.display = stress ? 'none' : '';
|
||||
}
|
||||
});
|
||||
}
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
if (!satNvidiaGPUsPromise) {
|
||||
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||
.then(r => {
|
||||
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||
return r.json();
|
||||
})
|
||||
.then(list => Array.isArray(list) ? list : []);
|
||||
}
|
||||
return satNvidiaGPUsPromise;
|
||||
}
|
||||
function satSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||
return;
|
||||
}
|
||||
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
|
||||
}
|
||||
function satRenderGPUList(gpus) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (!root) return;
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
satUpdateGPUSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="sat-gpu-row">'
|
||||
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectAllGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectNoGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satLoadGPUs() {
|
||||
loadSatNvidiaGPUs().then(function(gpus) {
|
||||
satRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (root) {
|
||||
root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
}
|
||||
satUpdateGPUSelectionNote();
|
||||
});
|
||||
}
|
||||
function satGPUDisplayName(gpu) {
|
||||
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
||||
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
||||
return 'GPU ' + idx + ' — ' + name;
|
||||
}
|
||||
function satRequestBody(target, overrides) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
body.stress_mode = satStressMode();
|
||||
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||
if (overrides) {
|
||||
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||
}
|
||||
return body;
|
||||
}
|
||||
function enqueueSATTarget(target, overrides) {
|
||||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||
.then(r => r.json());
|
||||
}
|
||||
function streamSATTask(taskId, title, resetTerminal) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
if (resetTerminal) {
|
||||
term.textContent = '';
|
||||
}
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||
return new Promise(function(resolve) {
|
||||
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
satES.addEventListener('done', function(e) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
satES.onerror = function() {
|
||||
if (satES) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
}
|
||||
function selectedAMDValidateTargets() {
|
||||
const targets = [];
|
||||
const gpu = document.getElementById('sat-amd-target');
|
||||
const mem = document.getElementById('sat-amd-mem-target');
|
||||
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||
return targets;
|
||||
}
|
||||
function runSAT(target) {
|
||||
return runSATWithOverrides(target, null);
|
||||
}
|
||||
function runSATWithOverrides(target, overrides) {
|
||||
const title = (overrides && overrides.display_name) || target;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||
return enqueueSATTarget(target, overrides)
|
||||
.then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
const nvidiaPerGPUTargets = [];
|
||||
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
function satAllGPUIndicesForMulti() {
|
||||
return Promise.resolve(satSelectedGPUIndices());
|
||||
}
|
||||
function expandSATTarget(target) {
|
||||
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||
});
|
||||
}
|
||||
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||
return Promise.resolve([{target: target}]);
|
||||
}
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||
}
|
||||
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||
target: target,
|
||||
overrides: {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||
},
|
||||
label: satGPUDisplayName(gpu),
|
||||
})));
|
||||
}
|
||||
function runNvidiaFabricValidate(target) {
|
||||
satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||
});
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
if (!targets.length) return;
|
||||
if (targets.length === 1) return runSAT(targets[0]);
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— amd';
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Running AMD validate set one by one...\n';
|
||||
const labels = satLabels();
|
||||
const runNext = (idx) => {
|
||||
if (idx >= targets.length) return Promise.resolve();
|
||||
const target = targets[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||
return enqueueSATTarget(target)
|
||||
.then(d => {
|
||||
return streamSATTask(d.task_id, labels[target], false);
|
||||
}).then(function() {
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}
|
||||
function runAllSAT() {
|
||||
const cycles = 1;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
return !(btn && btn.disabled);
|
||||
});
|
||||
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||||
const expanded = [];
|
||||
for (let cycle = 0; cycle < cycles; cycle++) {
|
||||
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||||
}
|
||||
const total = expanded.length;
|
||||
let enqueued = 0;
|
||||
if (!total) {
|
||||
status.textContent = 'No tasks selected.';
|
||||
return;
|
||||
}
|
||||
const runNext = (idx) => {
|
||||
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||
const item = expanded[idx];
|
||||
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||
return enqueueSATTarget(item.target, item.overrides)
|
||||
.then(() => {
|
||||
enqueued++;
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}).catch(err => {
|
||||
status.textContent = 'Error: ' + err.message;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
<script>
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||
});
|
||||
satLoadGPUs();
|
||||
function disableSATAMDOptions(reason) {
|
||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||
const cb = document.getElementById(id);
|
||||
if (!cb) return;
|
||||
cb.disabled = true;
|
||||
cb.checked = false;
|
||||
cb.title = reason;
|
||||
});
|
||||
}
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true;
|
||||
btn.title = reason;
|
||||
btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||
const body = card.querySelector('.card-body');
|
||||
if (body) body.insertBefore(note, body.firstChild);
|
||||
}
|
||||
note.textContent = reason;
|
||||
}
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||||
unknown := "Audit snapshot not loaded."
|
||||
out := validateInventory{
|
||||
CPU: unknown,
|
||||
Memory: unknown,
|
||||
Storage: unknown,
|
||||
NVIDIA: unknown,
|
||||
AMD: unknown,
|
||||
}
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(data, &snap); err != nil {
|
||||
return out
|
||||
}
|
||||
|
||||
cpuCounts := map[string]int{}
|
||||
cpuTotal := 0
|
||||
for _, cpu := range snap.Hardware.CPUs {
|
||||
if cpu.Present != nil && !*cpu.Present {
|
||||
continue
|
||||
}
|
||||
cpuTotal++
|
||||
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
memCounts := map[string]int{}
|
||||
memTotal := 0
|
||||
for _, dimm := range snap.Hardware.Memory {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
memTotal++
|
||||
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
storageCounts := map[string]int{}
|
||||
storageTotal := 0
|
||||
for _, dev := range snap.Hardware.Storage {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
storageTotal++
|
||||
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
nvidiaCounts := map[string]int{}
|
||||
nvidiaTotal := 0
|
||||
amdCounts := map[string]int{}
|
||||
amdTotal := 0
|
||||
for _, dev := range snap.Hardware.PCIeDevices {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
if validateIsVendorGPU(dev, "nvidia") {
|
||||
nvidiaTotal++
|
||||
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
if validateIsVendorGPU(dev, "amd") {
|
||||
amdTotal++
|
||||
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
}
|
||||
|
||||
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
|
||||
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
|
||||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||
out.NvidiaGPUCount = nvidiaTotal
|
||||
out.AMDGPUCount = amdTotal
|
||||
return out
|
||||
}
|
||||
|
||||
// renderValidateCardBody returns the HTML for the body of a validate card:
// four consecutive sections holding, in order, the device summary, the
// description, the commands and the settings. The device summary and the
// settings are rendered in the muted color. All arguments are inserted
// verbatim; no escaping is performed here.
func renderValidateCardBody(devices, description, commands, settings string) string {
	const (
		plainStyle = `font-size:13px`
		mutedStyle = `font-size:13px;color:var(--muted)`
	)
	// section wraps content in one card section with the given inline style.
	section := func(style, content string) string {
		return `<div class="validate-card-section"><div style="` + style + `">` + content + `</div></div>`
	}
	return section(mutedStyle, devices) +
		section(plainStyle, description) +
		section(plainStyle, commands) +
		section(mutedStyle, settings)
}
|
||||
|
||||
// formatValidateDeviceSummary renders a one-line summary of detected devices.
//
// total is the overall device count, models maps a model name to how many
// devices carry it, and unit is the singular noun ("CPU", "GPU", ...); the
// plural is formed by appending "s". Model names are HTML-escaped and listed
// in sorted order. Result shapes:
//
//	total == 0          -> "0 <unit>s detected."
//	exactly one model   -> "<n> x <model> <unit(s)>"
//	otherwise           -> "<total> <unit(s)>: <n> x <model>, ..."
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	if total == 0 {
		return "0 " + unit + "s detected."
	}

	// Map iteration order is random; sort the names for stable output.
	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	entries := make([]string, len(names))
	for i, name := range names {
		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
	}

	noun := unit + "s"
	if total == 1 {
		noun = unit
	}

	if len(entries) != 1 {
		return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
	}
	return entries[0] + " " + noun
}
|
||||
|
||||
// addValidateModel increments the tally for a model name in counts.
// The name is whitespace-trimmed first; a blank name is tallied under
// "unknown". counts must be a non-nil map.
func addValidateModel(counts map[string]int, name string) {
	if trimmed := strings.TrimSpace(name); trimmed != "" {
		counts[trimmed]++
		return
	}
	counts["unknown"]++
}
|
||||
|
||||
// validateTrimPtr dereferences an optional string and strips surrounding
// whitespace. A nil pointer yields the empty string.
func validateTrimPtr(value *string) string {
	var trimmed string
	if value != nil {
		trimmed = strings.TrimSpace(*value)
	}
	return trimmed
}
|
||||
|
||||
// validateFirstNonEmpty returns the first argument that is not blank after
// whitespace trimming, in its trimmed form. It returns "" when every argument
// is blank or when no arguments are given.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||
return false
|
||||
}
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||
case "amd":
|
||||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// renderSATCard returns the HTML for one validate/SAT card.
//
// The card header shows label and a "Run" button whose element id is
// "sat-btn-<id>" and whose onclick handler is runAction. headerActions, when
// not blank, is appended verbatim after the Run button. body is placed inside
// the card body container. None of the arguments are escaped.
func renderSATCard(id, label, runAction, headerActions, body string) string {
	buttons := fmt.Sprintf(`<button id="sat-btn-%s" class="btn btn-primary btn-sm" onclick="%s">Run</button>`, id, runAction)
	if hasExtra := strings.TrimSpace(headerActions) != ""; hasExtra {
		buttons += headerActions
	}
	return `<div class="card"><div class="card-head card-head-actions"><span>` + label +
		`</span><div class="card-head-buttons">` + buttons +
		`</div></div><div class="card-body validate-card-body">` + body +
		`</div></div>`
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
|
||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||
const metricsChartWindow = 360
|
||||
|
||||
// metricsDownsampleAge is the age after which old metrics rows are downsampled
|
||||
// to 1 sample per minute. Data fresher than this is kept at full resolution.
|
||||
const metricsDownsampleAge = 2 * time.Hour
|
||||
|
||||
// metricsRetainWindow is the total retention period for metrics rows.
|
||||
// Rows older than this are deleted entirely by the background compactor.
|
||||
const metricsRetainWindow = 48 * time.Hour
|
||||
|
||||
var metricsCollectInterval = 5 * time.Second
|
||||
|
||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||
@@ -261,7 +269,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
|
||||
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
|
||||
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
|
||||
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||
|
||||
// Tasks
|
||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||
@@ -333,13 +345,24 @@ func (h *handler) startMetricsCollector() {
|
||||
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||
ticker := time.NewTicker(metricsCollectInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
sample := platform.SampleLiveMetrics()
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
pruneTicker := time.NewTicker(time.Hour)
|
||||
defer pruneTicker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
sample := platform.SampleLiveMetrics()
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
case <-pruneTicker.C:
|
||||
if h.metricsDB != nil {
|
||||
now := time.Now().UTC()
|
||||
_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
|
||||
_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
|
||||
}
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -573,12 +596,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
timeline := metricsTimelineSegments(samples, time.Now())
|
||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||
var overviewOk bool
|
||||
var buf []byte
|
||||
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if !ok {
|
||||
if !overviewOk {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
@@ -587,23 +612,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
_, _ = w.Write(buf)
|
||||
return
|
||||
}
|
||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||
if !ok {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
buf, err := renderMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMin,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
var buf []byte
|
||||
if stacked {
|
||||
buf, err = renderStackedMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
} else {
|
||||
buf, err = renderMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMin,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
}
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
@@ -613,12 +652,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var title string
|
||||
var yMin, yMax *float64
|
||||
labels := sampleTimeLabels(samples)
|
||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
|
||||
labels = sampleTimeLabels(samples)
|
||||
|
||||
switch {
|
||||
case path == "server-load":
|
||||
@@ -655,12 +690,19 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
case path == "server-power":
|
||||
title = "System Power"
|
||||
power := make([]float64, len(samples))
|
||||
label := "Power W"
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
if strings.TrimSpace(s.PowerSource) != "" {
|
||||
label = fmt.Sprintf("Power W · %s", s.PowerSource)
|
||||
if strings.TrimSpace(s.PowerMode) != "" {
|
||||
label += fmt.Sprintf(" (%s)", s.PowerMode)
|
||||
}
|
||||
}
|
||||
}
|
||||
power = normalizePowerSeries(power)
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
names = []string{label}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(power)
|
||||
|
||||
@@ -705,7 +747,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
case strings.HasPrefix(path, "gpu/"):
|
||||
idx, sub, ok := parseGPUChartPath(path)
|
||||
if !ok {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
switch sub {
|
||||
case "load":
|
||||
@@ -713,7 +755,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||
if util == nil && mem == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||
names = []string{"Load %", "Mem %"}
|
||||
@@ -723,7 +765,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
title = gpuDisplayLabel(idx) + " Temperature"
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
if temp == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{temp}
|
||||
names = []string{"Temp °C"}
|
||||
@@ -733,7 +775,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||
if clock == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{clock}
|
||||
names = []string{"Core Clock MHz"}
|
||||
@@ -742,7 +784,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||
if clock == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{clock}
|
||||
names = []string{"Memory Clock MHz"}
|
||||
@@ -751,7 +793,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
title = gpuDisplayLabel(idx) + " Power"
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
if power == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
@@ -759,10 +801,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
}
|
||||
|
||||
default:
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
|
||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
||||
return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
|
||||
}
|
||||
|
||||
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
||||
@@ -928,6 +970,37 @@ func normalizePowerSeries(ds []float64) []float64 {
|
||||
return out
|
||||
}
|
||||
|
||||
// psuSlotsFromSamples returns the sorted list of PSU slot numbers seen across samples.
|
||||
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
|
||||
seen := map[int]struct{}{}
|
||||
for _, s := range samples {
|
||||
for _, p := range s.PSUs {
|
||||
seen[p.Slot] = struct{}{}
|
||||
}
|
||||
}
|
||||
slots := make([]int, 0, len(seen))
|
||||
for s := range seen {
|
||||
slots = append(slots, s)
|
||||
}
|
||||
sort.Ints(slots)
|
||||
return slots
|
||||
}
|
||||
|
||||
// psuStackedTotal returns the point-by-point sum of all PSU datasets (for
// scale calculation). It returns nil when datasets is empty.
//
// The result is as long as the longest dataset; a shorter dataset contributes
// nothing past its own end. Sizing the result from the first dataset alone
// would index out of range whenever a later dataset is longer.
func psuStackedTotal(datasets [][]float64) []float64 {
	if len(datasets) == 0 {
		return nil
	}
	n := 0
	for _, ds := range datasets {
		if len(ds) > n {
			n = len(ds)
		}
	}
	total := make([]float64, n)
	for _, ds := range datasets {
		for i, v := range ds {
			total[i] += v
		}
	}
	return total
}
|
||||
|
||||
func normalizeFanSeries(ds []float64) []float64 {
|
||||
if len(ds) == 0 {
|
||||
return nil
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestChartLegendNumber(t *testing.T) {
|
||||
@@ -78,6 +79,16 @@ func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildRuntimeToRAMRowShowsPartialCopyWarning(t *testing.T) {
|
||||
row := buildRuntimeToRAMRow(schema.RuntimeHealth{ToRAMStatus: "partial"})
|
||||
if row.Status != "WARNING" {
|
||||
t.Fatalf("status=%q want WARNING", row.Status)
|
||||
}
|
||||
if !strings.Contains(row.Issue, "Partial or staged RAM copy detected") {
|
||||
t.Fatalf("issue=%q", row.Issue)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
@@ -109,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
@@ -153,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
@@ -198,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||
if !ok {
|
||||
t.Fatal("gpu-all-clock returned ok=false")
|
||||
}
|
||||
@@ -409,6 +420,49 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
|
||||
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: start,
|
||||
PSUs: []platform.PSUReading{
|
||||
{Slot: 1, PowerW: 120},
|
||||
{Slot: 2, PowerW: 130},
|
||||
},
|
||||
PowerW: 250,
|
||||
PowerSource: "sdr_psu_input",
|
||||
PowerMode: "autotuned",
|
||||
},
|
||||
{
|
||||
Timestamp: start.Add(time.Minute),
|
||||
PSUs: []platform.PSUReading{
|
||||
{Slot: 1, PowerW: 140},
|
||||
{Slot: 2, PowerW: 135},
|
||||
},
|
||||
PowerW: 275,
|
||||
PowerSource: "sdr_psu_input",
|
||||
PowerMode: "autotuned",
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, stacked, ok := chartDataFromSamples("server-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("expected server-power chart data")
|
||||
}
|
||||
if title != "System Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
if stacked {
|
||||
t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
|
||||
}
|
||||
if len(datasets) != 1 || len(names) != 1 {
|
||||
t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
|
||||
}
|
||||
if names[0] != "Power W · sdr_psu_input (autotuned)" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||
@@ -637,8 +691,14 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
`href="/benchmark"`,
|
||||
`id="benchmark-gpu-list"`,
|
||||
`/api/gpu/nvidia`,
|
||||
`/api/benchmark/nvidia/run`,
|
||||
`/api/bee-bench/nvidia/perf/run`,
|
||||
`/api/bee-bench/nvidia/power/run`,
|
||||
`/api/bee-bench/nvidia/autotune/run`,
|
||||
`/api/bee-bench/nvidia/autotune/status`,
|
||||
`benchmark-run-nccl`,
|
||||
`Run Performance Benchmark`,
|
||||
`Run Power / Thermal Fit`,
|
||||
`Autotune`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||
@@ -649,7 +709,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -691,10 +751,10 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
body := rec.Body.String()
|
||||
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Perf Results`,
|
||||
`Composite score by saved benchmark run and GPU.`,
|
||||
`GPU #0 — NVIDIA H100 PCIe`,
|
||||
`GPU #1 — NVIDIA H100 PCIe`,
|
||||
`GPU 0`,
|
||||
`GPU 1`,
|
||||
`#1`,
|
||||
wantTime,
|
||||
`1176.25`,
|
||||
@@ -730,6 +790,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`Validate and Stress:`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`nvbandwidth runs all built-in tests without a time limit`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
@@ -1094,6 +1174,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
// Runtime Health card — LiveCD checks only
|
||||
`Runtime Health`,
|
||||
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||
`Export Directory`,
|
||||
@@ -1102,16 +1183,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
`CUDA / ROCm`,
|
||||
`Required Utilities`,
|
||||
`Bee Services`,
|
||||
`<td>CPU</td>`,
|
||||
`<td>Memory</td>`,
|
||||
`<td>Storage</td>`,
|
||||
`<td>GPU</td>`,
|
||||
`CUDA runtime is not ready for GPU SAT.`,
|
||||
`Missing: nvidia-smi`,
|
||||
`bee-nvidia=inactive`,
|
||||
`cpu SAT: FAILED`,
|
||||
`storage SAT: FAILED`,
|
||||
`sat:nvidia`,
|
||||
// Hardware Summary card — component health badges
|
||||
`Hardware Summary`,
|
||||
`>CPU<`,
|
||||
`>Memory<`,
|
||||
`>Storage<`,
|
||||
`>GPU<`,
|
||||
`>PSU<`,
|
||||
`badge-warn`, // cpu Warning badge
|
||||
`badge-err`, // storage Critical badge
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||
|
||||
@@ -7,14 +7,43 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// Backoff tuning for goRecoverLoop.
const (
	// recoverLoopMaxDelay is the upper bound for the restart delay.
	recoverLoopMaxDelay = time.Minute

	// recoverLoopResetAfter is the run time after which a panic is treated
	// as a fresh failure and the restart delay goes back to its initial value.
	recoverLoopResetAfter = 30 * time.Second
)
|
||||
|
||||
// goRecoverLoop starts fn in a goroutine, restarting after panics.
|
||||
// restartDelay is the initial delay; successive panics double it up to
|
||||
// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
|
||||
// successfully for recoverLoopResetAfter without panicking.
|
||||
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||
go func() {
|
||||
delay := restartDelay
|
||||
consecutive := 0
|
||||
for {
|
||||
if !runRecoverable(name, fn) {
|
||||
start := time.Now()
|
||||
panicked := runRecoverable(name, fn)
|
||||
if !panicked {
|
||||
return
|
||||
}
|
||||
if restartDelay > 0 {
|
||||
time.Sleep(restartDelay)
|
||||
consecutive++
|
||||
if time.Since(start) >= recoverLoopResetAfter {
|
||||
delay = restartDelay
|
||||
consecutive = 1
|
||||
}
|
||||
slog.Warn("goroutine restarting after panic",
|
||||
"component", name,
|
||||
"consecutive_panics", consecutive,
|
||||
"next_delay", delay,
|
||||
)
|
||||
if delay > 0 {
|
||||
time.Sleep(delay)
|
||||
}
|
||||
if delay < recoverLoopMaxDelay {
|
||||
delay *= 2
|
||||
if delay > recoverLoopMaxDelay {
|
||||
delay = recoverLoopMaxDelay
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
|
||||
}
|
||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||
}
|
||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||
if !ok {
|
||||
return "", nil, false
|
||||
}
|
||||
buf, err := renderMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMin,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
var buf []byte
|
||||
var err error
|
||||
if stacked {
|
||||
buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||
} else {
|
||||
buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||
}
|
||||
if err != nil {
|
||||
return "", nil, false
|
||||
}
|
||||
@@ -233,6 +229,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||
b.WriteString(benchmarkCard)
|
||||
}
|
||||
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||
b.WriteString(powerCard)
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
@@ -251,7 +250,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
}
|
||||
|
||||
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "nvidia-bench-perf":
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
@@ -263,7 +264,7 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
return ""
|
||||
}
|
||||
return renderBenchmarkResultsCardFromRuns(
|
||||
"Benchmark Results",
|
||||
"Perf Results",
|
||||
"Composite score for this benchmark task.",
|
||||
"No benchmark results were saved for this task.",
|
||||
columns,
|
||||
@@ -271,15 +272,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
)
|
||||
}
|
||||
|
||||
func renderTaskPowerResultsCard(target, logText string) string {
|
||||
if strings.TrimSpace(target) != "nvidia-bench-power" {
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
if strings.TrimSpace(resultPath) == "" {
|
||||
return ""
|
||||
}
|
||||
raw, err := os.ReadFile(resultPath)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
var result platform.NvidiaPowerBenchResult
|
||||
if err := json.Unmarshal(raw, &result); err != nil {
|
||||
return ""
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
|
||||
for _, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
|
||||
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
|
||||
}
|
||||
b.WriteString(`</table></div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func taskBenchmarkResultPath(logText string) string {
|
||||
archivePath := taskArchivePathFromLog(logText)
|
||||
if archivePath == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
if runDir == archivePath {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(runDir, "result.json")
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,9 @@ const (
|
||||
var taskNames = map[string]string{
|
||||
"nvidia": "NVIDIA SAT",
|
||||
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
||||
"nvidia-benchmark": "NVIDIA Benchmark",
|
||||
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
||||
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
||||
"nvidia-bench-autotune": "NVIDIA Bee Bench Power Source Autotune",
|
||||
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
||||
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
||||
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
||||
@@ -118,13 +120,18 @@ type taskParams struct {
|
||||
StressMode bool `json:"stress_mode,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
|
||||
SizeMB int `json:"size_mb,omitempty"`
|
||||
Passes int `json:"passes,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
@@ -151,6 +158,38 @@ type burnPreset struct {
|
||||
DurationSec int
|
||||
}
|
||||
|
||||
// nvidiaRampSpec is the timing plan produced by resolveNvidiaRampPlan for an
// NVIDIA burn run. All values are in seconds.
type nvidiaRampSpec struct {
	// DurationSec is the burn duration; without staggering it equals the
	// burn preset's duration.
	DurationSec int
	// StaggerSeconds is the delay between successive GPU starts; zero means
	// no staggering.
	StaggerSeconds int
	// TotalDurationSec is the overall runtime including the stagger steps.
	TotalDurationSec int
}
|
||||
|
||||
// resolveMemoryValidatePreset picks the memory test size (in MB) and pass count
// for a memory validate run.
//
// A recognised profile name (matched case-insensitively, ignoring surrounding
// whitespace) takes precedence: "overnight" is 1024 MB x 2, "acceptance" is
// 1024 MB x 1 and "smoke" is 256 MB x 1. For any other profile the stress flag
// decides: 512 MB x 1 when set, 256 MB x 1 otherwise.
func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
	name := strings.ToLower(profile)
	name = strings.TrimSpace(name)

	if name == "overnight" {
		return 1024, 2
	}
	if name == "acceptance" {
		return 1024, 1
	}
	if name == "smoke" {
		return 256, 1
	}

	// No known profile: fall back to the stress/non-stress defaults.
	sizeMB, passes = 256, 1
	if stress {
		sizeMB = 512
	}
	return sizeMB, passes
}
|
||||
|
||||
// taskMayLeaveOrphanWorkers reports whether target names one of the task kinds
// listed below. The comparison ignores case and surrounding whitespace.
func taskMayLeaveOrphanWorkers(target string) bool {
	name := strings.ToLower(target)
	name = strings.TrimSpace(name)

	workerTargets := [...]string{
		"nvidia",
		"nvidia-targeted-stress",
		"nvidia-targeted-power",
		"nvidia-pulse",
		"nvidia-bandwidth",
		"nvidia-stress",
		"nvidia-compute",
		"nvidia-bench-perf",
		"memory",
		"memory-stress",
		"cpu",
		"sat-stress",
		"platform-stress",
	}
	for _, known := range workerTargets {
		if name == known {
			return true
		}
	}
	return false
}
|
||||
|
||||
func resolveBurnPreset(profile string) burnPreset {
|
||||
switch profile {
|
||||
case "overnight":
|
||||
@@ -162,6 +201,45 @@ func resolveBurnPreset(profile string) burnPreset {
|
||||
}
|
||||
}
|
||||
|
||||
func resolveNvidiaRampPlan(profile string, enabled bool, selected []int) (nvidiaRampSpec, error) {
|
||||
base := resolveBurnPreset(profile).DurationSec
|
||||
plan := nvidiaRampSpec{
|
||||
DurationSec: base,
|
||||
TotalDurationSec: base,
|
||||
}
|
||||
if !enabled {
|
||||
return plan, nil
|
||||
}
|
||||
count := len(selected)
|
||||
if count == 0 {
|
||||
return nvidiaRampSpec{}, fmt.Errorf("staggered NVIDIA burn requires explicit GPU selection")
|
||||
}
|
||||
if count == 1 {
|
||||
return plan, nil
|
||||
}
|
||||
|
||||
switch profile {
|
||||
case "acceptance":
|
||||
plan.StaggerSeconds = 10 * 60
|
||||
plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
|
||||
case "overnight":
|
||||
plan.StaggerSeconds = 60 * 60
|
||||
plan.TotalDurationSec = 8 * 60 * 60
|
||||
minTotal := count * 60 * 60
|
||||
if plan.TotalDurationSec < minTotal {
|
||||
plan.TotalDurationSec = minTotal
|
||||
}
|
||||
if plan.TotalDurationSec > 10*60*60 {
|
||||
return nvidiaRampSpec{}, fmt.Errorf("overnight staggered NVIDIA burn supports at most 10 GPUs")
|
||||
}
|
||||
plan.DurationSec = plan.TotalDurationSec - plan.StaggerSeconds*(count-1)
|
||||
default:
|
||||
plan.StaggerSeconds = 2 * 60
|
||||
plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
|
||||
}
|
||||
return plan, nil
|
||||
}
|
||||
|
||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||
acceptanceCycles := []platform.PlatformStressCycle{
|
||||
{LoadSec: 85, IdleSec: 5},
|
||||
@@ -509,6 +587,7 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
|
||||
if err := writeTaskReportArtifacts(t); err != nil {
|
||||
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||
}
|
||||
j.closeLog()
|
||||
if t.ErrMsg != "" {
|
||||
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||
return
|
||||
@@ -537,8 +616,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
}
|
||||
a := q.opts.App
|
||||
|
||||
recovered := len(j.lines) > 0
|
||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||
if len(j.lines) > 0 {
|
||||
if recovered {
|
||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||
}
|
||||
|
||||
@@ -579,7 +659,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
dur = 300
|
||||
}
|
||||
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
case "nvidia-benchmark":
|
||||
case "nvidia-bench-perf":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
@@ -591,7 +671,32 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
RampStep: t.params.RampStep,
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-bench-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RampStep: t.params.RampStep,
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-bench-autotune":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
SizeMB: t.params.SizeMB,
|
||||
}, t.params.BenchmarkKind, j.append)
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -601,7 +706,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if planErr != nil {
|
||||
err = planErr
|
||||
break
|
||||
}
|
||||
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||
dur = rampPlan.DurationSec
|
||||
}
|
||||
if rampPlan.StaggerSeconds > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
|
||||
case "nvidia-targeted-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -633,15 +749,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: platform.NvidiaStressLoaderNCCL,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
}, j.append)
|
||||
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||
case "nvidia-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -651,21 +759,31 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if planErr != nil {
|
||||
err = planErr
|
||||
break
|
||||
}
|
||||
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||
dur = rampPlan.DurationSec
|
||||
}
|
||||
if rampPlan.StaggerSeconds > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
StaggerSeconds: rampPlan.StaggerSeconds,
|
||||
}, j.append)
|
||||
case "memory":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
sizeMB, passes := 256, 1
|
||||
if t.params.StressMode {
|
||||
sizeMB, passes = 1024, 3
|
||||
}
|
||||
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
|
||||
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||
case "storage":
|
||||
if a == nil {
|
||||
@@ -921,6 +1039,9 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
@@ -948,6 +1069,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
@@ -1052,10 +1176,13 @@ func (q *taskQueue) loadLocked() {
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskRunning {
|
||||
// The task was interrupted by a bee-web restart. Child processes
|
||||
// (e.g. bee-gpu-burn-worker) survive the restart in their own
|
||||
// process groups and cannot be cancelled retroactively. Mark the
|
||||
// task as failed so the user can decide whether to re-run it
|
||||
// rather than blindly re-launching duplicate workers.
|
||||
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
|
||||
// their own process groups. Kill any matching stale workers before
|
||||
// marking the task failed so the next GPU test does not inherit a
|
||||
// busy DCGM slot or duplicate workers.
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
_ = platform.KillTestWorkers()
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
|
||||
@@ -366,7 +366,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
|
||||
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -398,14 +398,14 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
}
|
||||
task := &Task{
|
||||
ID: "task-bench",
|
||||
Name: "NVIDIA Benchmark",
|
||||
Target: "nvidia-benchmark",
|
||||
Name: "NVIDIA Bee Bench Perf",
|
||||
Target: "nvidia-bench-perf",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||
ArtifactsDir: artifactsDir,
|
||||
}
|
||||
ensureTaskReportPaths(task)
|
||||
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
||||
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
|
||||
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -420,9 +420,9 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
}
|
||||
html := string(body)
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Perf Results`,
|
||||
`Composite score for this benchmark task.`,
|
||||
`GPU #0 — NVIDIA H100 PCIe`,
|
||||
`GPU 0`,
|
||||
`1176.25`,
|
||||
} {
|
||||
if !strings.Contains(html, needle) {
|
||||
@@ -491,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveNvidiaRampPlan(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
profile string
|
||||
enabled bool
|
||||
selected []int
|
||||
want nvidiaRampSpec
|
||||
wantErr string
|
||||
}{
|
||||
{
|
||||
name: "disabled uses base preset",
|
||||
profile: "acceptance",
|
||||
selected: []int{0, 1},
|
||||
want: nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60},
|
||||
},
|
||||
{
|
||||
name: "smoke ramp uses two minute steps",
|
||||
profile: "smoke",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2},
|
||||
want: nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60},
|
||||
},
|
||||
{
|
||||
name: "acceptance ramp uses ten minute steps",
|
||||
profile: "acceptance",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2},
|
||||
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60},
|
||||
},
|
||||
{
|
||||
name: "overnight stays at eight hours when possible",
|
||||
profile: "overnight",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2},
|
||||
want: nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60},
|
||||
},
|
||||
{
|
||||
name: "overnight extends to keep one hour after final gpu",
|
||||
profile: "overnight",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
|
||||
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60},
|
||||
},
|
||||
{
|
||||
name: "overnight rejects impossible gpu count",
|
||||
profile: "overnight",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
|
||||
wantErr: "at most 10 GPUs",
|
||||
},
|
||||
{
|
||||
name: "enabled requires explicit selection",
|
||||
profile: "smoke",
|
||||
enabled: true,
|
||||
wantErr: "requires explicit GPU selection",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
|
||||
if tc.wantErr != "" {
|
||||
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
|
||||
t.Fatalf("err=%v want substring %q", err, tc.wantErr)
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("resolveNvidiaRampPlan error: %v", err)
|
||||
}
|
||||
if got != tc.want {
|
||||
t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||
tests := []struct {
|
||||
loader string
|
||||
@@ -595,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
|
||||
var gotSizeMB, gotPasses int
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "mem-validate-1",
|
||||
Name: "Memory SAT",
|
||||
Target: "memory",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{StressMode: true},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runMemoryAcceptancePackCtx
|
||||
runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
|
||||
gotSizeMB = sizeMB
|
||||
gotPasses = passes
|
||||
return "/tmp/memory-validate.tar.gz", nil
|
||||
}
|
||||
defer func() { runMemoryAcceptancePackCtx = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotSizeMB != 512 || gotPasses != 1 {
|
||||
t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
|
||||
@@ -1,5 +1,34 @@
|
||||
# Benchmark clock calibration research
|
||||
|
||||
## Benchmark methodology versioning
|
||||
|
||||
Every benchmark methodology change must bump the benchmark version constant in
|
||||
source code by exactly `+1`.
|
||||
|
||||
Methodology change means any change that affects comparability of benchmark
|
||||
results, including for example:
|
||||
- phase durations or phase order
|
||||
- enabled/disabled precisions
|
||||
- fallback rules
|
||||
- normalization rules
|
||||
- score formulas or weights
|
||||
- degradation thresholds
|
||||
- power calibration logic
|
||||
- thermal/power penalty logic
|
||||
|
||||
Requirements:
|
||||
- benchmark version must be stored in source code as an explicit version
|
||||
constant, not inferred from git tag or build metadata
|
||||
- benchmark report must always print the benchmark version
|
||||
- `result.json` must always include the benchmark version
|
||||
- results from different benchmark versions must be treated as non-comparable by
|
||||
default
|
||||
|
||||
Purpose:
|
||||
- prevent accidental comparison of runs produced by different methodologies
|
||||
- make historical benchmark archives self-describing even when detached from git
|
||||
- force deliberate version bumps whenever scoring or execution semantics change
|
||||
|
||||
## Status
|
||||
In progress. Baseline data from production servers pending.
|
||||
|
||||
|
||||
121
bible-local/docs/gpu-model-propagation.md
Normal file
121
bible-local/docs/gpu-model-propagation.md
Normal file
@@ -0,0 +1,121 @@
|
||||
# GPU Model Name Propagation
|
||||
|
||||
How GPU model names are detected, stored, and displayed throughout the project.
|
||||
|
||||
---
|
||||
|
||||
## Detection Sources
|
||||
|
||||
There are **three separate pipelines** for GPU model names — they use different structs and don't share state.
|
||||
|
||||
### Pipeline A — Live / SAT (nvidia-smi query at runtime)
|
||||
|
||||
**File:** `audit/internal/platform/sat.go`
|
||||
|
||||
- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
|
||||
- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
|
||||
- Used by: GPU selection UI, live metrics labels, burn/stress test logic
|
||||
|
||||
### Pipeline B — Benchmark results
|
||||
|
||||
**File:** `audit/internal/platform/benchmark.go`, line 124
|
||||
|
||||
- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
|
||||
- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
|
||||
- Used by: benchmark history table, benchmark report
|
||||
|
||||
### Pipeline C — Hardware audit JSON (PCIe schema)
|
||||
|
||||
**File:** `audit/internal/schema/hardware.go`
|
||||
|
||||
- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
|
||||
- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
|
||||
- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
|
||||
- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
|
||||
|
||||
---
|
||||
|
||||
## Key Inconsistency: NVIDIA PCIe Model is Never Set
|
||||
|
||||
`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
|
||||
|
||||
This means:
|
||||
- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
|
||||
- AMD GPUs do have their model populated
|
||||
|
||||
The fix is to copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData` (now implemented — see "Fixed Issues" below).
|
||||
|
||||
---
|
||||
|
||||
## Benchmark History "Unknown GPU" Issue
|
||||
|
||||
**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
|
||||
|
||||
**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
|
||||
|
||||
This happens for:
|
||||
- Older result files saved before the `Name` field was added
|
||||
- Runs where nvidia-smi query failed before the benchmark started
|
||||
|
||||
---
|
||||
|
||||
## Fallback Strings — Current State
|
||||
|
||||
| Location | File | Fallback string |
|
||||
|---|---|---|
|
||||
| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
|
||||
| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
|
||||
| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
|
||||
| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
|
||||
| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
|
||||
| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
|
||||
| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
|
||||
| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
|
||||
|
||||
**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
|
||||
|
||||
---
|
||||
|
||||
## GPU Selection UI
|
||||
|
||||
**File:** `audit/internal/webui/pages.go`
|
||||
|
||||
- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
|
||||
- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
|
||||
- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
|
||||
|
||||
This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
|
||||
|
||||
---
|
||||
|
||||
## Data Flow Summary
|
||||
|
||||
```
|
||||
nvidia-smi (live)
|
||||
└─ ListNvidiaGPUs() → NvidiaGPU.Name
|
||||
├─ GPU selection UI (always correct)
|
||||
├─ Live metrics labels (charts_svg.go)
|
||||
└─ SAT/burn status file (sat.go)
|
||||
|
||||
nvidia-smi (at benchmark start)
|
||||
└─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
|
||||
└─ BenchmarkGPUResult.Name (json:"name,omitempty")
|
||||
├─ Benchmark report
|
||||
└─ Benchmark history table columns
|
||||
|
||||
nvidia-smi / lspci (audit collection)
|
||||
└─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
|
||||
└─ Hardware summary page hwDescribeGPU()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Fixed Issues
|
||||
|
||||
All previously open items are now closed (item 5 cannot be fixed retroactively):
|
||||
|
||||
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
|
||||
2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
|
||||
3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
|
||||
4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
|
||||
5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).
|
||||
@@ -15,6 +15,41 @@ This applies to:
|
||||
- `iso/builder/config/package-lists/*.list.chroot`
|
||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||
|
||||
## Bootloader sync rule
|
||||
|
||||
The ISO has two independent bootloader configs that must be kept in sync manually:
|
||||
|
||||
| File | Used by |
|
||||
|------|---------|
|
||||
| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
|
||||
| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
|
||||
|
||||
live-build does NOT derive one from the other. Any new boot entry, kernel parameter
|
||||
change, or new mode added to one file must be manually mirrored in the other.
|
||||
|
||||
**Canonical entry list** (both files must have all of these):
|
||||
|
||||
| Label | Key params |
|
||||
|-------|-----------|
|
||||
| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
|
||||
| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
|
||||
| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
|
||||
| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
|
||||
| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
|
||||
| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
|
||||
|
||||
**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
|
||||
```
|
||||
net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
|
||||
numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
nowatchdog nosoftlockup
|
||||
```
|
||||
(fail-safe is the exception — it deliberately uses minimal params.)
|
||||
|
||||
**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
|
||||
live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
|
||||
configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
|
||||
|
||||
## Memtest rule
|
||||
|
||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
DEBIAN_VERSION=12
|
||||
DEBIAN_KERNEL_ABI=auto
|
||||
NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUBLAS_VERSION=13.1.1.3-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.3-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=1.0.0
|
||||
MEMTEST_VERSION=6.10-4
|
||||
|
||||
@@ -23,16 +23,17 @@ lb config noauto \
|
||||
--bootloaders "grub-efi,syslinux" \
|
||||
--debian-installer none \
|
||||
--archive-areas "main contrib non-free non-free-firmware" \
|
||||
--mirror-bootstrap "https://deb.debian.org/debian" \
|
||||
--mirror-chroot "https://deb.debian.org/debian" \
|
||||
--mirror-binary "https://deb.debian.org/debian" \
|
||||
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
|
||||
--mirror-chroot "http://mirror.mephi.ru/debian/" \
|
||||
--mirror-binary "http://mirror.mephi.ru/debian/" \
|
||||
--security true \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--debootstrap-options "--include=ca-certificates" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
@@ -33,9 +33,10 @@ typedef void *CUstream;
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||
#define MAX_STRESS_STREAMS 16
|
||||
#define MAX_CUBLAS_PROFILES 5
|
||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||
#define MAX_SINGLE_PRECISION_STREAMS 4
|
||||
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
|
||||
|
||||
static const char *ptx_source =
|
||||
".version 6.0\n"
|
||||
@@ -297,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
|
||||
return stream_count;
|
||||
}
|
||||
|
||||
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
|
||||
if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
|
||||
return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
return profile_budget_bytes;
|
||||
}
|
||||
|
||||
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||
if (!api->cuStreamDestroy) {
|
||||
return;
|
||||
@@ -643,6 +651,20 @@ static const struct profile_desc k_profiles[] = {
|
||||
CUDA_R_16F,
|
||||
CUBLAS_COMPUTE_32F_FAST_16F,
|
||||
},
|
||||
{
|
||||
"int8_tensor",
|
||||
"int8",
|
||||
75,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
128,
|
||||
CUDA_R_8I,
|
||||
CUDA_R_8I,
|
||||
CUDA_R_32I,
|
||||
CUDA_R_32I,
|
||||
CUBLAS_COMPUTE_32I,
|
||||
},
|
||||
{
|
||||
"fp8_e4m3",
|
||||
"fp8",
|
||||
@@ -689,6 +711,21 @@ static const struct profile_desc k_profiles[] = {
|
||||
#endif
|
||||
};
|
||||
|
||||
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
|
||||
|
||||
static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
|
||||
if (!(desc->enabled && cc >= desc->min_cc)) {
|
||||
return 0;
|
||||
}
|
||||
if (precision_filter != NULL) {
|
||||
return strcmp(desc->block_label, precision_filter) == 0;
|
||||
}
|
||||
/* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are
|
||||
* unstable on the current benchmark fleet and can abort the whole mixed
|
||||
* pass after earlier phases already collected useful telemetry. */
|
||||
return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
|
||||
}
|
||||
|
||||
static int load_cublaslt(struct cublaslt_api *api) {
|
||||
memset(api, 0, sizeof(*api));
|
||||
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
||||
@@ -759,10 +796,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
|
||||
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||
switch (type) {
|
||||
case CUDA_R_32F:
|
||||
case CUDA_R_32I:
|
||||
return (size_t)(elements * 4u);
|
||||
case CUDA_R_16F:
|
||||
case CUDA_R_16BF:
|
||||
return (size_t)(elements * 2u);
|
||||
case CUDA_R_8I:
|
||||
case CUDA_R_8F_E4M3:
|
||||
case CUDA_R_8F_E5M2:
|
||||
return (size_t)(elements);
|
||||
@@ -775,6 +814,16 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||
}
|
||||
}
|
||||
|
||||
static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
|
||||
if (desc->compute_type == CUBLAS_COMPUTE_32I) {
|
||||
return CUDA_R_32I;
|
||||
}
|
||||
if (desc->compute_type == CUBLAS_COMPUTE_64F) {
|
||||
return CUDA_R_64F;
|
||||
}
|
||||
return CUDA_R_32F;
|
||||
}
|
||||
|
||||
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
||||
uint64_t row_tiles = (rows + 127u) / 128u;
|
||||
uint64_t col_tiles = (cols + 63u) / 64u;
|
||||
@@ -881,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
CUstream stream,
|
||||
size_t profile_budget_bytes,
|
||||
struct prepared_profile *out) {
|
||||
memset(out, 0, sizeof(*out));
|
||||
out->desc = *desc;
|
||||
out->stream = stream;
|
||||
|
||||
size_t bytes_per_cell = 0;
|
||||
size_t attempt_budget = profile_budget_bytes;
|
||||
|
||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
||||
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
||||
@@ -894,105 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
|
||||
out->m = dim;
|
||||
out->n = dim;
|
||||
out->k = dim;
|
||||
while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
|
||||
memset(out, 0, sizeof(*out));
|
||||
out->desc = *desc;
|
||||
out->stream = stream;
|
||||
|
||||
size_t desired_workspace = profile_budget_bytes / 8u;
|
||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||
desired_workspace = 32u * 1024u * 1024u;
|
||||
}
|
||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
|
||||
out->m = dim;
|
||||
out->n = dim;
|
||||
out->k = dim;
|
||||
|
||||
size_t a_bytes = 0;
|
||||
size_t b_bytes = 0;
|
||||
size_t c_bytes = 0;
|
||||
size_t d_bytes = 0;
|
||||
size_t scale_bytes = 0;
|
||||
while (1) {
|
||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||
size_t desired_workspace = attempt_budget / 8u;
|
||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||
desired_workspace = 32u * 1024u * 1024u;
|
||||
}
|
||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||
|
||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||
if (matrix_bytes <= profile_budget_bytes) {
|
||||
size_t remaining = profile_budget_bytes - matrix_bytes;
|
||||
out->workspace_size = desired_workspace;
|
||||
if (out->workspace_size > remaining) {
|
||||
out->workspace_size = round_down_size(remaining, 256u);
|
||||
size_t a_bytes = 0;
|
||||
size_t b_bytes = 0;
|
||||
size_t c_bytes = 0;
|
||||
size_t d_bytes = 0;
|
||||
size_t scale_bytes = 0;
|
||||
while (1) {
|
||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||
|
||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||
if (matrix_bytes <= attempt_budget) {
|
||||
size_t remaining = attempt_budget - matrix_bytes;
|
||||
out->workspace_size = desired_workspace;
|
||||
if (out->workspace_size > remaining) {
|
||||
out->workspace_size = round_down_size(remaining, 256u);
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||
break;
|
||||
}
|
||||
out->m -= (uint64_t)desc->min_multiple;
|
||||
out->n = out->m;
|
||||
out->k = out->m;
|
||||
}
|
||||
if (out->m < (uint64_t)desc->min_multiple) {
|
||||
attempt_budget /= 2u;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||
return 0;
|
||||
}
|
||||
out->m -= (uint64_t)desc->min_multiple;
|
||||
out->n = out->m;
|
||||
out->k = out->m;
|
||||
}
|
||||
|
||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
cublasOperation_t transa = CUBLAS_OP_T;
|
||||
cublasOperation_t transb = CUBLAS_OP_N;
|
||||
if (!check_cublas("set TRANSA",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||
&transa,
|
||||
sizeof(transa))) ||
|
||||
!check_cublas("set TRANSB",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||
&transb,
|
||||
sizeof(transb)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (desc->needs_scalar_scale) {
|
||||
float one = 1.0f;
|
||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||
|
||||
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||
if (!check_cublas("set A scale ptr",
|
||||
|
||||
cublasOperation_t transa = CUBLAS_OP_T;
|
||||
cublasOperation_t transb = CUBLAS_OP_N;
|
||||
if (!check_cublas("set TRANSA",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||
&a_scale_ptr,
|
||||
sizeof(a_scale_ptr))) ||
|
||||
!check_cublas("set B scale ptr",
|
||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||
&transa,
|
||||
sizeof(transa))) ||
|
||||
!check_cublas("set TRANSB",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||
&b_scale_ptr,
|
||||
sizeof(b_scale_ptr)))) {
|
||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||
&transb,
|
||||
sizeof(transb)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (desc->needs_scalar_scale) {
|
||||
float one = 1.0f;
|
||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||
if (!check_cublas("set A scale ptr",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||
&a_scale_ptr,
|
||||
sizeof(a_scale_ptr))) ||
|
||||
!check_cublas("set B scale ptr",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||
&b_scale_ptr,
|
||||
sizeof(b_scale_ptr)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
||||
if (desc->needs_block_scale) {
|
||||
@@ -1032,78 +1089,94 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!check_cublas("create A layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||
!check_cublas("create B layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||
!check_cublas("create C layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||
!check_cublas("create D layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (out->workspace_size > 0) {
|
||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||
if (!check_cublas("create A layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||
!check_cublas("create B layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||
!check_cublas("create C layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||
!check_cublas("create D layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (out->workspace_size > 0) {
|
||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (!check_cublas("set workspace",
|
||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||
out->preference,
|
||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||
&out->workspace_size,
|
||||
sizeof(out->workspace_size)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int found = 0;
|
||||
if (check_cublas("heuristic",
|
||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||
out->op_desc,
|
||||
out->a_layout,
|
||||
out->b_layout,
|
||||
out->c_layout,
|
||||
out->d_layout,
|
||||
out->preference,
|
||||
1,
|
||||
&out->heuristic,
|
||||
&found)) &&
|
||||
found > 0) {
|
||||
out->ready = 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
destroy_profile(cublas, cuda, out);
|
||||
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
|
||||
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!check_cublas("set workspace",
|
||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||
out->preference,
|
||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||
&out->workspace_size,
|
||||
sizeof(out->workspace_size)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int found = 0;
|
||||
if (!check_cublas("heuristic",
|
||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||
out->op_desc,
|
||||
out->a_layout,
|
||||
out->b_layout,
|
||||
out->c_layout,
|
||||
out->d_layout,
|
||||
out->preference,
|
||||
1,
|
||||
&out->heuristic,
|
||||
&found))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
if (found <= 0) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
out->ready = 1;
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||
struct cublaslt_api *cublas,
|
||||
struct prepared_profile *profile) {
|
||||
int32_t alpha_i32 = 1;
|
||||
int32_t beta_i32 = 0;
|
||||
double alpha_f64 = 1.0;
|
||||
double beta_f64 = 0.0;
|
||||
float alpha = 1.0f;
|
||||
float beta = 0.0f;
|
||||
const void *alpha_ptr = &alpha;
|
||||
const void *beta_ptr = &beta;
|
||||
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
|
||||
alpha_ptr = &alpha_i32;
|
||||
beta_ptr = &beta_i32;
|
||||
} else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
|
||||
alpha_ptr = &alpha_f64;
|
||||
beta_ptr = &beta_f64;
|
||||
}
|
||||
return check_cublas(profile->desc.name,
|
||||
cublas->cublasLtMatmul(handle,
|
||||
profile->op_desc,
|
||||
&alpha,
|
||||
alpha_ptr,
|
||||
(const void *)(uintptr_t)profile->a_dev,
|
||||
profile->a_layout,
|
||||
(const void *)(uintptr_t)profile->b_dev,
|
||||
profile->b_layout,
|
||||
&beta,
|
||||
beta_ptr,
|
||||
(const void *)(uintptr_t)profile->c_dev,
|
||||
profile->c_layout,
|
||||
(void *)(uintptr_t)profile->d_dev,
|
||||
@@ -1121,9 +1194,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int cc_minor,
|
||||
int seconds,
|
||||
int size_mb,
|
||||
const char *precision_filter,
|
||||
struct stress_report *report) {
|
||||
struct cublaslt_api cublas;
|
||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
|
||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
|
||||
cublasLtHandle_t handle = NULL;
|
||||
CUcontext ctx = NULL;
|
||||
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||
@@ -1133,11 +1207,12 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int active = 0;
|
||||
int mp_count = 0;
|
||||
int stream_count = 1;
|
||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||
int profile_count = PROFILE_COUNT;
|
||||
int prepared_count = 0;
|
||||
size_t requested_budget = 0;
|
||||
size_t total_budget = 0;
|
||||
size_t per_profile_budget = 0;
|
||||
int budget_profiles = 0;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||
@@ -1158,8 +1233,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count profiles matching the filter (for deciding what to run). */
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
||||
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||
planned++;
|
||||
}
|
||||
}
|
||||
@@ -1170,18 +1246,42 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count all profiles active on this GPU regardless of filter.
|
||||
* Mixed phases still divide budget across the full precision set, while
|
||||
* single-precision benchmark phases dedicate budget only to active
|
||||
* profiles matching precision_filter. */
|
||||
int planned_total = 0;
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||
planned_total++;
|
||||
}
|
||||
}
|
||||
if (planned_total < planned) {
|
||||
planned_total = planned;
|
||||
}
|
||||
budget_profiles = planned_total;
|
||||
if (precision_filter != NULL) {
|
||||
budget_profiles = planned;
|
||||
}
|
||||
if (budget_profiles <= 0) {
|
||||
budget_profiles = planned_total;
|
||||
}
|
||||
|
||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||
cuda->cuStreamCreate &&
|
||||
cuda->cuStreamDestroy) {
|
||||
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
||||
stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
|
||||
}
|
||||
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
|
||||
stream_count = MAX_SINGLE_PRECISION_STREAMS;
|
||||
}
|
||||
if (stream_count > 1) {
|
||||
int created = 0;
|
||||
@@ -1194,18 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
}
|
||||
report->stream_count = stream_count;
|
||||
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
||||
per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
|
||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (precision_filter != NULL) {
|
||||
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
|
||||
}
|
||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
mp_count,
|
||||
budget_profiles,
|
||||
per_profile_budget / (1024u * 1024u));
|
||||
|
||||
for (int i = 0; i < profile_count; i++) {
|
||||
@@ -1218,6 +1322,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
desc->min_cc);
|
||||
continue;
|
||||
}
|
||||
if (!profile_allowed_for_run(desc, cc, precision_filter)) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=SKIPPED benchmark_disabled\n",
|
||||
desc->name);
|
||||
continue;
|
||||
}
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
CUstream stream = streams[lane];
|
||||
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
||||
@@ -1335,10 +1446,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
#endif
|
||||
|
||||
static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
|
||||
printf("device=%s\n", report->device);
|
||||
printf("device_index=%d\n", device_index);
|
||||
printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
|
||||
printf("backend=%s\n", report->backend);
|
||||
printf("duration_s=%d\n", seconds);
|
||||
printf("buffer_mb=%d\n", report->buffer_mb);
|
||||
printf("streams=%d\n", report->stream_count);
|
||||
printf("iterations=%lu\n", report->iterations);
|
||||
printf("checksum=%llu\n", (unsigned long long)report->checksum);
|
||||
if (report->details[0] != '\0') {
|
||||
printf("%s", report->details);
|
||||
}
|
||||
printf("status=OK\n");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int seconds = 5;
|
||||
int size_mb = 64;
|
||||
int device_index = 0;
|
||||
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
|
||||
const char *precision_plan = NULL;
|
||||
const char *precision_plan_seconds = NULL;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||
seconds = atoi(argv[++i]);
|
||||
@@ -1346,8 +1476,16 @@ int main(int argc, char **argv) {
|
||||
size_mb = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||
device_index = atoi(argv[++i]);
|
||||
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
|
||||
precision_filter = argv[++i];
|
||||
} else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
|
||||
precision_plan = argv[++i];
|
||||
} else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
|
||||
precision_plan_seconds = argv[++i];
|
||||
} else {
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
||||
fprintf(stderr,
|
||||
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
|
||||
argv[0]);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
@@ -1407,26 +1545,94 @@ int main(int argc, char **argv) {
|
||||
int ok = 0;
|
||||
|
||||
#if HAVE_CUBLASLT_HEADERS
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
|
||||
if (precision_plan != NULL && precision_plan[0] != '\0') {
|
||||
char *plan_copy = strdup(precision_plan);
|
||||
char *plan_seconds_copy = NULL;
|
||||
int phase_seconds[32] = {0};
|
||||
int phase_seconds_count = 0;
|
||||
int phase_ok = 0;
|
||||
if (plan_copy == NULL) {
|
||||
fprintf(stderr, "failed to allocate precision plan buffer\n");
|
||||
return 1;
|
||||
}
|
||||
if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
|
||||
plan_seconds_copy = strdup(precision_plan_seconds);
|
||||
if (plan_seconds_copy == NULL) {
|
||||
free(plan_copy);
|
||||
fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
|
||||
return 1;
|
||||
}
|
||||
for (char *sec_token = strtok(plan_seconds_copy, ",");
|
||||
sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
|
||||
sec_token = strtok(NULL, ",")) {
|
||||
while (*sec_token == ' ' || *sec_token == '\t') {
|
||||
sec_token++;
|
||||
}
|
||||
if (*sec_token == '\0') {
|
||||
continue;
|
||||
}
|
||||
phase_seconds[phase_seconds_count++] = atoi(sec_token);
|
||||
}
|
||||
}
|
||||
int phase_idx = 0;
|
||||
for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
|
||||
while (*token == ' ' || *token == '\t') {
|
||||
token++;
|
||||
}
|
||||
if (*token == '\0') {
|
||||
continue;
|
||||
}
|
||||
const char *phase_name = token;
|
||||
const char *phase_filter = token;
|
||||
if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
|
||||
phase_filter = NULL;
|
||||
}
|
||||
int phase_duration = seconds;
|
||||
if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
|
||||
phase_duration = phase_seconds[phase_idx];
|
||||
}
|
||||
printf("phase_begin=%s\n", phase_name);
|
||||
fflush(stdout);
|
||||
memset(&report, 0, sizeof(report));
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
|
||||
if (ok) {
|
||||
print_stress_report(&report, device_index, phase_duration);
|
||||
phase_ok = 1;
|
||||
} else {
|
||||
printf("phase_error=%s\n", phase_name);
|
||||
if (report.details[0] != '\0') {
|
||||
printf("%s", report.details);
|
||||
if (report.details[strlen(report.details) - 1] != '\n') {
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
printf("status=FAILED\n");
|
||||
}
|
||||
printf("phase_end=%s\n", phase_name);
|
||||
fflush(stdout);
|
||||
}
|
||||
free(plan_seconds_copy);
|
||||
free(plan_copy);
|
||||
return phase_ok ? 0 : 1;
|
||||
}
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
||||
#endif
|
||||
if (!ok) {
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
|
||||
if (precision_filter != NULL) {
|
||||
fprintf(stderr,
|
||||
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
|
||||
precision_filter,
|
||||
name,
|
||||
cc_major,
|
||||
cc_minor);
|
||||
return 1;
|
||||
}
|
||||
int ptx_mb = size_mb;
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("device=%s\n", report.device);
|
||||
printf("device_index=%d\n", device_index);
|
||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
||||
printf("backend=%s\n", report.backend);
|
||||
printf("duration_s=%d\n", seconds);
|
||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
||||
printf("streams=%d\n", report.stream_count);
|
||||
printf("iterations=%lu\n", report.iterations);
|
||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
||||
if (report.details[0] != '\0') {
|
||||
printf("%s", report.details);
|
||||
}
|
||||
printf("status=OK\n");
|
||||
print_stress_report(&report, device_index, seconds);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -161,6 +161,7 @@ run_variant() {
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-e BEE_REQUIRE_MEMTEST=1 \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||
@@ -175,6 +176,7 @@ run_variant() {
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-e BEE_REQUIRE_MEMTEST=1 \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||
|
||||
@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
export MEMTEST_VERSION
|
||||
export PATH="$PATH:/usr/local/go/bin"
|
||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||
|
||||
@@ -125,6 +126,37 @@ resolve_iso_version() {
|
||||
resolve_audit_version
|
||||
}
|
||||
|
||||
sync_builder_workdir() {
|
||||
src_dir="$1"
|
||||
dst_dir="$2"
|
||||
|
||||
mkdir -p "$dst_dir"
|
||||
|
||||
# Historical bug: old workdirs could keep config/bootloaders/grub-pc even
|
||||
# after the source tree moved to grub-efi only. Remove bootloaders eagerly
|
||||
# so reused workdirs cannot leak stale templates into a new ISO build.
|
||||
rm -rf "$dst_dir/config/bootloaders"
|
||||
|
||||
rsync -a --delete \
|
||||
--exclude='cache/' \
|
||||
--exclude='chroot/' \
|
||||
--exclude='.build/' \
|
||||
--exclude='*.iso' \
|
||||
--exclude='*.packages' \
|
||||
--exclude='*.contents' \
|
||||
--exclude='*.files' \
|
||||
"$src_dir/" "$dst_dir/"
|
||||
|
||||
if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
|
||||
echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
|
||||
exit 1
|
||||
fi
|
||||
if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
|
||||
echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
iso_list_files() {
|
||||
iso_path="$1"
|
||||
|
||||
@@ -202,7 +234,7 @@ dump_memtest_debug() {
|
||||
|
||||
echo "-- source bootloader templates --"
|
||||
for cfg in \
|
||||
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
|
||||
"${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \
|
||||
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
||||
if [ -f "$cfg" ]; then
|
||||
echo " file: $cfg"
|
||||
@@ -465,6 +497,75 @@ validate_iso_memtest() {
|
||||
echo "=== memtest validation OK ==="
|
||||
}
|
||||
|
||||
validate_iso_live_boot_entries() {
|
||||
iso_path="$1"
|
||||
echo "=== validating live boot entries in ISO ==="
|
||||
|
||||
[ -f "$iso_path" ] || {
|
||||
echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
|
||||
exit 1
|
||||
}
|
||||
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
||||
echo "ERROR: ISO reader unavailable for live boot validation" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
grub_cfg="$(mktemp)"
|
||||
isolinux_cfg="$(mktemp)"
|
||||
|
||||
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||
echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||
echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
|
||||
if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
|
||||
echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
|
||||
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
|
||||
echo "ERROR: GRUB toram entry is missing" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'linux .*boot=live ' "$grub_cfg" || {
|
||||
echo "ERROR: GRUB live entry is missing boot=live" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
|
||||
echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
|
||||
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
|
||||
echo "ERROR: isolinux live entry is missing boot=live" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
|
||||
echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
echo "=== live boot validation OK ==="
|
||||
}
|
||||
|
||||
validate_iso_nvidia_runtime() {
|
||||
iso_path="$1"
|
||||
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||
@@ -541,6 +642,185 @@ label memtest
|
||||
EOF
|
||||
}
|
||||
|
||||
extract_live_grub_entry() {
|
||||
cfg="$1"
|
||||
live_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||
live_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||
[ -n "$live_linux" ] || return 1
|
||||
[ -n "$live_initrd" ] || return 1
|
||||
|
||||
grub_kernel="$(printf '%s\n' "$live_linux" | awk '{print $2}')"
|
||||
grub_append="$(printf '%s\n' "$live_linux" | cut -d' ' -f3-)"
|
||||
grub_initrd="$(printf '%s\n' "$live_initrd" | awk '{print $2}')"
|
||||
[ -n "$grub_kernel" ] || return 1
|
||||
[ -n "$grub_append" ] || return 1
|
||||
[ -n "$grub_initrd" ] || return 1
|
||||
return 0
|
||||
}
|
||||
|
||||
load_live_build_append() {
|
||||
lb_dir="$1"
|
||||
binary_cfg="$lb_dir/config/binary"
|
||||
[ -f "$binary_cfg" ] || return 1
|
||||
|
||||
# config/binary is generated by live-build and contains shell variable
|
||||
# assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
|
||||
# shellcheck disable=SC1090
|
||||
. "$binary_cfg"
|
||||
|
||||
[ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
|
||||
live_build_append="$LB_BOOTAPPEND_LIVE"
|
||||
return 0
|
||||
}
|
||||
|
||||
extract_live_isolinux_entry() {
|
||||
cfg="$1"
|
||||
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||
isolinux_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||
isolinux_append="$(awk '/^[[:space:]]*append[[:space:]]+/ { sub(/^[[:space:]]*append[[:space:]]+/, ""); print; exit }' "$cfg")"
|
||||
[ -n "$isolinux_linux" ] || return 1
|
||||
[ -n "$isolinux_initrd" ] || return 1
|
||||
[ -n "$isolinux_append" ] || return 1
|
||||
|
||||
isolinux_kernel="$(printf '%s\n' "$isolinux_linux" | awk '{print $2}')"
|
||||
isolinux_initrd_path="$(printf '%s\n' "$isolinux_initrd" | awk '{print $2}')"
|
||||
[ -n "$isolinux_kernel" ] || return 1
|
||||
[ -n "$isolinux_initrd_path" ] || return 1
|
||||
return 0
|
||||
}
|
||||
|
||||
write_canonical_grub_cfg() {
|
||||
cfg="$1"
|
||||
kernel="$2"
|
||||
append_live="$3"
|
||||
initrd="$4"
|
||||
|
||||
cat > "$cfg" <<EOF
|
||||
source /boot/grub/config.cfg
|
||||
|
||||
echo ""
|
||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||
echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||
linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
|
||||
if [ "\${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
}
|
||||
else
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
linux16 /boot/memtest86+x64.bin
|
||||
}
|
||||
fi
|
||||
|
||||
if [ "\${grub_platform}" = "efi" ]; then
|
||||
menuentry "UEFI Firmware Settings" {
|
||||
fwsetup
|
||||
}
|
||||
fi
|
||||
EOF
|
||||
}
|
||||
|
||||
write_canonical_isolinux_cfg() {
|
||||
cfg="$1"
|
||||
kernel="$2"
|
||||
initrd="$3"
|
||||
append_live="$4"
|
||||
|
||||
cat > "$cfg" <<EOF
|
||||
label live-@FLAVOUR@-normal
|
||||
menu label ^EASY-BEE
|
||||
menu default
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE (^load to RAM)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-gsp-off
|
||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^KMS, no nomodeset)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-kms-gsp-off
|
||||
menu label EASY-BEE (KMS, ^GSP=off)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
EOF
|
||||
}
|
||||
|
||||
enforce_live_build_bootloader_assets() {
|
||||
lb_dir="$1"
|
||||
grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
|
||||
grub_dir="$lb_dir/binary/boot/grub"
|
||||
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||
|
||||
if ! load_live_build_append "$lb_dir"; then
|
||||
echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
|
||||
live_build_append=""
|
||||
fi
|
||||
|
||||
if [ -f "$grub_cfg" ]; then
|
||||
if extract_live_grub_entry "$grub_cfg"; then
|
||||
mkdir -p "$grub_dir/live-theme"
|
||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
||||
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
||||
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||
else
|
||||
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -f "$isolinux_cfg" ]; then
|
||||
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
||||
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
|
||||
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
||||
else
|
||||
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
copy_memtest_from_deb() {
|
||||
deb="$1"
|
||||
dst_boot="$2"
|
||||
@@ -775,6 +1055,7 @@ run_optional_step_sh() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
mkdir -p "${LOG_DIR}" 2>/dev/null || true
|
||||
step_log="${LOG_DIR}/${step_slug}.log"
|
||||
echo ""
|
||||
echo "=== optional step: ${step_name} ==="
|
||||
@@ -798,13 +1079,14 @@ start_build_log
|
||||
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
||||
echo "=== refreshing apt index to detect current kernel ABI ==="
|
||||
apt-get update -qq
|
||||
apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
|
||||
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
||||
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
||||
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
||||
| head -1)
|
||||
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
||||
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
||||
echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
||||
@@ -873,9 +1155,37 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||
fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||
if [ -n "$fp4_type_match" ]; then
|
||||
echo "fp4_header_symbol=present"
|
||||
echo "$fp4_type_match"
|
||||
else
|
||||
echo "fp4_header_symbol=missing"
|
||||
fi
|
||||
if [ -n "$fp4_scale_match" ]; then
|
||||
echo "fp4_scale_mode_symbol=present"
|
||||
echo "$fp4_scale_match"
|
||||
else
|
||||
echo "fp4_scale_mode_symbol=missing"
|
||||
fi
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
for dep in \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
"${BUILDER_DIR}/VERSIONS"; do
|
||||
if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
break
|
||||
fi
|
||||
done
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
|
||||
find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
@@ -889,21 +1199,19 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
else
|
||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||
fi
|
||||
echo "=== bee-gpu-burn compiled profile probe ==="
|
||||
if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
|
||||
echo "fp4_profile_string=present"
|
||||
else
|
||||
echo "fp4_profile_string=missing"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
|
||||
# Sync builder config into variant work dir, preserving lb cache.
|
||||
rsync -a --delete \
|
||||
--exclude='cache/' \
|
||||
--exclude='chroot/' \
|
||||
--exclude='.build/' \
|
||||
--exclude='*.iso' \
|
||||
--exclude='*.packages' \
|
||||
--exclude='*.contents' \
|
||||
--exclude='*.files' \
|
||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||
sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
|
||||
|
||||
# Share deb package cache across variants.
|
||||
# Restore: populate work dir cache from shared cache before build.
|
||||
@@ -917,86 +1225,6 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||
fi
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
|
||||
cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
|
||||
source /boot/grub/config.cfg
|
||||
|
||||
echo ""
|
||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||
echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — fail-safe" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
}
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
}
|
||||
else
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
linux16 /boot/memtest86+x64.bin
|
||||
}
|
||||
fi
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "UEFI Firmware Settings" {
|
||||
fwsetup
|
||||
}
|
||||
fi
|
||||
EOF
|
||||
|
||||
cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
|
||||
label live-@FLAVOUR@-normal
|
||||
menu label ^EASY-BEE
|
||||
menu default
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@
|
||||
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^graphics/KMS)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE (^load to RAM)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ toram
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
EOF
|
||||
fi
|
||||
|
||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||
rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||
@@ -1225,6 +1453,7 @@ fi
|
||||
# --- substitute version placeholders in package list and archive ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
sed -i \
|
||||
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
@@ -1267,10 +1496,18 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||
export BEE_GPU_VENDOR_UPPER
|
||||
|
||||
cd "${LB_DIR}"
|
||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||
echo "=== enforcing canonical bootloader assets ==="
|
||||
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
||||
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
|
||||
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
||||
|
||||
# --- persist deb package cache back to shared location ---
|
||||
# This allows the second variant to reuse all downloaded packages.
|
||||
@@ -1295,6 +1532,7 @@ if [ -f "$ISO_RAW" ]; then
|
||||
fi
|
||||
fi
|
||||
validate_iso_memtest "$ISO_RAW"
|
||||
validate_iso_live_boot_entries "$ISO_RAW"
|
||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
|
||||
@@ -23,9 +23,9 @@ insmod serial
|
||||
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
||||
|
||||
insmod gfxterm
|
||||
insmod png
|
||||
|
||||
source /boot/grub/theme.cfg
|
||||
|
||||
terminal_input console serial
|
||||
terminal_output gfxterm serial
|
||||
|
||||
insmod png
|
||||
source /boot/grub/theme.cfg
|
||||
28
iso/builder/config/bootloaders/grub-efi/grub.cfg
Normal file
28
iso/builder/config/bootloaders/grub-efi/grub.cfg
Normal file
@@ -0,0 +1,28 @@
|
||||
source /boot/grub/config.cfg
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
}
|
||||
else
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
linux16 /boot/memtest86+x64.bin
|
||||
}
|
||||
fi
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "UEFI Firmware Settings" {
|
||||
fwsetup
|
||||
}
|
||||
fi
|
||||
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 78 KiB |
@@ -5,6 +5,13 @@ title-text: ""
|
||||
message-font: "Unifont Regular 16"
|
||||
terminal-font: "Unifont Regular 16"
|
||||
|
||||
#bee logo - centered, upper third of screen
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
|
||||
#help bar at the bottom
|
||||
+ label {
|
||||
top = 100%-50
|
||||
@@ -21,17 +28,17 @@ terminal-font: "Unifont Regular 16"
|
||||
+ boot_menu {
|
||||
left = 20%
|
||||
width = 60%
|
||||
top = 62%
|
||||
height = 38%-80
|
||||
top = 65%
|
||||
height = 35%-80
|
||||
item_color = "#c88000"
|
||||
item_font = "Unifont Regular 16"
|
||||
selected_item_color= "#f5a800"
|
||||
selected_item_font = "Unifont Regular 16"
|
||||
item_height = 16
|
||||
item_padding = 0
|
||||
item_height = 20
|
||||
item_padding = 2
|
||||
item_spacing = 4
|
||||
icon_width = 0
|
||||
icon_heigh = 0
|
||||
icon_height = 0
|
||||
item_icon_space = 0
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
set color_normal=light-gray/black
|
||||
set color_highlight=yellow/black
|
||||
|
||||
if [ -e /boot/grub/splash.png ]; then
|
||||
if [ -e /boot/grub/live-theme/theme.txt ]; then
|
||||
set theme=/boot/grub/live-theme/theme.txt
|
||||
else
|
||||
set menu_color_normal=yellow/black
|
||||
@@ -1,49 +0,0 @@
|
||||
source /boot/grub/config.cfg
|
||||
|
||||
echo ""
|
||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||
echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — fail-safe" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
}
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
}
|
||||
else
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
linux16 /boot/memtest86+x64.bin
|
||||
}
|
||||
fi
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "UEFI Firmware Settings" {
|
||||
fwsetup
|
||||
}
|
||||
fi
|
||||
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
|
||||
menu default
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^graphics/KMS)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE (^load to RAM)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
||||
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-gsp-off
|
||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-kms-gsp-off
|
||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^KMS, no nomodeset)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-kms-gsp-off
|
||||
menu label EASY-BEE (KMS, ^GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
|
||||
@@ -25,6 +25,7 @@ ensure_bee_console_user() {
|
||||
ensure_bee_console_user
|
||||
|
||||
# Enable common bee services
|
||||
systemctl enable bee-hpc-tuning.service
|
||||
systemctl enable bee-network.service
|
||||
systemctl enable bee-preflight.service
|
||||
systemctl enable bee-audit.service
|
||||
@@ -42,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||
# Enable GPU-vendor specific services
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||
systemctl enable bee-nvidia.service
|
||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||
@@ -55,13 +57,16 @@ fi
|
||||
# nogpu: no GPU services needed
|
||||
|
||||
# Ensure scripts are executable
|
||||
chmod +x /usr/local/bin/bee-hpc-tuning 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
#!/bin/sh
|
||||
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
||||
set -e
|
||||
echo "=== generating bee wallpaper ==="
|
||||
mkdir -p /usr/share/bee
|
||||
|
||||
python3 - <<'PYEOF'
|
||||
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||
import os
|
||||
|
||||
W, H = 1920, 1080
|
||||
|
||||
ASCII_ART = [
|
||||
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
||||
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
||||
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
||||
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
||||
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
||||
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
||||
]
|
||||
SUBTITLE = " Hardware Audit LiveCD"
|
||||
|
||||
FG = (0xF6, 0xD0, 0x47)
|
||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||
SHADOW = (0x5E, 0x47, 0x05)
|
||||
SUB = (0x96, 0x7A, 0x17)
|
||||
BG = (0x05, 0x05, 0x05)
|
||||
|
||||
MONO_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
||||
]
|
||||
SUB_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
||||
]
|
||||
|
||||
|
||||
def load_font(candidates, size):
|
||||
for path in candidates:
|
||||
if os.path.exists(path):
|
||||
return ImageFont.truetype(path, size)
|
||||
return ImageFont.load_default()
|
||||
|
||||
|
||||
def mono_metrics(font):
|
||||
probe = Image.new('L', (W, H), 0)
|
||||
draw = ImageDraw.Draw(probe)
|
||||
char_w = int(round(draw.textlength("M", font=font)))
|
||||
bb = draw.textbbox((0, 0), "Mg", font=font)
|
||||
char_h = bb[3] - bb[1]
|
||||
return char_w, char_h
|
||||
|
||||
|
||||
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
|
||||
width = max(len(line) for line in lines) * char_w
|
||||
height = len(lines) * char_h + line_gap * (len(lines) - 1)
|
||||
mask = Image.new('L', (width, height), 0)
|
||||
draw = ImageDraw.Draw(mask)
|
||||
for row, line in enumerate(lines):
|
||||
y = row * (char_h + line_gap)
|
||||
for col, ch in enumerate(line):
|
||||
if ch == ' ':
|
||||
continue
|
||||
x = col * char_w
|
||||
draw.text((x, y), ch, font=font, fill=255)
|
||||
return mask
|
||||
|
||||
|
||||
img = Image.new('RGB', (W, H), BG)
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Soft amber glow under the logo without depending on font rendering.
|
||||
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
||||
glow_draw = ImageDraw.Draw(glow)
|
||||
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
||||
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||
|
||||
TARGET_LOGO_W = 400
|
||||
max_chars = max(len(line) for line in ASCII_ART)
|
||||
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
||||
_probe_cw, _ = mono_metrics(_probe_font)
|
||||
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
||||
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
||||
char_w, char_h = mono_metrics(font_logo)
|
||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
||||
logo_w, logo_h = logo_mask.size
|
||||
logo_x = (W - logo_w) // 2
|
||||
logo_y = 380
|
||||
|
||||
sh_off = max(1, font_size_logo // 6)
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
||||
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||
|
||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||
sub_y = logo_y + logo_h + 48
|
||||
draw = ImageDraw.Draw(img)
|
||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||
|
||||
img = img.convert('RGB')
|
||||
|
||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||
PYEOF
|
||||
|
||||
echo "=== wallpaper done ==="
|
||||
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/sh
# 9011-toram-rsync.hook.chroot
#
# Adds rsync to the initramfs so that live-boot's toram code takes the
# rsync --progress path instead of the silent "cp -a" fallback.
#
# live-boot's 9990-toram-todisk.sh already contains:
#   if [ -x /bin/rsync ]; then
#       rsync -a --progress ... 1>/dev/console
#   else
#       cp -a ...               # no output
#   fi
#
# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
# which copies the binary + all shared-library dependencies into the initrd,
# and then rebuild the initramfs of the newest installed kernel.

set -e

HOOK_DIR="/etc/initramfs-tools/hooks"
HOOK="${HOOK_DIR}/bee-rsync"

# The generated hook quietly skips rsync when the binary is absent, which would
# leave toram on the silent "cp -a" path. Make that visible in the build log.
if [ ! -x /usr/bin/rsync ]; then
    echo "9011-toram-rsync: WARNING: /usr/bin/rsync not found in chroot; toram will fall back to cp -a" >&2
fi

mkdir -p "${HOOK_DIR}"

# Quoted heredoc delimiter: the hook body is written verbatim, nothing is
# expanded at build time.
cat > "${HOOK}" << 'EOF'
#!/bin/sh
# initramfs hook: include rsync for live-boot toram progress output
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac

. /usr/share/initramfs-tools/hook-functions

if [ -x /usr/bin/rsync ]; then
    copy_exec /usr/bin/rsync /bin
fi
EOF

chmod +x "${HOOK}"

echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"

# Rebuild initramfs so the hook takes effect in the ISO's initrd.img.
# The newest kernel (version-sorted) under /lib/modules is the one rebuilt.
KVER=$(ls /lib/modules | sort -V | tail -1)
if [ -z "${KVER}" ]; then
    # Without this guard an empty version would be handed to update-initramfs
    # and the build would stop with an unrelated-looking error.
    echo "9011-toram-rsync: ERROR: no kernel found under /lib/modules; cannot rebuild initramfs" >&2
    exit 1
fi
echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
update-initramfs -u -k "${KVER}"
echo "9011-toram-rsync: done"
|
||||
@@ -5,6 +5,8 @@ set -e
|
||||
|
||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||
|
||||
# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
|
||||
# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
|
||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||
BINARY_BOOT_DIR="binary/boot"
|
||||
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||
@@ -24,15 +26,23 @@ fail_or_warn() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# warn_only MESSAGE
#
# Log MESSAGE as a warning and carry on; never fails the build.
# Used for a missing grub.cfg / live.cfg: when binary hooks run, live-build may
# not have generated them yet (lb binary_grub-efi / lb binary_syslinux come
# later). The templates already carry hardcoded memtest entries, so the missing
# file is not an error at this point; validate_iso_memtest() checks the final
# ISO instead.
warn_only() { log "WARNING: ${1}"; }
|
||||
|
||||
copy_memtest_file() {
|
||||
src="$1"
|
||||
base="$(basename "$src")"
|
||||
dst="${BINARY_BOOT_DIR}/${base}"
|
||||
dst_name="${2:-$(basename "$src")}"
|
||||
dst="${BINARY_BOOT_DIR}/${dst_name}"
|
||||
|
||||
[ -f "$src" ] || return 1
|
||||
mkdir -p "${BINARY_BOOT_DIR}"
|
||||
cp "$src" "$dst"
|
||||
log "copied ${base} from ${src}"
|
||||
log "copied ${dst_name} from ${src}"
|
||||
}
|
||||
|
||||
extract_memtest_from_deb() {
|
||||
@@ -41,14 +51,44 @@ extract_memtest_from_deb() {
|
||||
|
||||
log "extracting memtest payload from ${deb}"
|
||||
dpkg-deb -x "$deb" "$tmpdir"
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
if [ -f "${tmpdir}/boot/${f}" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/${f}"
|
||||
fi
|
||||
done
|
||||
|
||||
# EFI binary: both 5.x and 6.x use memtest86+x64.efi
|
||||
if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
|
||||
fi
|
||||
|
||||
# BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
|
||||
if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
|
||||
elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
|
||||
fi
|
||||
|
||||
rm -rf "$tmpdir"
|
||||
}
|
||||
|
||||
# download_and_extract_memtest
#
# Last-resort source for the memtest payload: fetch the memtest86+ .deb with
# "apt-get download" into a scratch directory and hand it to
# extract_memtest_from_deb. MEMTEST_VERSION, when set and non-empty, pins the
# package version. A failed download is retried once after "apt-get update";
# if no .deb turns up, the failure is only logged.
download_and_extract_memtest() {
    tmpdl="$(mktemp -d)"

    # "memtest86+" or "memtest86+=<version>" when a version is pinned.
    pkg_spec="memtest86+${MEMTEST_VERSION:+=${MEMTEST_VERSION}}"
    log "downloading ${pkg_spec} from apt"

    # Subshells keep the cd from leaking into the caller's working directory.
    ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || {
        log "apt download failed, retrying after apt-get update"
        apt-get update -qq >/dev/null 2>&1 || true
        ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
    }

    # Success is judged by whether a package file actually arrived.
    deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
    if [ -z "$deb" ]; then
        log "apt download of memtest86+ failed"
    else
        extract_memtest_from_deb "$deb"
    fi

    rm -rf "$tmpdl"
}
|
||||
|
||||
ensure_memtest_binaries() {
|
||||
missing=0
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
|
||||
done
|
||||
[ "$missing" -eq 1 ] || return 0
|
||||
|
||||
# 1. Try files already placed by lb binary_memtest or chroot
|
||||
for root in chroot/boot /boot; do
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
||||
done
|
||||
# 6.x BIOS binary may lack x64 in name — copy with normalised name
|
||||
if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
|
||||
copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
|
||||
fi
|
||||
done
|
||||
|
||||
missing=0
|
||||
@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
|
||||
done
|
||||
[ "$missing" -eq 1 ] || return 0
|
||||
|
||||
# 2. Try apt package cache (may be empty if lb binary_memtest already purged)
|
||||
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
||||
[ -d "$root" ] || continue
|
||||
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
||||
@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
|
||||
break
|
||||
done
|
||||
|
||||
missing=0
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||
done
|
||||
[ "$missing" -eq 1 ] || return 0
|
||||
|
||||
# 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
|
||||
download_and_extract_memtest
|
||||
|
||||
missing=0
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
||||
@@ -88,7 +143,7 @@ ensure_memtest_binaries() {
|
||||
|
||||
ensure_grub_entry() {
|
||||
[ -f "$GRUB_CFG" ] || {
|
||||
fail_or_warn "missing ${GRUB_CFG}"
|
||||
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -114,7 +169,7 @@ EOF
|
||||
|
||||
ensure_isolinux_entry() {
|
||||
[ -f "$ISOLINUX_CFG" ] || {
|
||||
fail_or_warn "missing ${ISOLINUX_CFG}"
|
||||
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
|
||||
return 0
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||
# explicitly.
|
||||
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||
|
||||
@@ -3,6 +3,7 @@ dmidecode
|
||||
smartmontools
|
||||
nvme-cli
|
||||
pciutils
|
||||
rsync
|
||||
ipmitool
|
||||
util-linux
|
||||
e2fsprogs
|
||||
|
||||
14
iso/overlay/etc/systemd/system/bee-hpc-tuning.service
Normal file
14
iso/overlay/etc/systemd/system/bee-hpc-tuning.service
Normal file
@@ -0,0 +1,14 @@
|
||||
[Unit]
|
||||
Description=Bee: HPC tuning (CPU governor, C-states)
|
||||
After=local-fs.target
|
||||
Before=bee-nvidia.service bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-hpc-tuning.log /usr/local/bin/bee-hpc-tuning
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -10,6 +10,8 @@ RestartSec=3
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
LimitMEMLOCK=infinity
|
||||
# No MemoryMax: bee-web spawns GPU test subprocesses (dcgmproftester etc.)
|
||||
# that legitimately use several GB; a cgroup limit kills them via OOM.
|
||||
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
||||
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
||||
Nice=0
|
||||
|
||||
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
@@ -0,0 +1,110 @@
|
||||
#!/bin/sh
# bee-dcgmproftester-staggered — run dcgmproftester on several NVIDIA GPUs with
# staggered start times.
#
# One dcgmproftester worker is launched per selected GPU, STAGGER_SECONDS
# apart. Earlier workers are given a correspondingly longer duration so that
# all workers are scheduled to stop at about the same time. Each worker's
# output is captured in a temporary log and replayed with a "[gpu N]" prefix
# after the worker exits.
#
# Options:
#   --seconds N | -t N     base duration per GPU (default 300)
#   --stagger-seconds N    delay between worker starts (default 180)
#   --devices 0,1          GPU indices to use (default: all from nvidia-smi)
#   --exclude 2,3          GPU indices to skip
#
# Exit status: 0 when every worker succeeded, 1 when a worker or the setup
# failed, 2 on usage errors, 130/143 when interrupted by INT/TERM.
set -eu

# Deliberately not named SECONDS: bash and ksh treat SECONDS as a live timer
# that keeps counting after assignment, which would inflate the durations of
# later workers if this script were ever run by such a shell.
DURATION_SECONDS=300
STAGGER_SECONDS=180
DEVICES=""
EXCLUDE=""

# State used by cleanup(); defined up front because of "set -u".
TMP_DIR=""
WORKER_PIDS=""
SLEEP_PID=""

# Print the synopsis and exit with the usage-error status.
usage() {
    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}

# require_uint OPTION VALUE — reject anything that is not a plain non-negative
# integer, so bad input fails here instead of inside $(( )) later on.
require_uint() {
    case "$2" in
        ''|*[!0-9]*)
            echo "$0: $1 expects a non-negative integer, got '$2'" >&2
            usage
            ;;
    esac
}

# normalize_list CSV — strip whitespace, drop empty items, sort numerically,
# de-duplicate, and print the result as a comma-separated list.
normalize_list() {
    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}

# contains_csv NEEDLE CSV — succeed when NEEDLE is one of the items in CSV.
# Literal match via case; the needle is never interpreted as a pattern.
contains_csv() {
    needle="$1"
    haystack="${2:-}"
    case ",${haystack}," in
        *",${needle},"*) return 0 ;;
        *) return 1 ;;
    esac
}

# Print the path of the first dcgmproftester variant found in PATH.
resolve_dcgmproftester() {
    for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
        if command -v "${candidate}" >/dev/null 2>&1; then
            command -v "${candidate}"
            return 0
        fi
    done
    return 1
}

# forget_worker PID — drop a reaped worker from the kill list so cleanup()
# never signals a PID that may since have been reused.
forget_worker() {
    remaining_pids=""
    for known_pid in ${WORKER_PIDS}; do
        [ "${known_pid}" = "$1" ] || remaining_pids="${remaining_pids} ${known_pid}"
    done
    WORKER_PIDS="${remaining_pids}"
}

# Stop any still-running workers (and a pending stagger sleep) and remove the
# temporary log directory. Runs from the EXIT trap on every exit path.
cleanup() {
    for live_pid in ${WORKER_PIDS} ${SLEEP_PID}; do
        kill "${live_pid}" 2>/dev/null || true
    done
    if [ -n "${TMP_DIR}" ]; then
        rm -rf "${TMP_DIR}"
    fi
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SECONDS="$2"; shift 2 ;;
        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

require_uint --seconds "${DURATION_SECONDS}"
require_uint --stagger-seconds "${STAGGER_SECONDS}"

PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

# FINAL = SELECTED minus EXCLUDE, order preserved.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

echo "loader=dcgmproftester-staggered"
echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"

# The indices above come from nvidia-smi, which enumerates in PCI bus order.
# CUDA's default enumeration is "fastest first", so pin the order to keep
# CUDA_VISIBLE_DEVICES=<index> pointing at the same physical GPU
# (bee-gpu-burn does the same).
export CUDA_DEVICE_ORDER="PCI_BUS_ID"

# Background jobs of a non-interactive shell ignore SIGINT and are not stopped
# when the script exits, so the EXIT trap terminates them explicitly; INT/TERM
# are turned into a normal exit so that trap runs.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

TMP_DIR=$(mktemp -d)

GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
gpu_pos=0
WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    # Workers started earlier run longer by the stagger still to come.
    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( DURATION_SECONDS + extra_sec ))
    echo "starting gpu ${id} seconds=${gpu_seconds}"
    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
    WORKER_PIDS="${WORKER_PIDS} ${pid}"
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
        # Sleep in the background and wait on it: "wait" returns as soon as a
        # trapped signal arrives, whereas a foreground sleep would delay the
        # trap until the whole stagger interval has elapsed.
        sleep "${STAGGER_SECONDS}" &
        SLEEP_PID=$!
        wait "${SLEEP_PID}" || true
        SLEEP_PID=""
    fi
done

status=0
for spec in ${WORKERS}; do
    # spec is "pid:gpu-id:logfile"
    pid=${spec%%:*}
    rest=${spec#*:}
    id=${rest%%:*}
    log=${rest#*:}
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    forget_worker "${pid}"
    sed "s/^/[gpu ${id}] /" "${log}" || true
done

exit "${status}"
|
||||
29
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
29
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
@@ -2,13 +2,17 @@
|
||||
set -eu
|
||||
|
||||
SECONDS=5
|
||||
STAGGER_SECONDS=0
|
||||
SIZE_MB=0
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
PRECISION=""
|
||||
PRECISION_PLAN=""
|
||||
PRECISION_PLAN_SECONDS=""
|
||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
@@ -25,9 +29,13 @@ contains_csv() {
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
--precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
|
||||
--precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;;
|
||||
--precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;;
|
||||
*) usage ;;
|
||||
esac
|
||||
done
|
||||
@@ -61,14 +69,18 @@ done
|
||||
|
||||
echo "loader=bee-gpu-burn"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||
|
||||
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||
gpu_pos=0
|
||||
WORKERS=""
|
||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_pos=$((gpu_pos + 1))
|
||||
log="${TMP_DIR}/gpu-${id}.log"
|
||||
gpu_size_mb="${SIZE_MB}"
|
||||
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
||||
@@ -79,11 +91,22 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_size_mb=512
|
||||
fi
|
||||
fi
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
||||
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||
precision_arg=""
|
||||
[ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
|
||||
precision_plan_arg=""
|
||||
[ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}"
|
||||
precision_plan_seconds_arg=""
|
||||
[ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}"
|
||||
CUDA_VISIBLE_DEVICES="${id}" \
|
||||
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 &
|
||||
pid=$!
|
||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||
sleep "${STAGGER_SECONDS}"
|
||||
fi
|
||||
done
|
||||
|
||||
status=0
|
||||
|
||||
41
iso/overlay/usr/local/bin/bee-hpc-tuning
Normal file
41
iso/overlay/usr/local/bin/bee-hpc-tuning
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/bin/sh
# bee-hpc-tuning — apply HPC tuning for deterministic benchmarking.
# Invoked at boot by bee-hpc-tuning.service. Everything is reported through
# log(); the script itself does not abort on individual failures.

# Prefix every message with the tool name.
log() { echo "[bee-hpc-tuning] $*"; }

# ── CPU governor ─────────────────────────────────────────────────────────────
# Switch every CPU that exposes cpufreq to the "performance" governor by
# writing straight to sysfs (cpupower is not available in the image).
governor_ok=0
governor_fail=0
for gov_path in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    if [ ! -f "$gov_path" ]; then
        continue
    fi
    if echo performance > "$gov_path" 2>/dev/null; then
        governor_ok=$((governor_ok + 1))
    else
        governor_fail=$((governor_fail + 1))
    fi
done

# Summarise: all good / partial / all failed / nothing to tune.
if [ "$governor_ok" -gt 0 ]; then
    if [ "$governor_fail" -eq 0 ]; then
        log "CPU governor set to performance on ${governor_ok} core(s)"
    else
        log "WARN: CPU governor: ${governor_ok} OK, ${governor_fail} failed"
    fi
elif [ "$governor_fail" -gt 0 ]; then
    log "WARN: failed to set CPU governor on ${governor_fail} core(s)"
else
    log "WARN: no cpufreq scaling_governor paths found (C-state governor or HW-controlled)"
fi

# ── Transparent Huge Pages ───────────────────────────────────────────────────
# transparent_hugepage=always is requested on the kernel command line; this
# only reads back and logs the effective setting.
thp_path=/sys/kernel/mm/transparent_hugepage/enabled
if [ ! -f "$thp_path" ]; then
    log "WARN: transparent_hugepage sysfs path not found"
else
    current=$(cat "$thp_path" 2>/dev/null)
    log "transparent_hugepage: ${current}"
fi

log "done"
|
||||
@@ -65,6 +65,9 @@ done
|
||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
||||
echo " The live medium may have been disconnected." >&2
|
||||
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
||||
echo " Then re-run bee-install." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -162,10 +165,59 @@ log " Mounted."
|
||||
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
||||
log " Source: $SQUASHFS"
|
||||
log " Target: $MOUNT_ROOT"
|
||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
||||
grep -E '^\[|^inod|^created|^extract' | \
|
||||
while read -r line; do log " $line"; done || true
|
||||
log " Unpack complete."
|
||||
|
||||
# unsquashfs does not support resume, so retry the entire unpack step if the
# source medium disappears mid-copy (e.g. CD physically disconnected).
#
# Each attempt is accepted only when all of these hold:
#   - the squashfs image is still reachable afterwards,
#   - unsquashfs did not end with a fatal status,
#   - the target contains /etc (basic sanity check of the unpacked root).
# After UNPACK_MAX attempts the installer gives up via die.
UNPACK_ATTEMPTS=0
UNPACK_MAX=5
# unsquashfs runs at the head of a pipeline, so its exit status is not visible
# through $? (that reflects the trailing while loop). It is handed back through
# this scratch file instead.
UNPACK_RC_FILE=$(mktemp) || die "Cannot create temporary file for unpack status."
while true; do
    UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
    if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
        rm -f "$UNPACK_RC_FILE"
        die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
    fi
    [ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."

    # Re-check squashfs is reachable before each attempt
    if [ ! -f "$SQUASHFS" ]; then
        log " SOURCE LOST: $SQUASHFS not found."
        log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
        log " then press Enter here to retry."
        read -r _
        continue
    fi

    # wipe partial unpack so unsquashfs starts clean
    if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
        log " Cleaning partial unpack from $MOUNT_ROOT ..."
        # keep the mount point itself but remove its contents
        find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
    fi

    : > "$UNPACK_RC_FILE"
    {
        unpack_rc=0
        unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 || unpack_rc=$?
        echo "$unpack_rc" > "$UNPACK_RC_FILE"
    } | \
        grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
        while IFS= read -r line; do log " $line"; done || true

    # UNPACK_OK holds the real unsquashfs exit status. An empty or unreadable
    # status file means the producer died before reporting; treat as fatal.
    UNPACK_OK=$(cat "$UNPACK_RC_FILE" 2>/dev/null) || UNPACK_OK=""
    case "$UNPACK_OK" in
        ''|*[!0-9]*) UNPACK_OK=1 ;;
    esac

    # Check squashfs is still reachable (gone = disc pulled during copy)
    if [ ! -f "$SQUASHFS" ]; then
        log " WARNING: source medium lost during unpack — will retry after remount."
        log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
        read -r _
        continue
    fi

    # Status 2 is unsquashfs' "non-fatal errors, extraction continued" code and
    # is tolerated with a warning. Any other non-zero status means the unpack
    # was aborted, so the tree is incomplete even if /etc already exists.
    if [ "$UNPACK_OK" -ne 0 ] && [ "$UNPACK_OK" -ne 2 ]; then
        log " WARNING: unsquashfs exited with status $UNPACK_OK — unpack is incomplete."
        if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
            log " Retrying in 5 s ..."
            sleep 5
        fi
        continue
    fi
    if [ "$UNPACK_OK" -eq 2 ]; then
        log " WARNING: unsquashfs reported non-fatal errors (status 2); continuing."
    fi

    # Verify the unpack produced a usable root (presence of /etc is a basic check)
    if [ -d "${MOUNT_ROOT}/etc" ]; then
        log " Unpack complete."
        break
    else
        log " WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
        if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
            log " Retrying in 5 s ..."
            sleep 5
        fi
    fi
done
rm -f "$UNPACK_RC_FILE"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
log "--- Step 6/7: Configuring installed system ---"
|
||||
|
||||
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
@@ -2,6 +2,7 @@
|
||||
set -eu
|
||||
|
||||
DURATION_SEC=300
|
||||
STAGGER_SECONDS=0
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
FORMAT=""
|
||||
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||
@@ -170,6 +172,7 @@ done
|
||||
echo "loader=john"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "john_devices=${JOHN_DEVICES}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
cd "${JOHN_DIR}"
|
||||
|
||||
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
|
||||
echo "format=${CHOSEN_FORMAT}"
|
||||
echo "target_seconds=${DURATION_SEC}"
|
||||
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
||||
TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||
_first=1
|
||||
pos=0
|
||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||
pos=$((pos + 1))
|
||||
[ "${_first}" = "1" ] || sleep 3
|
||||
_first=0
|
||||
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
||||
extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
|
||||
deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
|
||||
run_john_loop "${opencl_id}" "${deadline}" &
|
||||
pid=$!
|
||||
PIDS="${PIDS} ${pid}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
|
||||
sleep "${STAGGER_SECONDS}"
|
||||
fi
|
||||
done
|
||||
FAIL=0
|
||||
for pid in ${PIDS}; do
|
||||
|
||||
@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {
|
||||
|
||||
log "kernel: $(uname -r)"
|
||||
|
||||
# Skip if no NVIDIA GPU present (PCI vendor 10de)
|
||||
if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
|
||||
# Skip if no NVIDIA display/compute GPU is present.
|
||||
# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
|
||||
# Succeed (exit 0) when lspci lists at least one PCI function whose class is
# 0300 (VGA) or 0302 (3D controller) and whose vendor ID is 10de (NVIDIA).
# Other 10de functions (audio, USB, bridges) do not count as a GPU.
have_nvidia_gpu() {
    lspci -Dn 2>/dev/null \
        | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
}
|
||||
|
||||
if ! have_nvidia_gpu; then
|
||||
log "no NVIDIA GPU detected — skipping module load"
|
||||
exit 0
|
||||
fi
|
||||
@@ -253,6 +258,22 @@ else
|
||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||
fi
|
||||
|
||||
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
|
||||
# systems CUDA/DCGM can report "system not yet initialized" until fabric
|
||||
# training completes under nvidia-fabricmanager.
|
||||
if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
|
||||
if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||
log "nvidia-fabricmanager restarted"
|
||||
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||
log "nvidia-fabricmanager started"
|
||||
else
|
||||
log "WARN: failed to start nvidia-fabricmanager.service"
|
||||
systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
|
||||
fi
|
||||
else
|
||||
log "WARN: nvidia-fabricmanager.service not installed"
|
||||
fi
|
||||
|
||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||
|
||||
178
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
178
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
@@ -0,0 +1,178 @@
|
||||
#!/bin/sh
|
||||
# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
|
||||
|
||||
set -u
|
||||
|
||||
# Print all arguments on one stdout line, prefixed with the script tag.
log() {
    echo "[bee-nvidia-recover]" "$*"
}
|
||||
|
||||
# Report something that was holding the GPU (a service or a process line).
# Same stdout format as log, with an extra "blocker:" marker.
log_blocker() {
    echo "[bee-nvidia-recover]" "blocker:" "$*"
}
|
||||
|
||||
# Print the command-line synopsis to stdout.
# Callers redirect it to stderr themselves (usage >&2) before exiting with 2.
usage() {
cat <<'EOF'
usage:
bee-nvidia-recover restart-drivers
bee-nvidia-recover reset-gpu <index>
EOF
}
|
||||
|
||||
# Return systemctl's status for "systemctl cat <unit>": zero when systemd
# can show the unit file for $1, non-zero otherwise. All output is discarded.
unit_exists() {
    systemctl cat "$1" >/dev/null 2>&1
    return $?
}
|
||||
|
||||
# Return zero when systemd reports unit $1 as active, non-zero otherwise.
# Nothing is printed.
unit_is_active() {
    systemctl is-active --quiet "$1" 2>/dev/null
    return $?
}
|
||||
|
||||
# Stop unit $1 if it is currently active.
# Returns 0 when the unit was active (a stop was requested, whatever its
# outcome) and 1 when it was not active. Callers use the status to remember
# which units to start again later.
stop_unit_if_active() {
    unit="$1"

    # Nothing to do for an inactive unit.
    unit_is_active "$unit" || return 1

    log "stopping $unit"
    systemctl stop "$unit"
    return 0
}
|
||||
|
||||
# Start unit $1 when marker $2 equals "1" and the unit exists.
# Returns 0 when nothing had to be started, otherwise the status of
# "systemctl start".
start_unit_if_marked() {
    unit="$1"
    marker="$2"

    [ "$marker" = "1" ] || return 0
    unit_exists "$unit" || return 0

    log "starting $unit"
    systemctl start "$unit"
}
|
||||
|
||||
# Poll once per second until no process named exactly $1 is left.
# Gives up on the 15th check that still finds the process: logs a warning
# and returns 1. Returns 0 as soon as the process is gone.
wait_for_process_exit() {
    name="$1"
    tries=0

    until ! pgrep -x "$name" >/dev/null 2>&1; do
        tries=$((tries + 1))
        if [ "$tries" -ge 15 ]; then
            log "WARN: $name is still running after stop request"
            return 1
        fi
        sleep 1
    done

    return 0
}
|
||||
|
||||
# Terminate every process whose full command line matches pattern $1.
# Each match is first reported as a blocker, then sent SIGTERM and, one
# second later, SIGKILL. Does nothing when no process matches.
kill_pattern() {
    pattern="$1"

    # Fast exit when there is nothing to kill.
    pgrep -f "$pattern" >/dev/null 2>&1 || return 0

    pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
        if [ -n "$line" ]; then
            log_blocker "$line"
        fi
    done

    log "killing processes matching: $pattern"
    pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
    sleep 1
    pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
}
|
||||
|
||||
# Stop everything known to hold the NVIDIA devices open so that a GPU reset
# or a driver unload can proceed.
#
# Sets two globals that restore_gpu_clients reads afterwards:
#   display_was_active  1 if a display manager unit was active and stopped
#   fabric_was_active   1 if nvidia-fabricmanager was active and stopped
#
# Every service or process found is reported through log_blocker.
drain_gpu_clients() {
    display_was_active=0
    fabric_was_active=0

    for unit in display-manager.service lightdm.service; do
        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
            log_blocker "service $unit"
            display_was_active=1
        fi
    done

    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
        log_blocker "service nvidia-fabricmanager.service"
        fabric_was_active=1
    fi

    if pgrep -x nv-hostengine >/dev/null 2>&1; then
        # List with the same exact-name match (-x) as the guard above. Matching
        # the full command line against "^nv-hostengine$" missed daemons that
        # were started by absolute path or with arguments.
        pgrep -a -x nv-hostengine 2>/dev/null | while IFS= read -r line; do
            [ -n "$line" ] || continue
            log_blocker "$line"
        done
        log "stopping nv-hostengine"
        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
        # Escalate to SIGKILL only if the daemon ignores SIGTERM.
        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
    fi

    # Remaining GPU users: monitoring tools, stress tests and X servers.
    for pattern in \
        "nvidia-smi" \
        "dcgmi" \
        "nvvs" \
        "dcgmproftester" \
        "all_reduce_perf" \
        "nvtop" \
        "bee-gpu-burn" \
        "bee-john-gpu-stress" \
        "bee-nccl-gpu-stress" \
        "Xorg" \
        "Xwayland"; do
        kill_pattern "$pattern"
    done
}
|
||||
|
||||
# Bring back what drain_gpu_clients stopped: re-enable persistence mode,
# start nv-hostengine if the binary exists and it is not running, then
# restart the fabric manager and display manager units that were active
# before the drain (per fabric_was_active / display_was_active; both are
# treated as 0 when unset).
restore_gpu_clients() {
    if command -v nvidia-smi >/dev/null 2>&1; then
        if ! nvidia-smi -pm 1 >/dev/null 2>&1; then
            log "WARN: failed to enable NVIDIA persistence mode"
        else
            log "enabled NVIDIA persistence mode"
        fi
    fi

    if command -v nv-hostengine >/dev/null 2>&1; then
        if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
            log "starting nv-hostengine"
            nv-hostengine
        fi
    fi

    start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
    start_unit_if_marked display-manager.service "${display_was_active:-0}"

    # lightdm is started explicitly only if it is still down after the
    # display-manager.service start above.
    if [ "${display_was_active:-0}" = "1" ]; then
        if unit_exists lightdm.service && ! unit_is_active lightdm.service; then
            start_unit_if_marked lightdm.service "1"
        fi
    fi
}
|
||||
|
||||
# Drain GPU clients, unload the NVIDIA kernel modules, remove the stale
# device nodes, reload the stack via bee-nvidia-load and restore the clients.
#
# Returns 0 only when every step succeeded. A failed rmmod, a failed
# bee-nvidia-load or a failed restore is logged and makes the function (and
# therefore the script) return 1, so callers can tell a failed recovery from
# a successful one. The remaining steps still run so the system is left in
# the best state reachable.
restart_drivers() {
    restart_rc=0

    drain_gpu_clients

    # Unload order: dependants first, the core nvidia module last.
    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
        if lsmod | awk '{print $1}' | grep -qx "$mod"; then
            log "unloading module $mod"
            if ! rmmod "$mod"; then
                log "WARN: failed to unload module $mod"
                restart_rc=1
            fi
        fi
    done

    rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true

    log "reloading NVIDIA driver stack"
    if ! /usr/local/bin/bee-nvidia-load; then
        log "WARN: bee-nvidia-load failed"
        restart_rc=1
    fi

    restore_gpu_clients || restart_rc=1
    return "$restart_rc"
}
|
||||
|
||||
# Drain GPU clients, reset GPU $1 with "nvidia-smi -r -i", then restore the
# clients.
#
# Returns 0 only when both the reset and the restore succeeded. A failed
# reset is logged and reported as status 1 instead of being hidden behind
# the status of the restore step; clients are restored either way.
reset_gpu() {
    index="$1"
    reset_rc=0

    drain_gpu_clients

    log "resetting GPU $index"
    if ! nvidia-smi -r -i "$index"; then
        log "WARN: nvidia-smi failed to reset GPU $index"
        reset_rc=1
    fi

    restore_gpu_clients || reset_rc=1
    return "$reset_rc"
}
|
||||
|
||||
# Entry point: dispatch on the first argument.
#   restart-drivers     reload the whole NVIDIA driver stack
#   reset-gpu <index>   reset one GPU; exactly one extra argument is required
# Anything else prints the usage text to stderr and exits with status 2.
cmd="${1:-}"

if [ "$cmd" = "restart-drivers" ]; then
    restart_drivers
elif [ "$cmd" = "reset-gpu" ]; then
    if [ "$#" -ne 2 ]; then
        usage >&2
        exit 2
    fi
    reset_gpu "$2"
else
    usage >&2
    exit 2
fi
|
||||
@@ -9,9 +9,9 @@ xset s noblank
|
||||
|
||||
# Set desktop background.
|
||||
if [ -f /usr/share/bee/wallpaper.png ]; then
|
||||
feh --bg-fill /usr/share/bee/wallpaper.png
|
||||
feh --bg-center --image-bg '#000000' /usr/share/bee/wallpaper.png
|
||||
else
|
||||
xsetroot -solid '#f6c90e'
|
||||
xsetroot -solid '#000000'
|
||||
fi
|
||||
|
||||
tint2 &
|
||||
|
||||
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/bin/bash
|
||||
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
|
||||
#
|
||||
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
|
||||
# was lost and /run/live/medium/live/filesystem.squashfs is missing.
|
||||
#
|
||||
# Usage: bee-remount-medium [--wait]
|
||||
# --wait keep retrying every 5 seconds until the medium is found (useful
|
||||
# while physically reconnecting the device)
|
||||
|
||||
set -euo pipefail

# Where the live medium is expected to be mounted, and the file inside it
# that proves a candidate device really is the live medium.
MEDIUM_DIR="/run/live/medium"
SQUASHFS_REL="live/filesystem.squashfs"
# 1 = keep rescanning until the medium shows up (--wait).
WAIT_MODE=0

# Parse command-line flags. Unknown arguments are rejected so that a typo
# such as "--wiat" does not silently fall back to a single one-shot scan.
for arg in "$@"; do
    case "$arg" in
        --wait|-w) WAIT_MODE=1 ;;
        --help|-h)
            echo "Usage: bee-remount-medium [--wait]"
            echo " Finds and remounts the live ISO medium to $MEDIUM_DIR"
            echo " --wait retry every 5 s until a medium with squashfs is found"
            exit 0 ;;
        *)
            echo "bee-remount-medium: unknown argument: $arg" >&2
            echo "Usage: bee-remount-medium [--wait]" >&2
            exit 2 ;;
    esac
done
|
||||
|
||||
# Print a message to stdout prefixed with the current wall-clock time.
log() {
    printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}

# Print an error message (in log format) to stderr and exit with status 1.
die() {
    log "ERROR: $*" >&2
    exit 1
}
|
||||
|
||||
# Return all candidate block devices (optical + removable USB mass storage)
|
||||
# Print, one per line, the block devices that might hold the live medium:
# every optical drive node, plus every sd*/vd* disk or partition whose parent
# disk is flagged removable in sysfs.
#
# Always returns 0. An empty result is expressed as empty output, so callers
# running under "set -e" are not affected by which device happened to be
# scanned last.
find_candidates() {
    local dev base removable

    # CD/DVD drives
    for dev in /dev/sr* /dev/scd*; do
        # An unmatched glob stays literal and fails the -b test.
        if [ -b "$dev" ]; then
            echo "$dev"
        fi
    done

    # USB/removable disks and their partitions
    for dev in /dev/sd* /dev/vd*; do
        [ -b "$dev" ] || continue
        base=${dev##*/}
        # Strip the partition number (sdb1 -> sdb) to find the parent disk's
        # sysfs entry; a missing attribute counts as non-removable.
        removable=$(cat "/sys/block/${base%%[0-9]*}/removable" 2>/dev/null || echo 0)
        if [ "$removable" = "1" ]; then
            echo "$dev"
        fi
    done

    return 0
}
|
||||
|
||||
# Try to mount $1 to $MEDIUM_DIR and check for squashfs
|
||||
# Probe device $1 and, if it carries the live squashfs, mount it read-only
# on $MEDIUM_DIR.
#
# The device is first mounted on a throw-away directory to look for
# $SQUASHFS_REL. Only when the file is present is whatever currently sits on
# $MEDIUM_DIR unmounted (best effort) and the device mounted there.
#
# Returns 0 when the device ends up mounted on $MEDIUM_DIR, 1 otherwise.
try_mount() {
    local dev="$1"
    local tmpdir
    local has_squashfs=0

    tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)

    # Device not mountable at all: clean up the probe directory and give up.
    if ! mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
        rmdir "$tmpdir" 2>/dev/null || true
        return 1
    fi

    if [ -f "${tmpdir}/${SQUASHFS_REL}" ]; then
        has_squashfs=1
    fi

    # The probe mount is no longer needed in either case.
    umount "$tmpdir" 2>/dev/null || true
    rmdir "$tmpdir" 2>/dev/null || true

    if [ "$has_squashfs" != "1" ]; then
        return 1
    fi

    # Unmount whatever is currently on MEDIUM_DIR (may be empty/stale)
    umount "$MEDIUM_DIR" 2>/dev/null || true
    mkdir -p "$MEDIUM_DIR"

    if ! mount -o ro "$dev" "$MEDIUM_DIR"; then
        log "Mount of $dev on $MEDIUM_DIR failed"
        return 1
    fi

    log "Mounted $dev on $MEDIUM_DIR"
    return 0
}
|
||||
|
||||
# Run one scan: try every candidate device in turn and stop at the first one
# that try_mount manages to mount on $MEDIUM_DIR.
# Returns 0 on success (after logging the squashfs path and size), 1 when no
# candidate worked.
attempt() {
    local sq

    log "Scanning for ISO medium..."
    for dev in $(find_candidates); do
        log " Trying $dev ..."
        try_mount "$dev" || continue

        sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
        log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
        return 0
    done

    return 1
}
|
||||
|
||||
# Entry point. Without --wait a single scan is made and failure is fatal;
# with --wait the scan repeats every 5 seconds until a medium is mounted.
if [ "$WAIT_MODE" != "1" ]; then
    attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
else
    log "Waiting for live medium (press Ctrl+C to abort)..."
    until attempt; do
        log " Not found — retrying in 5 s (reconnect the disc now)"
        sleep 5
    done
    exit 0
fi
|
||||
@@ -14,7 +14,7 @@ log() {
|
||||
}
|
||||
|
||||
have_nvidia_gpu() {
|
||||
lspci -nn 2>/dev/null | grep -qi '10de:'
|
||||
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||
}
|
||||
|
||||
service_active() {
|
||||
|
||||
BIN
iso/overlay/usr/share/bee/wallpaper.png
Normal file
BIN
iso/overlay/usr/share/bee/wallpaper.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
64
scripts/deploy.sh
Executable file
64
scripts/deploy.sh
Executable file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env bash
# Build the bee binary for linux/amd64, copy it to a remote host over SSH,
# install it as $REMOTE_BIN and restart the bee services.
#
# Usage: scripts/deploy.sh [host]
#   host  address of the target machine; prompted for when omitted.
#
# The relative paths below (audit/, audit/bee) are resolved against the
# current working directory.
set -euo pipefail

REMOTE_USER="bee"
REMOTE_BIN="/usr/local/bin/bee"
LOCAL_BIN="audit/bee"
SERVICES="bee-audit bee-web"

# --- IP ---
if [[ $# -ge 1 ]]; then
    HOST="$1"
else
    read -rp "IP адрес хоста: " HOST
fi
if [[ -z "$HOST" ]]; then
    # Errors belong on stderr so they are not lost when stdout is piped.
    echo "Ошибка: IP не указан" >&2
    exit 1
fi

# --- SSH options ---
SSH_OPTS=(-o StrictHostKeyChecking=no -o ConnectTimeout=10)

# Check whether a password is needed: a BatchMode connection succeeds only
# when key-based authentication already works.
if ! ssh "${SSH_OPTS[@]}" -o BatchMode=yes "${REMOTE_USER}@${HOST}" true 2>/dev/null; then
    if command -v sshpass &>/dev/null; then
        read -rsp "Пароль для ${REMOTE_USER}@${HOST}: " SSHPASS
        echo
        # Hand the password to sshpass through the environment (-e). With
        # "-p <password>" it would be visible in the process list.
        export SSHPASS
        SSH_CMD=(sshpass -e ssh "${SSH_OPTS[@]}")
        SCP_CMD=(sshpass -e scp "${SSH_OPTS[@]}")
    else
        echo "sshpass не установлен. Введите пароль вручную при запросе (или установите SSH-ключ)."
        SSH_CMD=(ssh "${SSH_OPTS[@]}")
        SCP_CMD=(scp "${SSH_OPTS[@]}")
    fi
else
    SSH_CMD=(ssh "${SSH_OPTS[@]}")
    SCP_CMD=(scp "${SSH_OPTS[@]}")
fi

REMOTE="${REMOTE_USER}@${HOST}"

# --- Build ---
echo "==> Сборка бинарника..."
(
    cd audit
    # Fall back to "dev" when the version cannot be resolved.
    VERSION=$(sh ./scripts/resolve-version.sh 2>/dev/null || echo "dev")
    CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
        go build -ldflags "-X main.Version=${VERSION}" -o bee ./cmd/bee
)
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"

# --- Deploy ---
echo "==> Копирование на ${REMOTE}..."
"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"

echo "==> Замена бинарника и перезапуск сервисов..."
# The heredoc is unquoted on purpose: ${REMOTE_BIN} and ${SERVICES} are
# expanded locally before the script is sent to the remote shell.
"${SSH_CMD[@]}" "$REMOTE" bash -s <<EOF
set -e
sudo mv /tmp/bee-new ${REMOTE_BIN}
sudo chmod +x ${REMOTE_BIN}
sudo systemctl restart ${SERVICES}
sleep 2
systemctl status ${SERVICES} --no-pager -l
EOF

echo "==> Готово."
|
||||
Reference in New Issue
Block a user