Compare commits
58 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 51b721aeb3 | |||
| bac89bb6e5 | |||
| 7a618da1f9 | |||
| 64ae1c0ff0 | |||
| 49050ca717 | |||
| 5ba72ab315 | |||
| 63363e9629 | |||
|
|
5285c0d101 | ||
|
|
dca4afb8d0 | ||
|
|
b4280941f5 | ||
|
|
f74976ec4c | ||
|
|
18e24a9aa5 | ||
|
|
e306250da7 | ||
|
|
c5b2081ac9 | ||
| 434528083e | |||
| 30aa30cd67 | |||
| 4f76e1de21 | |||
| 3732e64a4a | |||
| 0d925299ff | |||
| a8d5e019a5 | |||
| 72ec086568 | |||
| 7a0b0934df | |||
| d8ca0dca2c | |||
| d90250f80a | |||
| 8d6eaef5de | |||
| 732bf4cbab | |||
| fa6d905a10 | |||
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 | ||
|
|
0317dc58fd | ||
|
|
1c5cb45698 | ||
|
|
090b92ca73 | ||
|
|
2dccbc010c | ||
| e84c69d360 | |||
| c80a39e7ac | |||
| a5e0261ff2 | |||
| ee422ede3c | |||
| d560b2fead | |||
| 3cf2e9c9dc | |||
| 19dbabd71d | |||
| a6a07f2626 | |||
| f87461ee4a | |||
| a636146dbd | |||
|
|
303de2df04 | ||
|
|
95124d228f | ||
|
|
54338dbae5 | ||
|
|
2be7ae6d28 | ||
|
|
b1a5035edd | ||
|
|
8fc986c933 | ||
|
|
88b5e0edf2 | ||
|
|
82fe1f6d26 | ||
| 81e7c921f8 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@
|
||||
.DS_Store
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
|
||||
@@ -5,22 +5,18 @@ go 1.25.0
|
||||
replace reanimator/chart => ../internal/chart
|
||||
|
||||
require (
|
||||
github.com/go-analyze/charts v0.5.26
|
||||
modernc.org/sqlite v1.48.0
|
||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
golang.org/x/image v0.24.0 // indirect
|
||||
golang.org/x/sys v0.42.0 // indirect
|
||||
modernc.org/libc v1.70.0 // indirect
|
||||
modernc.org/libc v1.72.0 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
modernc.org/sqlite v1.48.0 // indirect
|
||||
)
|
||||
|
||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||
|
||||
@@ -30,7 +30,9 @@ var (
|
||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
|
||||
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
||||
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
||||
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
||||
)
|
||||
|
||||
type App struct {
|
||||
@@ -84,6 +86,7 @@ type installer interface {
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
LiveBootSource() platform.LiveBootSource
|
||||
LiveMediaRAMState() platform.LiveMediaRAMState
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
@@ -108,6 +111,10 @@ func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||
return a.installer.LiveBootSource()
|
||||
}
|
||||
|
||||
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
|
||||
return a.installer.LiveMediaRAMState()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
@@ -117,6 +124,7 @@ type satRunner interface {
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
@@ -138,7 +146,7 @@ type satRunner interface {
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -562,11 +570,18 @@ func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOp
|
||||
|
||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBenchmarkBaseDir
|
||||
baseDir = DefaultBeeBenchPerfDir
|
||||
}
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
@@ -729,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
|
||||
@@ -122,11 +122,13 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||
runNCCLFn func(string, []int) (string, error)
|
||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
@@ -154,6 +156,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPowerBenchFn != nil {
|
||||
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaTargetedStressFn != nil {
|
||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||
@@ -279,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNCCLFn != nil {
|
||||
return f.runNCCLFn(baseDir, gpuIndices)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotBaseDir string
|
||||
var gotGPUIndices []int
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||
gotBaseDir = baseDir
|
||||
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||
return "/tmp/nccl-tests.tar.gz", nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("RunNCCLTests error: %v", err)
|
||||
}
|
||||
if path != "/tmp/nccl-tests.tar.gz" {
|
||||
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||
}
|
||||
if gotBaseDir != "/tmp/sat" {
|
||||
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||
}
|
||||
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -22,6 +22,8 @@ var supportBundleServices = []string{
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
"nvidia-dcgm.service",
|
||||
"nvidia-fabricmanager.service",
|
||||
}
|
||||
|
||||
var supportBundleCommands = []struct {
|
||||
@@ -48,6 +50,43 @@ else
|
||||
fi
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
nvidia-smi topo -m 2>&1 || true
|
||||
else
|
||||
echo "nvidia-smi not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v systemctl >/dev/null 2>&1; then
|
||||
echo "systemctl not found"
|
||||
exit 0
|
||||
fi
|
||||
echo "=== unit files ==="
|
||||
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== active units ==="
|
||||
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== failed units ==="
|
||||
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||
`}},
|
||||
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||
for candidate in \
|
||||
/usr/bin/nvidia-fabricmanager \
|
||||
/usr/bin/nv-fabricmanager \
|
||||
/usr/bin/nvidia-fabricmanagerd \
|
||||
/usr/bin/nvlsm; do
|
||||
if [ -e "$candidate" ]; then
|
||||
echo "=== $candidate ==="
|
||||
ls -l "$candidate" 2>&1 || true
|
||||
echo
|
||||
fi
|
||||
done
|
||||
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||
echo "no fabric manager binaries found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
@@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -48,7 +48,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||
}
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
if result.RampStep > 0 && result.RampTotal > 0 {
|
||||
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||||
@@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
if result.ScalabilityScore > 0 {
|
||||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||
}
|
||||
if result.PlatformPowerScore > 0 {
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||||
}
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
b.WriteString("\n")
|
||||
|
||||
@@ -81,41 +84,92 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Methodology ───────────────────────────────────────────────────────────
|
||||
b.WriteString("## Methodology\n\n")
|
||||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
|
||||
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
|
||||
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
|
||||
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
|
||||
b.WriteString("**Compute score** is derived from two phases:\n\n")
|
||||
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
|
||||
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
|
||||
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
|
||||
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
|
||||
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
|
||||
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
|
||||
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
|
||||
b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
|
||||
b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
|
||||
b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
|
||||
// ── Balanced Scorecard ────────────────────────────────────────────────────
|
||||
b.WriteString("## Balanced Scorecard\n\n")
|
||||
|
||||
// ── Scorecard table ───────────────────────────────────────────────────────
|
||||
b.WriteString("## Scorecard\n\n")
|
||||
b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||||
b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||||
// Perspective 1: Compatibility — hard stops
|
||||
b.WriteString("### 1. Compatibility\n\n")
|
||||
b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
|
||||
b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown GPU"
|
||||
thermalThrottle := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
interconnect := "-"
|
||||
if gpu.Scores.InterconnectScore > 0 {
|
||||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||||
fanAtThrottle := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
ecc := "-"
|
||||
if gpu.ECC.Uncorrected > 0 {
|
||||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||
}
|
||||
compatStatus := "✓ OK"
|
||||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||
compatStatus = "⛔ HARD STOP"
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
||||
gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 2: Thermal headroom
|
||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
|
||||
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
}
|
||||
slowdownTemp := gpu.SlowdownTempC
|
||||
if slowdownTemp <= 0 {
|
||||
slowdownTemp = 80
|
||||
}
|
||||
headroom := gpu.Scores.TempHeadroomC
|
||||
thermalStatus := "✓ OK"
|
||||
switch {
|
||||
case headroom < 10:
|
||||
thermalStatus = "⛔ CRITICAL"
|
||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||
thermalStatus = "⚠ WARNING"
|
||||
}
|
||||
throttlePct := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
|
||||
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 3: Power delivery
|
||||
b.WriteString("### 3. Power Delivery\n\n")
|
||||
b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
|
||||
b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
powerCap := "-"
|
||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||
}
|
||||
fanDuty := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
powerStatus := "✓ OK"
|
||||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||
powerStatus = "⚠ POWER LIMITED"
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
|
||||
gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 4: Performance
|
||||
b.WriteString("### 4. Performance\n\n")
|
||||
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
|
||||
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
synthetic := "-"
|
||||
if gpu.Scores.SyntheticScore > 0 {
|
||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||
@@ -128,20 +182,41 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
if gpu.Scores.MixedEfficiency > 0 {
|
||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
|
||||
gpu.Index, name,
|
||||
gpu.Status,
|
||||
gpu.Scores.CompositeScore,
|
||||
gpu.Scores.ComputeScore,
|
||||
synthetic,
|
||||
mixed,
|
||||
mixedEff,
|
||||
topsPerSM,
|
||||
gpu.Scores.PowerSustainScore,
|
||||
gpu.Scores.ThermalSustainScore,
|
||||
gpu.Scores.StabilityScore,
|
||||
interconnect,
|
||||
)
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
|
||||
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
|
||||
}
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 5: Anomaly flags
|
||||
b.WriteString("### 5. Anomalies\n\n")
|
||||
b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
|
||||
b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
eccCorr := "-"
|
||||
if gpu.ECC.Corrected > 0 {
|
||||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||
}
|
||||
syncBoost := "-"
|
||||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||
}
|
||||
powerVar := "OK"
|
||||
if gpu.Scores.PowerSustainScore < 70 {
|
||||
powerVar = "⚠ unstable"
|
||||
}
|
||||
thermalVar := "OK"
|
||||
if gpu.Scores.ThermalSustainScore < 70 {
|
||||
thermalVar = "⚠ unstable"
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
||||
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
@@ -170,25 +245,39 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
if gpu.PowerLimitW > 0 {
|
||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||
}
|
||||
if gpu.PowerLimitDerated {
|
||||
fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
|
||||
}
|
||||
if gpu.CalibratedPeakPowerW > 0 {
|
||||
if gpu.CalibratedPeakTempC > 0 {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||||
} else {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||||
}
|
||||
}
|
||||
if gpu.LockedGraphicsClockMHz > 0 {
|
||||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Steady-state telemetry
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||
b.WriteString("\n")
|
||||
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||||
}
|
||||
|
||||
// Per-precision stability phases.
|
||||
if len(gpu.PrecisionSteady) > 0 {
|
||||
b.WriteString("**Per-precision stability:**\n\n")
|
||||
b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n")
|
||||
b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
|
||||
for _, p := range gpu.PrecisionSteady {
|
||||
eccCorr := "—"
|
||||
eccUncorr := "—"
|
||||
@@ -196,8 +285,12 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||||
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||||
}
|
||||
fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
||||
p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
||||
status := p.Status
|
||||
if strings.TrimSpace(status) == "" {
|
||||
status = "OK"
|
||||
}
|
||||
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
||||
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
||||
eccCorr, eccUncorr)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
@@ -290,6 +383,44 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||
if cooling := result.Cooling; cooling != nil {
|
||||
b.WriteString("## Cooling\n\n")
|
||||
if cooling.Available {
|
||||
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
||||
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
||||
if cooling.FanDutyCycleAvailable {
|
||||
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
||||
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
||||
} else {
|
||||
b.WriteString("| Average fan duty cycle | N/A |\n")
|
||||
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
||||
}
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||
}
|
||||
for _, note := range cooling.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(cooling.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Platform Scalability ──────────────────────────────────────────────────
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||
b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
|
||||
b.WriteString("|--------|-------------|----------------------|-------------|\n")
|
||||
for _, step := range result.PerformanceRampSteps {
|
||||
fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
|
||||
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||
b.WriteString("## Raw Files\n\n")
|
||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||
@@ -339,6 +470,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64)
|
||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||
|
||||
@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
{
|
||||
name: "default",
|
||||
profile: "",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "stability",
|
||||
profile: "stability",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "overnight",
|
||||
profile: "overnight",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -41,6 +41,129 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if len(labels) != 5 || len(phases) != 5 {
|
||||
t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
|
||||
}
|
||||
if basePhaseSec != 60 {
|
||||
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 300 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
|
||||
}
|
||||
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
|
||||
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 300 {
|
||||
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 3600 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 3600 {
|
||||
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 14400 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
phases := []benchmarkPlannedPhase{
|
||||
{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
|
||||
{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
|
||||
{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
|
||||
}
|
||||
rows := []GPUMetricRow{
|
||||
{ElapsedSec: 5},
|
||||
{ElapsedSec: 15},
|
||||
{ElapsedSec: 25},
|
||||
{ElapsedSec: 65},
|
||||
}
|
||||
got := splitBenchmarkRowsByPlannedPhase(rows, phases)
|
||||
if len(got["fp8"]) != 1 {
|
||||
t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
|
||||
}
|
||||
if len(got["fp16"]) != 1 {
|
||||
t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
|
||||
}
|
||||
if len(got["mixed"]) != 2 {
|
||||
t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
raw string
|
||||
wantStatus string
|
||||
}{
|
||||
{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
|
||||
{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
|
||||
{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
|
||||
if got != tc.wantStatus {
|
||||
t.Fatalf("status=%q want %q", got, tc.wantStatus)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -65,8 +188,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
"[gpu 0] compute_capability=9.0",
|
||||
"[gpu 0] backend=cublasLt",
|
||||
"[gpu 0] duration_s=10",
|
||||
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
|
||||
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||
"[gpu 0] int8_tensor_iterations=80",
|
||||
"[gpu 0] fp16_tensor_iterations=200",
|
||||
"[gpu 0] fp8_e4m3_iterations=50",
|
||||
"[gpu 0] status=OK",
|
||||
@@ -79,15 +204,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
if got.ComputeCapability != "9.0" {
|
||||
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||
}
|
||||
if len(got.Profiles) != 2 {
|
||||
t.Fatalf("profiles=%d want 2", len(got.Profiles))
|
||||
if len(got.Profiles) != 3 {
|
||||
t.Fatalf("profiles=%d want 3", len(got.Profiles))
|
||||
}
|
||||
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||
}
|
||||
if got.Profiles[0].Category != "fp16_bf16" {
|
||||
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
|
||||
}
|
||||
if got.Profiles[1].Category != "fp8" {
|
||||
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||
}
|
||||
if got.Profiles[2].Category != "int8" {
|
||||
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
|
||||
}
|
||||
if got.Profiles[2].Weight != 0.25 {
|
||||
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
@@ -131,6 +265,13 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
DegradationReasons: []string{"power_capped"},
|
||||
},
|
||||
},
|
||||
Cooling: &BenchmarkCoolingSummary{
|
||||
Available: true,
|
||||
AvgFanRPM: 9200,
|
||||
FanDutyCycleAvailable: true,
|
||||
AvgFanDutyCyclePct: 47.5,
|
||||
P95FanDutyCyclePct: 62.0,
|
||||
},
|
||||
}
|
||||
|
||||
report := renderBenchmarkReport(result)
|
||||
@@ -140,6 +281,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
"1176.00",
|
||||
"fp16_tensor",
|
||||
"700.00",
|
||||
"Cooling",
|
||||
"Average fan duty cycle",
|
||||
"47.5%",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
@@ -170,6 +314,30 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
|
||||
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
|
||||
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
|
||||
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
|
||||
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
PrecisionResults: []BenchmarkPrecisionResult{
|
||||
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
|
||||
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
})
|
||||
|
||||
if score.SyntheticScore != 100 {
|
||||
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
|
||||
}
|
||||
if score.MixedScore != 50 {
|
||||
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -25,12 +25,49 @@ type BenchmarkCPULoad struct {
|
||||
Note string `json:"note,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkCoolingSummary captures fan telemetry averaged across the full
|
||||
// benchmark run.
|
||||
type BenchmarkCoolingSummary struct {
|
||||
Available bool `json:"available"`
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
const (
|
||||
NvidiaBenchmarkProfileStandard = "standard"
|
||||
NvidiaBenchmarkProfileStability = "stability"
|
||||
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||
)
|
||||
|
||||
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
|
||||
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
|
||||
// re-measure from actual task logs and update the constants here.
|
||||
//
|
||||
// Sources:
|
||||
// - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
|
||||
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
||||
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
||||
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
||||
// - BenchmarkEstimatedPowerStabilitySec: xFusion v8.17/v8.22 ramp 1-8: 1977-2002 s
|
||||
const (
|
||||
// Performance Benchmark (bee-gpu-burn).
|
||||
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
||||
// Sequential per-GPU mode scales approximately linearly.
|
||||
BenchmarkEstimatedPerfStandardSec = 960 // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
|
||||
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
|
||||
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
|
||||
|
||||
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
||||
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||
BenchmarkEstimatedPowerStabilitySec = 2000 // ~33 min; stability profile converges faster (longer steady → faster convergence)
|
||||
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
||||
)
|
||||
|
||||
type NvidiaBenchmarkOptions struct {
|
||||
Profile string
|
||||
SizeMB int
|
||||
@@ -44,26 +81,32 @@ type NvidiaBenchmarkOptions struct {
|
||||
}
|
||||
|
||||
type NvidiaBenchmarkResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalization struct {
|
||||
@@ -92,13 +135,22 @@ type BenchmarkGPUResult struct {
|
||||
Backend string `json:"backend,omitempty"`
|
||||
Status string `json:"status"`
|
||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||
// Fallback: 80°C.
|
||||
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||
// dcgmi targeted_power calibration run before the main benchmark.
|
||||
// Used as the reference denominator for PowerSustainScore instead of
|
||||
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
|
||||
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
|
||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||
@@ -107,6 +159,7 @@ type BenchmarkGPUResult struct {
|
||||
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
|
||||
PrecisionFailures []string `json:"precision_failures,omitempty"`
|
||||
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||
// ECC error delta accumulated over the full benchmark (all phases combined).
|
||||
@@ -115,6 +168,9 @@ type BenchmarkGPUResult struct {
|
||||
Scores BenchmarkScorecard `json:"scores"`
|
||||
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning is non-empty when a thermal throttle event occurred with
|
||||
// a clock drop ≥20% while server fans were not at 100% duty cycle.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkTelemetrySummary struct {
|
||||
@@ -167,7 +223,7 @@ type BenchmarkPrecisionResult struct {
|
||||
Iterations uint64 `json:"iterations,omitempty"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
// Weight is the fp32-equivalence factor for this precision category.
|
||||
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
|
||||
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
|
||||
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||
Weight float64 `json:"weight,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
@@ -187,9 +243,30 @@ type BenchmarkScorecard struct {
|
||||
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// StabilityScore: fraction of steady-state time the GPU spent throttling
|
||||
// (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0.
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
|
||||
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||
|
||||
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||
// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
|
||||
// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
|
||||
TempHeadroomC float64 `json:"temp_headroom_c"`
|
||||
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
// ServerQualityScore (0–100) reflects server infrastructure quality independent
|
||||
// of GPU model. Combines throttle time, power variance, and temp variance.
|
||||
// Use this to compare servers with the same GPU, or to flag a bad server
|
||||
// that throttles an otherwise fast GPU.
|
||||
ServerQualityScore float64 `json:"server_quality_score"`
|
||||
// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
|
||||
// A throttling GPU will score lower here automatically — no quality multiplier.
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||
}
|
||||
@@ -213,13 +290,15 @@ type BenchmarkServerPower struct {
|
||||
// type runs at a time the PowerCVPct here is a genuine stability signal.
|
||||
type BenchmarkPrecisionSteadyPhase struct {
|
||||
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
|
||||
Status string `json:"status,omitempty"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
// ECC errors accumulated during this precision phase only.
|
||||
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
|
||||
// Any uncorrected = serious fault triggered by this precision workload.
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkInterconnectResult struct {
|
||||
@@ -233,3 +312,89 @@ type BenchmarkInterconnectResult struct {
|
||||
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||
// this server under full GPU load. Use for rack power planning.
|
||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||
// actual wall-power draw as seen by the server's power supply.
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchGPU struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||
// stably with all other GPUs running simultaneously at their own limits.
|
||||
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||
// additional derating.
|
||||
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// Telemetry holds the aggregated stats from the final converged calibration
|
||||
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||
NewGPUIndex int `json:"new_gpu_index"`
|
||||
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||
// scalability ramp-up phase of the performance benchmark.
|
||||
type NvidiaPerformanceRampStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||
ScalabilityPct float64 `json:"scalability_pct"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
@@ -13,15 +13,21 @@ import (
|
||||
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||
type GPUMetricRow struct {
|
||||
Stage string `json:"stage,omitempty"`
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||
Stage string `json:"stage,omitempty"`
|
||||
StageStartSec float64 `json:"stage_start_sec,omitempty"`
|
||||
StageEndSec float64 `json:"stage_end_sec,omitempty"`
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
@@ -142,10 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||
for _, r := range rows {
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
|
||||
dutyAvail := 0
|
||||
if r.FanDutyCycleAvailable {
|
||||
dutyAvail = 1
|
||||
}
|
||||
dutyEstimated := 0
|
||||
if r.FanDutyCycleEstimated {
|
||||
dutyEstimated = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||
}
|
||||
return os.WriteFile(path, b.Bytes(), 0644)
|
||||
}
|
||||
@@ -502,11 +516,22 @@ func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
|
||||
if name == "" {
|
||||
name = "run"
|
||||
}
|
||||
start := row.StageStartSec
|
||||
end := row.StageEndSec
|
||||
if end <= start {
|
||||
start = row.ElapsedSec
|
||||
end = row.ElapsedSec
|
||||
}
|
||||
if len(spans) == 0 || spans[len(spans)-1].Name != name {
|
||||
spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec})
|
||||
spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
|
||||
continue
|
||||
}
|
||||
spans[len(spans)-1].End = row.ElapsedSec
|
||||
if start < spans[len(spans)-1].Start {
|
||||
spans[len(spans)-1].Start = start
|
||||
}
|
||||
if end > spans[len(spans)-1].End {
|
||||
spans[len(spans)-1].End = end
|
||||
}
|
||||
}
|
||||
for i := range spans {
|
||||
if spans[i].End <= spans[i].Start {
|
||||
|
||||
@@ -11,20 +11,10 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
const installToRAMDir = "/dev/shm/bee-live"
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
if fsType == "" {
|
||||
// No medium mount at all — fall back to toram kernel parameter.
|
||||
return toramActive()
|
||||
}
|
||||
if strings.EqualFold(fsType, "tmpfs") {
|
||||
return true
|
||||
}
|
||||
// When RunInstallToRAM copies squashfs to /dev/shm/bee-live but the bind
|
||||
// mount of /run/live/medium fails (common for CD-ROM boots), the medium
|
||||
// fstype still shows the CD-ROM type. Check whether the RAM copy exists.
|
||||
files, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
|
||||
return len(files) > 0
|
||||
return s.LiveMediaRAMState().InRAM
|
||||
}
|
||||
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
@@ -56,42 +46,164 @@ func (s *System) LiveBootSource() LiveBootSource {
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
|
||||
return evaluateLiveMediaRAMState(
|
||||
s.LiveBootSource(),
|
||||
toramActive(),
|
||||
globPaths("/run/live/medium/live/*.squashfs"),
|
||||
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
|
||||
)
|
||||
}
|
||||
|
||||
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
|
||||
state := LiveMediaRAMState{
|
||||
LiveBootSource: status,
|
||||
ToramActive: toram,
|
||||
CopyPresent: len(copiedSquashfs) > 0,
|
||||
}
|
||||
if status.InRAM {
|
||||
state.State = "in_ram"
|
||||
state.Status = "ok"
|
||||
state.CopyComplete = true
|
||||
state.Message = "Running from RAM — installation media can be safely disconnected."
|
||||
return state
|
||||
}
|
||||
|
||||
expected := pathBaseSet(sourceSquashfs)
|
||||
copied := pathBaseSet(copiedSquashfs)
|
||||
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
|
||||
|
||||
switch {
|
||||
case state.CopyComplete:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
|
||||
case state.CopyPresent:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
|
||||
case toram:
|
||||
state.State = "toram_failed"
|
||||
state.Status = "failed"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
|
||||
default:
|
||||
state.State = "not_in_ram"
|
||||
state.Status = "warning"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
|
||||
}
|
||||
return state
|
||||
}
|
||||
|
||||
func globPaths(pattern string) []string {
|
||||
matches, _ := filepath.Glob(pattern)
|
||||
return matches
|
||||
}
|
||||
|
||||
func pathBaseSet(paths []string) map[string]struct{} {
|
||||
out := make(map[string]struct{}, len(paths))
|
||||
for _, path := range paths {
|
||||
base := strings.TrimSpace(filepath.Base(path))
|
||||
if base != "" {
|
||||
out[base] = struct{}{}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func setContainsAll(have, want map[string]struct{}) bool {
|
||||
if len(want) == 0 {
|
||||
return false
|
||||
}
|
||||
for name := range want {
|
||||
if _, ok := have[name]; !ok {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
|
||||
log := func(msg string) {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
}
|
||||
|
||||
if s.IsLiveMediaInRAM() {
|
||||
state := s.LiveMediaRAMState()
|
||||
if state.InRAM {
|
||||
log("Already running from RAM — installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
if err != nil || len(squashfsFiles) == 0 {
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||
}
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
dstDir := installToRAMDir
|
||||
|
||||
// If the source medium is unavailable, check whether a previous run already
|
||||
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||
// directly to the loop-rebind / bind-mount steps.
|
||||
if !sourceAvailable {
|
||||
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(copiedFiles) > 0 {
|
||||
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||
// Proceed to rebind with the already-copied files.
|
||||
for _, dst := range copiedFiles {
|
||||
base := filepath.Base(dst)
|
||||
// Re-associate the loop device that was originally backed by the
|
||||
// source file (now gone); find it by the old source path pattern.
|
||||
srcGuess := "/run/live/medium/live/" + base
|
||||
loopDev, lerr := findLoopForFile(srcGuess)
|
||||
if lerr != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||
continue
|
||||
}
|
||||
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
goto bindMedium
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||
}
|
||||
|
||||
dstDir := "/dev/shm/bee-live"
|
||||
{
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
}
|
||||
|
||||
if state.CopyPresent {
|
||||
log("Removing stale partial RAM copy before retry...")
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr == nil {
|
||||
return
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
log("Removed incomplete RAM copy.")
|
||||
}()
|
||||
|
||||
for _, sf := range squashfsFiles {
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -117,6 +229,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
||||
}
|
||||
}
|
||||
|
||||
bindMedium:
|
||||
log("Copying remaining medium files...")
|
||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||
|
||||
@@ -58,3 +58,46 @@ func TestDescribeLiveBootSource(t *testing.T) {
|
||||
t.Fatalf("got %q want /run/live/medium", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("in_ram", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
|
||||
false,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("partial_copy_after_cancel", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
false,
|
||||
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
|
||||
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
|
||||
)
|
||||
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
if state.CopyComplete {
|
||||
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("toram_failed", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
true,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -18,11 +18,19 @@ type LiveMetricSample struct {
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
PSUs []PSUReading `json:"psus,omitempty"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
}
|
||||
|
||||
// PSUReading is a per-slot power supply input power reading.
|
||||
type PSUReading struct {
|
||||
Slot int `json:"slot"`
|
||||
Name string `json:"name"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
}
|
||||
|
||||
// TempReading is a named temperature sensor value.
|
||||
type TempReading struct {
|
||||
Name string `json:"name"`
|
||||
@@ -57,6 +65,9 @@ func SampleLiveMetrics() LiveMetricSample {
|
||||
// System power — returns 0 if unavailable
|
||||
s.PowerW = sampleSystemPower()
|
||||
|
||||
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||
s.PSUs = samplePSUPower()
|
||||
|
||||
// CPU load — from /proc/stat
|
||||
s.CPULoadPct = sampleCPULoadPct()
|
||||
|
||||
@@ -326,3 +337,65 @@ func compactAmbientTempName(chip, name string) string {
|
||||
}
|
||||
return chip + " / " + name
|
||||
}
|
||||
|
||||
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||
// It parses `ipmitool sdr elist full` output looking for Power Supply entity
|
||||
// sensors (entity ID "10.N") that report a value in Watts.
|
||||
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||
func samplePSUPower() []PSUReading {
|
||||
out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
// map slot → reading (keep highest-watt value per slot in case of duplicates)
|
||||
type entry struct {
|
||||
name string
|
||||
powerW float64
|
||||
}
|
||||
bySlot := map[int]entry{}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 5 {
|
||||
continue
|
||||
}
|
||||
entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
|
||||
if !strings.HasPrefix(entityID, "10.") {
|
||||
continue // not a Power Supply entity
|
||||
}
|
||||
slotStr := strings.TrimPrefix(entityID, "10.")
|
||||
slot, err := strconv.Atoi(slotStr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
|
||||
if !strings.Contains(strings.ToLower(valueField), "watts") {
|
||||
continue
|
||||
}
|
||||
valueFields := strings.Fields(valueField)
|
||||
if len(valueFields) < 2 {
|
||||
continue
|
||||
}
|
||||
w, err := strconv.ParseFloat(valueFields[0], 64)
|
||||
if err != nil || w <= 0 {
|
||||
continue
|
||||
}
|
||||
sensorName := strings.TrimSpace(parts[0])
|
||||
if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
|
||||
bySlot[slot] = entry{name: sensorName, powerW: w}
|
||||
}
|
||||
}
|
||||
if len(bySlot) == 0 {
|
||||
return nil
|
||||
}
|
||||
slots := make([]int, 0, len(bySlot))
|
||||
for s := range bySlot {
|
||||
slots = append(slots, s)
|
||||
}
|
||||
sort.Ints(slots)
|
||||
psus := make([]PSUReading, 0, len(slots))
|
||||
for _, s := range slots {
|
||||
e := bySlot[s]
|
||||
psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
|
||||
}
|
||||
return psus
|
||||
}
|
||||
|
||||
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
|
||||
"bee-audit",
|
||||
"bee-web",
|
||||
"bee-sshsetup",
|
||||
"nvidia-dcgm",
|
||||
"nvidia-fabricmanager",
|
||||
}
|
||||
|
||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||
@@ -171,25 +173,28 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||
return ToolStatus{Name: display}
|
||||
}
|
||||
|
||||
// collectToRAMHealth checks whether the LiveCD ISO has been copied to RAM.
|
||||
// Status values: "ok" = in RAM, "warning" = toram not active (no copy attempted),
|
||||
// "failed" = toram was requested but medium is not in RAM (copy failed or in progress).
|
||||
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
|
||||
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
|
||||
// incomplete RAM copy exists but runtime still depends on the boot medium,
|
||||
// "failed" = toram was requested but medium is not in RAM.
|
||||
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||
inRAM := s.IsLiveMediaInRAM()
|
||||
active := toramActive()
|
||||
switch {
|
||||
case inRAM:
|
||||
health.ToRAMStatus = "ok"
|
||||
case active:
|
||||
// toram was requested but medium is not yet/no longer in RAM
|
||||
health.ToRAMStatus = "failed"
|
||||
state := s.LiveMediaRAMState()
|
||||
health.ToRAMStatus = state.Status
|
||||
switch state.Status {
|
||||
case "ok":
|
||||
return
|
||||
case "failed":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_failed",
|
||||
Severity: "warning",
|
||||
Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
|
||||
Description: state.Message,
|
||||
})
|
||||
case "partial":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_partial",
|
||||
Severity: "warning",
|
||||
Description: state.Message,
|
||||
})
|
||||
default:
|
||||
health.ToRAMStatus = "warning"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,13 +216,13 @@ func findUSBExportMount() string {
|
||||
|
||||
// fs types that are expected on USB export drives
|
||||
exportFSTypes := map[string]bool{
|
||||
"vfat": true,
|
||||
"exfat": true,
|
||||
"ext2": true,
|
||||
"ext3": true,
|
||||
"ext4": true,
|
||||
"ntfs": true,
|
||||
"ntfs3": true,
|
||||
"vfat": true,
|
||||
"exfat": true,
|
||||
"ext2": true,
|
||||
"ext3": true,
|
||||
"ext4": true,
|
||||
"ntfs": true,
|
||||
"ntfs3": true,
|
||||
"fuseblk": true,
|
||||
}
|
||||
|
||||
|
||||
@@ -20,6 +20,54 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// Estimated wall-clock durations for each SAT/validate test, derived from real
|
||||
// production logs in _benchmark/_v8/.
|
||||
//
|
||||
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
|
||||
// the corresponding Run*Pack function change, re-measure the wall-clock duration
|
||||
// from actual task logs and update the matching constant here.
|
||||
//
|
||||
// Sources:
|
||||
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
||||
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
||||
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
||||
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
||||
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||
const (
|
||||
// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
|
||||
SATEstimatedCPUValidateSec = 65
|
||||
// CPU stress: stress-ng 1800 s (stress mode default).
|
||||
SATEstimatedCPUStressSec = 1800
|
||||
|
||||
// RAM: memtester 256 MB / 1 pass.
|
||||
SATEstimatedMemoryValidateSec = 70
|
||||
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||
SATEstimatedMemoryStressSec = 140
|
||||
|
||||
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
|
||||
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
|
||||
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
|
||||
SATEstimatedNvidiaGPUStressPerGPUSec = 450
|
||||
|
||||
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
|
||||
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
|
||||
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
|
||||
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
|
||||
|
||||
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||
SATEstimatedNvidiaPulseTestSec = 5000
|
||||
|
||||
// NCCL all_reduce_perf, all GPUs simultaneously.
|
||||
SATEstimatedNvidiaInterconnectSec = 300
|
||||
// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
|
||||
// without a user-configurable time limit; duration is determined by nvbandwidth itself.
|
||||
SATEstimatedNvidiaBandwidthSec = 2700
|
||||
)
|
||||
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
@@ -366,12 +414,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
gpuCount := len(selected)
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
@@ -380,7 +430,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
@@ -426,6 +476,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -443,6 +500,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -460,6 +524,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -552,9 +623,19 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
|
||||
if passes <= 0 {
|
||||
passes = 1
|
||||
}
|
||||
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||||
// intentionally conservative enough for healthy systems while avoiding the
|
||||
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||||
timeoutSec := sizeMB*passes*20/100 + 60
|
||||
if timeoutSec < 180 {
|
||||
timeoutSec = 180
|
||||
}
|
||||
if timeoutSec > 900 {
|
||||
timeoutSec = 900
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
type fanObservationState struct {
|
||||
MaxRPM map[string]float64 `json:"max_rpm"`
|
||||
}
|
||||
|
||||
type fanPeakCandidate struct {
|
||||
FirstSeen time.Time
|
||||
RPM float64
|
||||
}
|
||||
|
||||
var (
|
||||
systemPowerCacheMu sync.Mutex
|
||||
systemPowerCache cachedPowerReading
|
||||
fanObservationMu sync.Mutex
|
||||
fanObservation fanObservationState
|
||||
fanObservationInit bool
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
)
|
||||
|
||||
const systemPowerHoldTTL = 15 * time.Second
|
||||
|
||||
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
|
||||
|
||||
const fanObservationMinPeakHold = time.Second
|
||||
|
||||
func normalizeObservedFanMaxRPM(rpm float64) float64 {
|
||||
if rpm <= 0 {
|
||||
return 0
|
||||
}
|
||||
return math.Ceil(rpm/1000.0) * 1000.0
|
||||
}
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err == nil {
|
||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
}
|
||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||
if len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
if err != nil {
|
||||
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
return nil, sensorsErr
|
||||
}
|
||||
|
||||
func loadFanObservationLocked() {
|
||||
if fanObservationInit {
|
||||
return
|
||||
}
|
||||
fanObservationInit = true
|
||||
fanObservation.MaxRPM = make(map[string]float64)
|
||||
raw, err := os.ReadFile(fanObservationStatePath)
|
||||
if err != nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
var persisted fanObservationState
|
||||
if json.Unmarshal(raw, &persisted) != nil {
|
||||
return
|
||||
}
|
||||
for name, rpm := range persisted.MaxRPM {
|
||||
name = strings.TrimSpace(name)
|
||||
if name == "" || rpm <= 0 {
|
||||
continue
|
||||
}
|
||||
fanObservation.MaxRPM[name] = rpm
|
||||
}
|
||||
}
|
||||
|
||||
func saveFanObservationLocked() {
|
||||
if len(fanObservation.MaxRPM) == 0 {
|
||||
return
|
||||
}
|
||||
dir := filepath.Dir(fanObservationStatePath)
|
||||
if dir == "" || dir == "." {
|
||||
dir = "/var/log/bee-sat"
|
||||
}
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return
|
||||
}
|
||||
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||
}
|
||||
|
||||
func updateFanObservation(fans []FanReading, now time.Time) {
|
||||
if len(fans) == 0 {
|
||||
return
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
changed := false
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
currentMax := fanObservation.MaxRPM[name]
|
||||
if fan.RPM <= currentMax {
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if cand, ok := fanPeakCandidates[name]; ok {
|
||||
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
|
||||
newMax := math.Max(cand.RPM, fan.RPM)
|
||||
if newMax > currentMax {
|
||||
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
|
||||
changed = true
|
||||
}
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if fan.RPM > cand.RPM {
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
|
||||
}
|
||||
continue
|
||||
}
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
|
||||
}
|
||||
if changed {
|
||||
saveFanObservationLocked()
|
||||
}
|
||||
}
|
||||
|
||||
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
var samples []float64
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
maxRPM := fanObservation.MaxRPM[name]
|
||||
if maxRPM <= 0 {
|
||||
continue
|
||||
}
|
||||
pct := fan.RPM / maxRPM * 100.0
|
||||
if pct > 100 {
|
||||
pct = 100
|
||||
}
|
||||
if pct < 0 {
|
||||
pct = 0
|
||||
}
|
||||
samples = append(samples, pct)
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
//
|
||||
@@ -426,6 +566,116 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||
return fans, nil
|
||||
}
|
||||
|
||||
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||
// Returns the average duty cycle across all exposed PWM controls.
|
||||
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
fans, fanErr := sampleFanSpeeds()
|
||||
if fanErr != nil {
|
||||
return 0, false, false
|
||||
}
|
||||
return sampleFanDutyCyclePctFromFans(fans)
|
||||
}
|
||||
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||
return pct, ok, false
|
||||
}
|
||||
|
||||
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false, false
|
||||
}
|
||||
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||
return pct, true, true
|
||||
}
|
||||
return 0, false, false
|
||||
}
|
||||
|
||||
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||
var doc map[string]map[string]any
|
||||
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||
return 0, false
|
||||
}
|
||||
var samples []float64
|
||||
for _, features := range doc {
|
||||
for name, feature := range features {
|
||||
if strings.EqualFold(name, "Adapter") {
|
||||
continue
|
||||
}
|
||||
featureMap, ok := feature.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if duty, ok := firstFanDutyValue(name, featureMap); ok {
|
||||
samples = append(samples, duty)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
|
||||
featureName = strings.ToLower(strings.TrimSpace(featureName))
|
||||
if strings.Contains(featureName, "enable") || strings.Contains(featureName, "mode") || strings.Contains(featureName, "alarm") {
|
||||
return 0, false
|
||||
}
|
||||
if strings.Contains(featureName, "pwm") {
|
||||
for _, key := range []string{"input", "value", "current"} {
|
||||
if value, ok := feature[key]; ok {
|
||||
if duty, parsed := parseFanDutyValue(value); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if !strings.Contains(lower, "pwm") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(lower, "enable") || strings.Contains(lower, "mode") || strings.Contains(lower, "alarm") {
|
||||
continue
|
||||
}
|
||||
if duty, parsed := parseFanDutyValue(feature[key]); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func parseFanDutyValue(value any) (float64, bool) {
|
||||
switch v := value.(type) {
|
||||
case float64:
|
||||
return normalizePWMAsDutyPct(v)
|
||||
case string:
|
||||
if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil {
|
||||
return normalizePWMAsDutyPct(f)
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func normalizePWMAsDutyPct(raw float64) (float64, bool) {
|
||||
if raw < 0 {
|
||||
return 0, false
|
||||
}
|
||||
if raw <= 100 {
|
||||
return raw, true
|
||||
}
|
||||
if raw <= 255 {
|
||||
return raw / 255.0 * 100.0, true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -29,6 +30,74 @@ func TestFirstFanInputValue(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||
raw := []byte(`{
|
||||
"chip0": {
|
||||
"fan1": {"input": 9000},
|
||||
"pwm1": {"input": 128},
|
||||
"pwm1_enable": {"input": 1}
|
||||
},
|
||||
"chip1": {
|
||||
"pwm2": {"input": 64}
|
||||
}
|
||||
}`)
|
||||
|
||||
got, ok := parseFanDutyCyclePctSensorsJSON(raw)
|
||||
if !ok {
|
||||
t.Fatalf("expected duty cycle telemetry to be parsed")
|
||||
}
|
||||
if got < 57 || got > 58 {
|
||||
t.Fatalf("got=%v want ~57.1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldPath := fanObservationStatePath
|
||||
oldState := fanObservation
|
||||
oldInit := fanObservationInit
|
||||
oldCandidates := fanPeakCandidates
|
||||
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
t.Cleanup(func() {
|
||||
fanObservationStatePath = oldPath
|
||||
fanObservation = oldState
|
||||
fanObservationInit = oldInit
|
||||
fanPeakCandidates = oldCandidates
|
||||
})
|
||||
|
||||
start := time.Unix(100, 0)
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||
t.Fatalf("single-sample spike should not establish observed max")
|
||||
}
|
||||
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||
|
||||
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("got=%v want ~43.3", got)
|
||||
}
|
||||
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDCMIPowerReading(t *testing.T) {
|
||||
raw := `
|
||||
Instantaneous power reading: 512 Watts
|
||||
|
||||
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||
if len(env) != 2 {
|
||||
|
||||
@@ -9,6 +9,17 @@ type LiveBootSource struct {
|
||||
Device string `json:"device,omitempty"`
|
||||
}
|
||||
|
||||
type LiveMediaRAMState struct {
|
||||
LiveBootSource
|
||||
State string `json:"state"`
|
||||
Status string `json:"status"`
|
||||
ToramActive bool `json:"toram_active,omitempty"`
|
||||
CopyPresent bool `json:"copy_present,omitempty"`
|
||||
CopyComplete bool `json:"copy_complete,omitempty"`
|
||||
CanStartCopy bool `json:"can_start_copy,omitempty"`
|
||||
Message string `json:"message,omitempty"`
|
||||
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
|
||||
@@ -15,17 +15,17 @@ type HardwareIngestRequest struct {
|
||||
}
|
||||
|
||||
type RuntimeHealth struct {
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
|
||||
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
|
||||
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||
|
||||
@@ -36,6 +36,16 @@ var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, err
|
||||
return a.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
const (
|
||||
taskPriorityBenchmark = 10
|
||||
taskPriorityBurn = 20
|
||||
taskPriorityValidateStress = 30
|
||||
taskPriorityValidate = 40
|
||||
taskPriorityAudit = 50
|
||||
taskPriorityInstallToRAM = 60
|
||||
taskPriorityInstall = 70
|
||||
)
|
||||
|
||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||
|
||||
var jobCounter atomic.Uint64
|
||||
@@ -100,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
||||
|
||||
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
|
||||
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
|
||||
"nvidia-bandwidth", "nvidia-stress":
|
||||
return true
|
||||
@@ -109,6 +119,30 @@ func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func defaultTaskPriority(target string, params taskParams) int {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "install":
|
||||
return taskPriorityInstall
|
||||
case "install-to-ram":
|
||||
return taskPriorityInstallToRAM
|
||||
case "audit":
|
||||
return taskPriorityAudit
|
||||
case "nvidia-bench-perf", "nvidia-bench-power":
|
||||
return taskPriorityBenchmark
|
||||
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
||||
return taskPriorityBurn
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
|
||||
"amd", "amd-mem", "amd-bandwidth":
|
||||
if params.StressMode {
|
||||
return taskPriorityValidateStress
|
||||
}
|
||||
return taskPriorityValidate
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||
if len(gpus) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||
@@ -458,6 +492,7 @@ func (h *handler) handleAPIAuditRun(w http.ResponseWriter, _ *http.Request) {
|
||||
ID: newJobID("audit"),
|
||||
Name: "Audit",
|
||||
Target: "audit",
|
||||
Priority: defaultTaskPriority("audit", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
@@ -491,14 +526,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||
ParallelGPUs bool `json:"parallel_gpus"`
|
||||
Loader string `json:"loader"`
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||
ParallelGPUs bool `json:"parallel_gpus"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
PlatformComponents []string `json:"platform_components"`
|
||||
@@ -514,19 +549,147 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
StaggerGPUStart: body.StaggerGPUStart,
|
||||
ParallelGPUs: body.ParallelGPUs,
|
||||
Loader: body.Loader,
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
StaggerGPUStart: body.StaggerGPUStart,
|
||||
ParallelGPUs: body.ParallelGPUs,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
}
|
||||
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
RunNCCL *bool `json:"run_nccl"`
|
||||
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||
RampUp *bool `json:"ramp_up"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
runNCCL := true
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
parallelGPUs := false
|
||||
if body.ParallelGPUs != nil {
|
||||
parallelGPUs = *body.ParallelGPUs
|
||||
}
|
||||
rampUp := false
|
||||
if body.RampUp != nil {
|
||||
rampUp = *body.RampUp
|
||||
}
|
||||
// Build a descriptive base name that includes profile and mode so the task
|
||||
// list is self-explanatory without opening individual task detail pages.
|
||||
profile := strings.TrimSpace(body.Profile)
|
||||
if profile == "" {
|
||||
profile = "standard"
|
||||
}
|
||||
name := taskDisplayName(target, "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
// Append profile tag.
|
||||
name = fmt.Sprintf("%s · %s", name, profile)
|
||||
|
||||
if target == "nvidia-bench-power" && parallelGPUs {
|
||||
writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
|
||||
return
|
||||
}
|
||||
|
||||
if rampUp && len(body.GPUIndices) > 1 {
|
||||
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
|
||||
// in Phase 2 (one additional GPU per step). A single task with all
|
||||
// selected GPUs is sufficient — spawning N tasks with growing subsets
|
||||
// would repeat all earlier steps redundantly.
|
||||
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
if len(resolved) < 2 {
|
||||
// Fall through to normal single-task path.
|
||||
rampUp = false
|
||||
} else {
|
||||
now := time.Now()
|
||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
|
||||
t := &Task{
|
||||
ID: newJobID("bee-bench-nvidia"),
|
||||
Name: taskName,
|
||||
Target: target,
|
||||
Priority: defaultTaskPriority(target, taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: now,
|
||||
params: taskParams{
|
||||
GPUIndices: append([]int(nil), resolved...),
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: true,
|
||||
RampTotal: len(resolved),
|
||||
RampRunID: rampRunID,
|
||||
DisplayName: taskName,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeTaskRunResponse(w, []*Task{t})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// For non-ramp tasks append mode tag.
|
||||
if parallelGPUs {
|
||||
name = fmt.Sprintf("%s · parallel", name)
|
||||
} else {
|
||||
name = fmt.Sprintf("%s · sequential", name)
|
||||
}
|
||||
|
||||
params := taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: parallelGPUs,
|
||||
DisplayName: body.DisplayName,
|
||||
}
|
||||
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
@@ -539,129 +702,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
RunNCCL *bool `json:"run_nccl"`
|
||||
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||
RampUp *bool `json:"ramp_up"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
runNCCL := true
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
parallelGPUs := false
|
||||
if body.ParallelGPUs != nil {
|
||||
parallelGPUs = *body.ParallelGPUs
|
||||
}
|
||||
rampUp := false
|
||||
if body.RampUp != nil {
|
||||
rampUp = *body.RampUp
|
||||
}
|
||||
// Build a descriptive base name that includes profile and mode so the task
|
||||
// list is self-explanatory without opening individual task detail pages.
|
||||
profile := strings.TrimSpace(body.Profile)
|
||||
if profile == "" {
|
||||
profile = "standard"
|
||||
}
|
||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
// Append profile tag.
|
||||
name = fmt.Sprintf("%s · %s", name, profile)
|
||||
|
||||
if rampUp && len(body.GPUIndices) > 1 {
|
||||
// Ramp-up mode: resolve GPU list, then create one task per prefix
|
||||
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
|
||||
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
if len(resolved) < 2 {
|
||||
// Fall through to normal single-task path.
|
||||
rampUp = false
|
||||
} else {
|
||||
now := time.Now()
|
||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||
var allTasks []*Task
|
||||
for step := 1; step <= len(resolved); step++ {
|
||||
subset := resolved[:step]
|
||||
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
|
||||
t := &Task{
|
||||
ID: newJobID("benchmark-nvidia"),
|
||||
Name: stepName,
|
||||
Target: "nvidia-benchmark",
|
||||
Priority: 15,
|
||||
Status: TaskPending,
|
||||
CreatedAt: now,
|
||||
params: taskParams{
|
||||
GPUIndices: append([]int(nil), subset...),
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL && step == len(resolved),
|
||||
ParallelGPUs: true,
|
||||
RampStep: step,
|
||||
RampTotal: len(resolved),
|
||||
RampRunID: rampRunID,
|
||||
DisplayName: stepName,
|
||||
},
|
||||
}
|
||||
allTasks = append(allTasks, t)
|
||||
}
|
||||
for _, t := range allTasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, allTasks)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// For non-ramp tasks append mode tag.
|
||||
if parallelGPUs {
|
||||
name = fmt.Sprintf("%s · parallel", name)
|
||||
} else {
|
||||
name = fmt.Sprintf("%s · sequential", name)
|
||||
}
|
||||
|
||||
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: parallelGPUs,
|
||||
DisplayName: body.DisplayName,
|
||||
}, name, h.opts.App, "benchmark-nvidia")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -696,6 +737,9 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
@@ -1036,25 +1080,62 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
status := h.opts.App.LiveBootSource()
|
||||
status := h.currentRAMStatus()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(status)
|
||||
}
|
||||
|
||||
type ramStatusResponse struct {
|
||||
platform.LiveMediaRAMState
|
||||
InstallTaskActive bool `json:"install_task_active,omitempty"`
|
||||
CopyTaskActive bool `json:"copy_task_active,omitempty"`
|
||||
CanStartTask bool `json:"can_start_task,omitempty"`
|
||||
BlockedReason string `json:"blocked_reason,omitempty"`
|
||||
}
|
||||
|
||||
func (h *handler) currentRAMStatus() ramStatusResponse {
|
||||
state := h.opts.App.LiveMediaRAMState()
|
||||
resp := ramStatusResponse{LiveMediaRAMState: state}
|
||||
if globalQueue.hasActiveTarget("install") {
|
||||
resp.InstallTaskActive = true
|
||||
resp.BlockedReason = "install to disk is already running"
|
||||
return resp
|
||||
}
|
||||
if globalQueue.hasActiveTarget("install-to-ram") {
|
||||
resp.CopyTaskActive = true
|
||||
resp.BlockedReason = "install to RAM task is already pending or running"
|
||||
return resp
|
||||
}
|
||||
if state.InRAM {
|
||||
resp.BlockedReason = "system is already running from RAM"
|
||||
return resp
|
||||
}
|
||||
resp.CanStartTask = state.CanStartCopy
|
||||
if !resp.CanStartTask && resp.BlockedReason == "" {
|
||||
resp.BlockedReason = state.Message
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
if globalQueue.hasActiveTarget("install") {
|
||||
writeError(w, http.StatusConflict, "install to disk is already running")
|
||||
status := h.currentRAMStatus()
|
||||
if !status.CanStartTask {
|
||||
msg := strings.TrimSpace(status.BlockedReason)
|
||||
if msg == "" {
|
||||
msg = "install to RAM is not available"
|
||||
}
|
||||
writeError(w, http.StatusConflict, msg)
|
||||
return
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("install-to-ram"),
|
||||
Name: "Install to RAM",
|
||||
Target: "install-to-ram",
|
||||
Priority: 10,
|
||||
Priority: defaultTaskPriority("install-to-ram", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
@@ -1169,7 +1250,7 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
|
||||
ID: newJobID("install"),
|
||||
Name: "Install to Disk",
|
||||
Target: "install",
|
||||
Priority: 20,
|
||||
Priority: defaultTaskPriority("install", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
@@ -1445,6 +1526,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
|
||||
writeJSON(w, map[string]string{"status": "rolled back"})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||
}
|
||||
|
||||
func (h *handler) rollbackPendingNetworkChange() error {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
@@ -1461,4 +1547,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -39,6 +39,9 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||
t.Fatalf("burn profile=%q want smoke", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
@@ -61,7 +64,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
@@ -75,8 +78,8 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-benchmark" {
|
||||
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
|
||||
if task.Target != "nvidia-bench-perf" {
|
||||
t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
|
||||
}
|
||||
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||
@@ -84,6 +87,9 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
if task.params.RunNCCL {
|
||||
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
@@ -107,7 +113,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
@@ -133,6 +139,56 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 3 {
|
||||
t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
|
||||
}
|
||||
for i, task := range globalQueue.tasks {
|
||||
if task.Target != "nvidia-bench-power" {
|
||||
t.Fatalf("task[%d] target=%q", i, task.Target)
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
@@ -175,6 +231,41 @@ func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultTaskPriorityOrder(t *testing.T) {
|
||||
got := []int{
|
||||
defaultTaskPriority("install-to-ram", taskParams{}),
|
||||
defaultTaskPriority("audit", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{StressMode: true}),
|
||||
defaultTaskPriority("nvidia-stress", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-perf", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-power", taskParams{}),
|
||||
}
|
||||
want := []int{
|
||||
taskPriorityInstallToRAM,
|
||||
taskPriorityAudit,
|
||||
taskPriorityValidate,
|
||||
taskPriorityValidateStress,
|
||||
taskPriorityBurn,
|
||||
taskPriorityBenchmark,
|
||||
taskPriorityBenchmark,
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
|
||||
t.Fatalf("priority order=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
|
||||
@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
||||
return out
|
||||
}
|
||||
|
||||
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||
// power charts where the filled area of each PSU shows its individual
|
||||
// contribution and the total height equals the combined draw.
|
||||
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range datasets {
|
||||
if len(datasets[i]) == 0 {
|
||||
datasets[i] = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
// Build cumulative sums per time point.
|
||||
cumulative := make([][]float64, len(datasets)+1)
|
||||
for i := range cumulative {
|
||||
cumulative[i] = make([]float64, pointCount)
|
||||
}
|
||||
for i, ds := range datasets {
|
||||
for j, v := range ds {
|
||||
cumulative[i+1][j] = cumulative[i][j] + v
|
||||
}
|
||||
}
|
||||
|
||||
// Scale is based on the total (top cumulative row).
|
||||
total := cumulative[len(cumulative)-1]
|
||||
yMin := floatPtr(0)
|
||||
if yMax == nil {
|
||||
yMax = autoMax120(total)
|
||||
}
|
||||
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||
|
||||
legendItems := make([]metricChartSeries, len(datasets))
|
||||
for i, name := range names {
|
||||
color := metricChartPalette[i%len(metricChartPalette)]
|
||||
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||
}
|
||||
|
||||
// Stats label from totals.
|
||||
statsLabel := chartStatsLabel([][]float64{total})
|
||||
|
||||
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scale)
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
writeSingleAxisY(&b, layout, scale)
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
|
||||
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
// Draw border polylines on top.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
|
||||
writeLegend(&b, layout, legendItems)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
// writeStackedArea draws a filled polygon between two cumulative value arrays
|
||||
// (baseline and top), using the given color at 55% opacity.
|
||||
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
|
||||
n := len(top)
|
||||
if n == 0 {
|
||||
return
|
||||
}
|
||||
if len(baseline) < n {
|
||||
baseline = make([]float64, n)
|
||||
}
|
||||
|
||||
// Forward path along top values, then backward along baseline values.
|
||||
var points strings.Builder
|
||||
for i := 0; i < n; i++ {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
if i > 0 {
|
||||
points.WriteByte(' ')
|
||||
}
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
for i := n - 1; i >= 0; i-- {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
points.WriteByte(' ')
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
|
||||
}
|
||||
|
||||
func writeSVGOpen(b *strings.Builder, width, height int) {
|
||||
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
||||
}
|
||||
|
||||
@@ -232,7 +232,7 @@ func truncate(s string, max int) string {
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||
func isSATTarget(target string) bool {
|
||||
switch target {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||
"platform-stress":
|
||||
|
||||
@@ -72,6 +72,13 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||
/* Component chips — one small square per device */
|
||||
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
|
||||
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
|
||||
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||
/* Output terminal */
|
||||
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||
@@ -363,23 +370,25 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
||||
}
|
||||
|
||||
cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
|
||||
writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
|
||||
writeRow("CPU", hwDescribeCPU(hw),
|
||||
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))
|
||||
|
||||
memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
|
||||
writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
|
||||
writeRow("Memory", hwDescribeMemory(hw),
|
||||
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))
|
||||
|
||||
storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
|
||||
writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
|
||||
writeRow("Storage", hwDescribeStorage(hw),
|
||||
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))
|
||||
|
||||
gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
|
||||
writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
|
||||
writeRow("GPU", hwDescribeGPU(hw),
|
||||
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))
|
||||
|
||||
psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
|
||||
if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
|
||||
psuRow.Status = hwPSUStatus(hw.PowerSupplies)
|
||||
psuMatched := matchedRecords(records, nil, []string{"psu:"})
|
||||
if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
|
||||
// No PSU records yet — synthesise a single chip from IPMI status.
|
||||
psuStatus := hwPSUStatus(hw.PowerSupplies)
|
||||
psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
|
||||
}
|
||||
writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
|
||||
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))
|
||||
|
||||
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||
writeRow("Network", nicDesc, "")
|
||||
@@ -845,6 +854,13 @@ func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
Source: "live-boot / /proc/mounts",
|
||||
Issue: "",
|
||||
}
|
||||
case "partial":
|
||||
return runtimeHealthRow{
|
||||
Title: "LiveCD in RAM",
|
||||
Status: "WARNING",
|
||||
Source: "live-boot / /proc/mounts / /dev/shm/bee-live",
|
||||
Issue: "Partial or staged RAM copy detected. System is not fully running from RAM; Copy to RAM can be retried.",
|
||||
}
|
||||
case "failed":
|
||||
return runtimeHealthRow{
|
||||
Title: "LiveCD in RAM",
|
||||
@@ -885,6 +901,31 @@ func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
|
||||
}
|
||||
}
|
||||
|
||||
// matchedRecords returns all ComponentStatusRecord entries whose key matches
|
||||
// any exact key or any of the given prefixes. Used for per-device chip rendering.
|
||||
func firstNonEmpty(vals ...string) string {
|
||||
for _, v := range vals {
|
||||
if v != "" {
|
||||
return v
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func matchedRecords(records []app.ComponentStatusRecord, exact []string, prefixes []string) []app.ComponentStatusRecord {
|
||||
var matched []app.ComponentStatusRecord
|
||||
for _, rec := range records {
|
||||
key := strings.TrimSpace(rec.ComponentKey)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
if containsExactKey(key, exact) || hasAnyPrefix(key, prefixes) {
|
||||
matched = append(matched, rec)
|
||||
}
|
||||
}
|
||||
return matched
|
||||
}
|
||||
|
||||
func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow {
|
||||
matched := make([]app.ComponentStatusRecord, 0)
|
||||
for _, rec := range records {
|
||||
@@ -1027,6 +1068,52 @@ func runtimeIssueDescriptions(issues []schema.RuntimeIssue, codes ...string) str
|
||||
return strings.Join(messages, "; ")
|
||||
}
|
||||
|
||||
// chipLetterClass maps a component status to a single display letter and CSS class.
|
||||
func chipLetterClass(status string) (letter, cls string) {
|
||||
switch strings.ToUpper(strings.TrimSpace(status)) {
|
||||
case "OK":
|
||||
return "O", "chip-ok"
|
||||
case "WARNING", "WARN", "PARTIAL":
|
||||
return "W", "chip-warn"
|
||||
case "CRITICAL", "FAIL", "FAILED", "ERROR":
|
||||
return "F", "chip-fail"
|
||||
default:
|
||||
return "?", "chip-unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// renderComponentChips renders one 20×20 chip per ComponentStatusRecord.
|
||||
// Hover tooltip shows component key, status, error summary and last check time.
|
||||
// Falls back to a single unknown chip when no records are available.
|
||||
func renderComponentChips(matched []app.ComponentStatusRecord) string {
|
||||
if len(matched) == 0 {
|
||||
return `<span class="chips"><span class="chip chip-unknown" title="No data">?</span></span>`
|
||||
}
|
||||
sort.Slice(matched, func(i, j int) bool {
|
||||
return matched[i].ComponentKey < matched[j].ComponentKey
|
||||
})
|
||||
var b strings.Builder
|
||||
b.WriteString(`<span class="chips">`)
|
||||
for _, rec := range matched {
|
||||
letter, cls := chipLetterClass(rec.Status)
|
||||
var tooltip strings.Builder
|
||||
tooltip.WriteString(rec.ComponentKey)
|
||||
tooltip.WriteString(": ")
|
||||
tooltip.WriteString(firstNonEmpty(rec.Status, "UNKNOWN"))
|
||||
if rec.ErrorSummary != "" {
|
||||
tooltip.WriteString(" — ")
|
||||
tooltip.WriteString(rec.ErrorSummary)
|
||||
}
|
||||
if !rec.LastCheckedAt.IsZero() {
|
||||
fmt.Fprintf(&tooltip, " (checked %s)", rec.LastCheckedAt.Format("15:04:05"))
|
||||
}
|
||||
fmt.Fprintf(&b, `<span class="chip %s" title="%s">%s</span>`,
|
||||
cls, html.EscapeString(tooltip.String()), letter)
|
||||
}
|
||||
b.WriteString(`</span>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func runtimeStatusBadge(status string) string {
|
||||
status = strings.ToUpper(strings.TrimSpace(status))
|
||||
badge := "badge-unknown"
|
||||
@@ -1291,15 +1378,64 @@ setInterval(loadMetricsLayout, 5000);
|
||||
// ── Validate (Acceptance Tests) ───────────────────────────────────────────────
|
||||
|
||||
type validateInventory struct {
|
||||
CPU string
|
||||
Memory string
|
||||
Storage string
|
||||
NVIDIA string
|
||||
AMD string
|
||||
CPU string
|
||||
Memory string
|
||||
Storage string
|
||||
NVIDIA string
|
||||
AMD string
|
||||
NvidiaGPUCount int
|
||||
AMDGPUCount int
|
||||
}
|
||||
|
||||
// validateFmtDur formats a duration in seconds as a human-readable "~N min" or "~N s" string.
|
||||
func validateFmtDur(secs int) string {
|
||||
if secs < 120 {
|
||||
return fmt.Sprintf("~%d s", secs)
|
||||
}
|
||||
mins := (secs + 29) / 60
|
||||
return fmt.Sprintf("~%d min", mins)
|
||||
}
|
||||
|
||||
// validateTotalValidateSec returns the estimated wall-clock duration of
|
||||
// "Validate one by one" in Validate mode for n NVIDIA GPUs.
|
||||
func validateTotalValidateSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUValidateSec +
|
||||
platform.SATEstimatedMemoryValidateSec +
|
||||
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
return total
|
||||
}
|
||||
|
||||
// validateTotalStressSec returns the estimated wall-clock duration of
|
||||
// "Validate one by one" in Stress mode for n NVIDIA GPUs.
|
||||
func validateTotalStressSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUStressSec +
|
||||
platform.SATEstimatedMemoryStressSec +
|
||||
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
||||
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
||||
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
||||
platform.SATEstimatedNvidiaPulseTestSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
return total
|
||||
}
|
||||
|
||||
func renderValidate(opts HandlerOptions) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||
gpuNote := ""
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
@@ -1309,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
<div style="margin-top:12px">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
@@ -1326,19 +1462,19 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
`60s in Validate, 30 min in Stress.`,
|
||||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
|
||||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Short self-test in Validate, extended self-test in Stress.`,
|
||||
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
@@ -1363,14 +1499,33 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
|
||||
func() string {
|
||||
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
||||
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
||||
if n > 0 {
|
||||
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
||||
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
||||
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
||||
}
|
||||
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
||||
validateFmtDur(perV), validateFmtDur(perS))
|
||||
}(),
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
func() string {
|
||||
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
||||
s := "Skipped in Validate. "
|
||||
if n > 0 {
|
||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||
} else {
|
||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||
}
|
||||
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||
}(),
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
@@ -1378,7 +1533,16 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
func() string {
|
||||
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
||||
s := "Skipped in Validate. "
|
||||
if n > 0 {
|
||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||
} else {
|
||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||
}
|
||||
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||
}(),
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
@@ -1386,7 +1550,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-interconnect">` +
|
||||
@@ -1394,7 +1558,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
@@ -1402,7 +1566,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
@@ -1440,8 +1604,6 @@ function satModeChanged() {
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
||||
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
@@ -1689,7 +1851,7 @@ function runAllSAT() {
|
||||
const cycles = 1;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
@@ -1837,6 +1999,8 @@ func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||
out.NvidiaGPUCount = nvidiaTotal
|
||||
out.AMDGPUCount = amdTotal
|
||||
return out
|
||||
}
|
||||
|
||||
@@ -1929,9 +2093,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||
|
||||
type benchmarkHistoryRun struct {
|
||||
generatedAt time.Time
|
||||
displayTime string
|
||||
gpuScores map[int]float64 // GPU index → composite score
|
||||
generatedAt time.Time
|
||||
displayTime string
|
||||
gpuScores map[int]float64 // GPU index → composite score
|
||||
gpuStatuses map[int]string // GPU index → status ("OK", "WARNING", "FAILED", …)
|
||||
overallStatus string
|
||||
}
|
||||
|
||||
func renderBenchmark(opts HandlerOptions) string {
|
||||
@@ -1939,14 +2105,14 @@ func renderBenchmark(opts HandlerOptions) string {
|
||||
|
||||
<div class="grid2">
|
||||
<div class="card">
|
||||
<div class="card-head">NVIDIA Benchmark</div>
|
||||
<div class="card-head">Benchmark Setup</div>
|
||||
<div class="card-body">
|
||||
<div class="form-row">
|
||||
<label>Profile</label>
|
||||
<select id="benchmark-profile">
|
||||
<option value="standard" selected>Standard — about 15 minutes</option>
|
||||
<option value="stability">Stability — 1 to 2 hours</option>
|
||||
<option value="overnight">Overnight — 8 hours</option>
|
||||
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||||
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||||
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-row">
|
||||
@@ -1972,26 +2138,30 @@ func renderBenchmark(opts HandlerOptions) string {
|
||||
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
|
||||
</label>
|
||||
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||||
<button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>▶ Run Benchmark</button>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
|
||||
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
|
||||
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
|
||||
</div>
|
||||
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||||
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Method</div>
|
||||
<div class="card-head">Method Split</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.</p>
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||||
<table>
|
||||
<tr><th>Profile</th><th>Purpose</th></tr>
|
||||
<tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr>
|
||||
<tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr>
|
||||
<tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
|
||||
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||||
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||||
<tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||||
</table>
|
||||
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
` + renderBenchmarkResultsCard(opts.ExportDir) + `
|
||||
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||
|
||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||
@@ -2029,21 +2199,24 @@ function benchmarkMode() {
|
||||
|
||||
function benchmarkUpdateSelectionNote() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const btn = document.getElementById('benchmark-run-btn');
|
||||
const perfBtn = document.getElementById('benchmark-run-performance-btn');
|
||||
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
|
||||
const note = document.getElementById('benchmark-selection-note');
|
||||
if (!selected.length) {
|
||||
btn.disabled = true;
|
||||
perfBtn.disabled = true;
|
||||
fitBtn.disabled = true;
|
||||
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||||
return;
|
||||
}
|
||||
btn.disabled = false;
|
||||
perfBtn.disabled = false;
|
||||
fitBtn.disabled = false;
|
||||
const mode = benchmarkMode();
|
||||
if (mode === 'ramp-up') {
|
||||
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL on final step.';
|
||||
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses targeted_power per step.';
|
||||
} else if (mode === 'parallel') {
|
||||
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously.' + (selected.length > 1 ? ' NCCL included.' : '');
|
||||
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
|
||||
} else {
|
||||
note.textContent = 'Sequential: each GPU benchmarked separately.' + (selected.length > 1 ? ' NCCL included on each.' : '');
|
||||
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2117,7 +2290,7 @@ function benchmarkSelectNone() {
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
|
||||
function runNvidiaBenchmark() {
|
||||
function runNvidiaBenchmark(kind) {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
if (!selected.length) {
|
||||
@@ -2127,21 +2300,26 @@ function runNvidiaBenchmark() {
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
const mode = benchmarkMode();
|
||||
const rampUp = mode === 'ramp-up' && selected.length > 1;
|
||||
const parallelGPUs = mode === 'parallel';
|
||||
const parallelGPUs = mode === 'parallel' && kind === 'performance';
|
||||
if (kind === 'power-fit' && mode === 'parallel') {
|
||||
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
|
||||
return;
|
||||
}
|
||||
const body = {
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
gpu_indices: selected,
|
||||
run_nccl: selected.length > 1,
|
||||
run_nccl: kind === 'performance' && selected.length > 1,
|
||||
parallel_gpus: parallelGPUs,
|
||||
ramp_up: rampUp,
|
||||
display_name: 'NVIDIA Benchmark'
|
||||
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
|
||||
};
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']';
|
||||
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n';
|
||||
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
|
||||
status.textContent = 'Queueing...';
|
||||
fetch('/api/benchmark/nvidia/run', {
|
||||
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
|
||||
fetch(endpoint, {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(body)
|
||||
@@ -2169,7 +2347,9 @@ function runNvidiaBenchmark() {
|
||||
if (e.data) failures += 1;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
const isLast = (idx + 1 >= taskIds.length);
|
||||
streamNext(idx + 1, failures);
|
||||
if (isLast) { benchmarkRefreshResults(); }
|
||||
});
|
||||
benchmarkES.onerror = function() {
|
||||
if (benchmarkES) {
|
||||
@@ -2189,18 +2369,30 @@ function runNvidiaBenchmark() {
|
||||
}
|
||||
|
||||
benchmarkLoadGPUs();
|
||||
|
||||
function benchmarkRefreshResults() {
|
||||
fetch('/api/benchmark/results')
|
||||
.then(function(r) { return r.text(); })
|
||||
.then(function(html) {
|
||||
const el = document.getElementById('benchmark-results-section');
|
||||
if (el) el.innerHTML = html;
|
||||
})
|
||||
.catch(function() {});
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCard(exportDir string) string {
|
||||
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||
return renderBenchmarkResultsCardFromRuns(
|
||||
"Benchmark Results",
|
||||
perf := renderBenchmarkResultsCardFromRuns(
|
||||
"Performance Results",
|
||||
"Composite score by saved benchmark run and GPU.",
|
||||
"No saved benchmark runs yet.",
|
||||
"No saved performance benchmark runs yet.",
|
||||
maxIdx,
|
||||
runs,
|
||||
)
|
||||
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||
return perf + "\n" + power
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
||||
@@ -2213,7 +2405,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
|
||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<div style="overflow-x:auto">`)
|
||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
|
||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||||
for i := 0; i <= maxGPUIndex; i++ {
|
||||
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||||
}
|
||||
@@ -2222,13 +2414,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
overallColor := "var(--ok)"
|
||||
overallLabel := run.overallStatus
|
||||
if overallLabel == "" {
|
||||
overallLabel = "OK"
|
||||
}
|
||||
if overallLabel == "FAILED" {
|
||||
overallColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if overallLabel != "OK" {
|
||||
overallColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||||
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||||
score, ok := run.gpuScores[idx]
|
||||
if !ok {
|
||||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||
continue
|
||||
}
|
||||
b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||
gpuStatus := run.gpuStatuses[idx]
|
||||
scoreColor := ""
|
||||
switch gpuStatus {
|
||||
case "FAILED":
|
||||
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||||
case "WARNING", "PARTIAL":
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
case "", "OK":
|
||||
// no override
|
||||
default:
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
}
|
||||
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||
}
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
@@ -2237,11 +2452,11 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
|
||||
}
|
||||
|
||||
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||||
baseDir := app.DefaultBenchmarkBaseDir
|
||||
baseDir := app.DefaultBeeBenchPerfDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-benchmark")
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json"))
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return -1, nil
|
||||
}
|
||||
@@ -2262,12 +2477,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
|
||||
continue
|
||||
}
|
||||
run := benchmarkHistoryRun{
|
||||
generatedAt: result.GeneratedAt,
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
gpuScores: make(map[int]float64),
|
||||
generatedAt: result.GeneratedAt,
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
gpuScores: make(map[int]float64),
|
||||
gpuStatuses: make(map[int]string),
|
||||
overallStatus: result.OverallStatus,
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||
if gpu.Index > maxGPUIndex {
|
||||
maxGPUIndex = gpu.Index
|
||||
}
|
||||
@@ -2280,12 +2498,146 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
|
||||
return maxGPUIndex, runs
|
||||
}
|
||||
|
||||
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
baseDir := app.DefaultBeeBenchPowerDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||
}
|
||||
sort.Strings(paths)
|
||||
|
||||
type powerRun struct {
|
||||
generatedAt time.Time
|
||||
displayTime string
|
||||
result platform.NvidiaPowerBenchResult
|
||||
}
|
||||
var runs []powerRun
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var r platform.NvidiaPowerBenchResult
|
||||
if err := json.Unmarshal(raw, &r); err != nil {
|
||||
continue
|
||||
}
|
||||
runs = append(runs, powerRun{
|
||||
generatedAt: r.GeneratedAt,
|
||||
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
result: r,
|
||||
})
|
||||
}
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
|
||||
// Show only the most recent run's GPU slot table, plus a run history summary.
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||
|
||||
latest := runs[0].result
|
||||
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||
if latest.Hostname != "" {
|
||||
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||
}
|
||||
if latest.OverallStatus != "" {
|
||||
statusColor := "var(--ok)"
|
||||
if latest.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||
}
|
||||
b.WriteString(`</p>`)
|
||||
|
||||
if len(latest.GPUs) > 0 {
|
||||
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for _, gpu := range latest.GPUs {
|
||||
// finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
|
||||
// falling back to single-card applied limit if the ramp hasn't run.
|
||||
finalLimitW := gpu.StablePowerLimitW
|
||||
if finalLimitW <= 0 {
|
||||
finalLimitW = gpu.AppliedPowerLimitW
|
||||
}
|
||||
// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
|
||||
derated := gpu.Derated ||
|
||||
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||
rowStyle := ""
|
||||
finalStyle := ""
|
||||
if derated {
|
||||
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||
}
|
||||
statusLabel := gpu.Status
|
||||
if statusLabel == "" {
|
||||
statusLabel = "OK"
|
||||
}
|
||||
statusColor := "var(--ok)"
|
||||
if statusLabel == "FAILED" {
|
||||
statusColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if statusLabel != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
nominalStr := "-"
|
||||
if gpu.DefaultPowerLimitW > 0 {
|
||||
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||
}
|
||||
singleStr := "-"
|
||||
if gpu.AppliedPowerLimitW > 0 {
|
||||
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||
}
|
||||
multiStr := "-"
|
||||
if gpu.StablePowerLimitW > 0 {
|
||||
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||
}
|
||||
p95Str := "-"
|
||||
if gpu.MaxObservedPowerW > 0 {
|
||||
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||
}
|
||||
b.WriteString(`<tr` + rowStyle + `>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div>`)
|
||||
}
|
||||
|
||||
if len(runs) > 1 {
|
||||
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
statusColor := "var(--ok)"
|
||||
if run.result.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></details>`)
|
||||
}
|
||||
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
@@ -2293,13 +2645,13 @@ func renderBurn() string {
|
||||
<div class="card-body burn-profile-body">
|
||||
<div class="burn-profile-col">
|
||||
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — quick check (~5 min)</span></label>
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 hour</span></label>
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
|
||||
</div>
|
||||
<div class="burn-profile-col burn-profile-action">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
|
||||
<p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
|
||||
</div>
|
||||
<div class="burn-profile-col burn-profile-action">
|
||||
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||
@@ -3245,12 +3597,19 @@ fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
if (d.in_ram) {
|
||||
txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.';
|
||||
txt.textContent = d.message || 'Checking...';
|
||||
if (d.status === 'ok' || d.in_ram) {
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
} else if (d.status === 'failed') {
|
||||
txt.style.color = 'var(--err, #b91c1c)';
|
||||
} else {
|
||||
txt.textContent = 'Live media is mounted from installation device. Copy to RAM to allow media removal.';
|
||||
txt.style.color = 'var(--muted)';
|
||||
}
|
||||
if (d.can_start_task) {
|
||||
btn.style.display = '';
|
||||
btn.disabled = false;
|
||||
} else {
|
||||
btn.style.display = 'none';
|
||||
}
|
||||
});
|
||||
function installToRAM() {
|
||||
|
||||
@@ -261,7 +261,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
|
||||
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||
|
||||
// Tasks
|
||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||
@@ -573,12 +575,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
timeline := metricsTimelineSegments(samples, time.Now())
|
||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||
var overviewOk bool
|
||||
var buf []byte
|
||||
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if !ok {
|
||||
if !overviewOk {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
@@ -587,23 +591,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
_, _ = w.Write(buf)
|
||||
return
|
||||
}
|
||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||
if !ok {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
buf, err := renderMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMin,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
var buf []byte
|
||||
if stacked {
|
||||
buf, err = renderStackedMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
} else {
|
||||
buf, err = renderMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMin,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
}
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
@@ -613,12 +631,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var title string
|
||||
var yMin, yMax *float64
|
||||
labels := sampleTimeLabels(samples)
|
||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
|
||||
labels = sampleTimeLabels(samples)
|
||||
|
||||
switch {
|
||||
case path == "server-load":
|
||||
@@ -654,15 +668,41 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
|
||||
case path == "server-power":
|
||||
title = "System Power"
|
||||
power := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
// Use per-PSU stacked chart when PSU SDR data is available.
|
||||
// Collect the union of PSU slots seen across all samples.
|
||||
psuSlots := psuSlotsFromSamples(samples)
|
||||
if len(psuSlots) > 1 {
|
||||
// Build one dataset per PSU slot.
|
||||
psuDatasets := make([][]float64, len(psuSlots))
|
||||
psuNames := make([]string, len(psuSlots))
|
||||
for si, slot := range psuSlots {
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, psu := range s.PSUs {
|
||||
if psu.Slot == slot {
|
||||
ds[i] = psu.PowerW
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
psuDatasets[si] = normalizePowerSeries(ds)
|
||||
psuNames[si] = fmt.Sprintf("PSU %d", slot)
|
||||
}
|
||||
datasets = psuDatasets
|
||||
names = psuNames
|
||||
stacked = true
|
||||
yMax = autoMax120(psuStackedTotal(psuDatasets))
|
||||
} else {
|
||||
power := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
}
|
||||
power = normalizePowerSeries(power)
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(power)
|
||||
}
|
||||
power = normalizePowerSeries(power)
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(power)
|
||||
|
||||
case path == "server-fans":
|
||||
title = "Fan RPM"
|
||||
@@ -705,7 +745,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
case strings.HasPrefix(path, "gpu/"):
|
||||
idx, sub, ok := parseGPUChartPath(path)
|
||||
if !ok {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
switch sub {
|
||||
case "load":
|
||||
@@ -713,7 +753,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||
if util == nil && mem == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||
names = []string{"Load %", "Mem %"}
|
||||
@@ -723,7 +763,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
title = gpuDisplayLabel(idx) + " Temperature"
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
if temp == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{temp}
|
||||
names = []string{"Temp °C"}
|
||||
@@ -733,7 +773,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||
if clock == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{clock}
|
||||
names = []string{"Core Clock MHz"}
|
||||
@@ -742,7 +782,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||
if clock == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{clock}
|
||||
names = []string{"Memory Clock MHz"}
|
||||
@@ -751,7 +791,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
title = gpuDisplayLabel(idx) + " Power"
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
if power == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
@@ -759,10 +799,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
}
|
||||
|
||||
default:
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
return nil, nil, nil, "", nil, nil, false, false
|
||||
}
|
||||
|
||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
||||
return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
|
||||
}
|
||||
|
||||
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
||||
@@ -928,6 +968,37 @@ func normalizePowerSeries(ds []float64) []float64 {
|
||||
return out
|
||||
}
|
||||
|
||||
// psuSlotsFromSamples returns the sorted list of PSU slot numbers seen across samples.
|
||||
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
|
||||
seen := map[int]struct{}{}
|
||||
for _, s := range samples {
|
||||
for _, p := range s.PSUs {
|
||||
seen[p.Slot] = struct{}{}
|
||||
}
|
||||
}
|
||||
slots := make([]int, 0, len(seen))
|
||||
for s := range seen {
|
||||
slots = append(slots, s)
|
||||
}
|
||||
sort.Ints(slots)
|
||||
return slots
|
||||
}
|
||||
|
||||
// psuStackedTotal returns the point-by-point sum of all PSU datasets (for scale calculation).
|
||||
func psuStackedTotal(datasets [][]float64) []float64 {
|
||||
if len(datasets) == 0 {
|
||||
return nil
|
||||
}
|
||||
n := len(datasets[0])
|
||||
total := make([]float64, n)
|
||||
for _, ds := range datasets {
|
||||
for i, v := range ds {
|
||||
total[i] += v
|
||||
}
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func normalizeFanSeries(ds []float64) []float64 {
|
||||
if len(ds) == 0 {
|
||||
return nil
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestChartLegendNumber(t *testing.T) {
|
||||
@@ -78,6 +79,16 @@ func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildRuntimeToRAMRowShowsPartialCopyWarning(t *testing.T) {
|
||||
row := buildRuntimeToRAMRow(schema.RuntimeHealth{ToRAMStatus: "partial"})
|
||||
if row.Status != "WARNING" {
|
||||
t.Fatalf("status=%q want WARNING", row.Status)
|
||||
}
|
||||
if !strings.Contains(row.Issue, "Partial or staged RAM copy detected") {
|
||||
t.Fatalf("issue=%q", row.Issue)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
@@ -109,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
@@ -153,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
@@ -198,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||
if !ok {
|
||||
t.Fatal("gpu-all-clock returned ok=false")
|
||||
}
|
||||
@@ -637,8 +648,11 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
`href="/benchmark"`,
|
||||
`id="benchmark-gpu-list"`,
|
||||
`/api/gpu/nvidia`,
|
||||
`/api/benchmark/nvidia/run`,
|
||||
`/api/bee-bench/nvidia/perf/run`,
|
||||
`/api/bee-bench/nvidia/power/run`,
|
||||
`benchmark-run-nccl`,
|
||||
`Run Performance Benchmark`,
|
||||
`Run Power / Thermal Fit`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||
@@ -649,7 +663,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -691,7 +705,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
body := rec.Body.String()
|
||||
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Perf Results`,
|
||||
`Composite score by saved benchmark run and GPU.`,
|
||||
`GPU 0`,
|
||||
`GPU 1`,
|
||||
@@ -730,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`Validate and Stress:`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`nvbandwidth runs all built-in tests without a time limit`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
@@ -1113,8 +1147,8 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
`>Storage<`,
|
||||
`>GPU<`,
|
||||
`>PSU<`,
|
||||
`badge-warn`, // cpu Warning badge
|
||||
`badge-err`, // storage Critical badge
|
||||
`badge-warn`, // cpu Warning badge
|
||||
`badge-err`, // storage Critical badge
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||
|
||||
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
|
||||
}
|
||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||
}
|
||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||
if !ok {
|
||||
return "", nil, false
|
||||
}
|
||||
buf, err := renderMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMin,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
var buf []byte
|
||||
var err error
|
||||
if stacked {
|
||||
buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||
} else {
|
||||
buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||
}
|
||||
if err != nil {
|
||||
return "", nil, false
|
||||
}
|
||||
@@ -233,6 +229,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||
b.WriteString(benchmarkCard)
|
||||
}
|
||||
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||
b.WriteString(powerCard)
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
@@ -251,7 +250,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
}
|
||||
|
||||
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "nvidia-bench-perf":
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
@@ -263,7 +264,7 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
return ""
|
||||
}
|
||||
return renderBenchmarkResultsCardFromRuns(
|
||||
"Benchmark Results",
|
||||
"Perf Results",
|
||||
"Composite score for this benchmark task.",
|
||||
"No benchmark results were saved for this task.",
|
||||
columns,
|
||||
@@ -271,15 +272,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
)
|
||||
}
|
||||
|
||||
func renderTaskPowerResultsCard(target, logText string) string {
|
||||
if strings.TrimSpace(target) != "nvidia-bench-power" {
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
if strings.TrimSpace(resultPath) == "" {
|
||||
return ""
|
||||
}
|
||||
raw, err := os.ReadFile(resultPath)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
var result platform.NvidiaPowerBenchResult
|
||||
if err := json.Unmarshal(raw, &result); err != nil {
|
||||
return ""
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
|
||||
for _, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
|
||||
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
|
||||
}
|
||||
b.WriteString(`</table></div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func taskBenchmarkResultPath(logText string) string {
|
||||
archivePath := taskArchivePathFromLog(logText)
|
||||
if archivePath == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
if runDir == archivePath {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(runDir, "result.json")
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,8 @@ const (
|
||||
var taskNames = map[string]string{
|
||||
"nvidia": "NVIDIA SAT",
|
||||
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
||||
"nvidia-benchmark": "NVIDIA Benchmark",
|
||||
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
||||
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
||||
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
||||
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
||||
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
||||
@@ -161,6 +162,32 @@ type nvidiaRampSpec struct {
|
||||
TotalDurationSec int
|
||||
}
|
||||
|
||||
func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
|
||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||
case "overnight":
|
||||
return 1024, 2
|
||||
case "acceptance":
|
||||
return 1024, 1
|
||||
case "smoke":
|
||||
return 256, 1
|
||||
}
|
||||
if stress {
|
||||
return 512, 1
|
||||
}
|
||||
return 256, 1
|
||||
}
|
||||
|
||||
func taskMayLeaveOrphanWorkers(target string) bool {
|
||||
switch strings.TrimSpace(strings.ToLower(target)) {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-bandwidth", "nvidia-stress", "nvidia-compute", "nvidia-bench-perf",
|
||||
"memory", "memory-stress", "cpu", "sat-stress", "platform-stress":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func resolveBurnPreset(profile string) burnPreset {
|
||||
switch profile {
|
||||
case "overnight":
|
||||
@@ -586,8 +613,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
}
|
||||
a := q.opts.App
|
||||
|
||||
recovered := len(j.lines) > 0
|
||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||
if len(j.lines) > 0 {
|
||||
if recovered {
|
||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||
}
|
||||
|
||||
@@ -628,7 +656,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
dur = 300
|
||||
}
|
||||
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
case "nvidia-benchmark":
|
||||
case "nvidia-bench-perf":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
@@ -644,6 +672,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-bench-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RampStep: t.params.RampStep,
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -696,15 +737,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: platform.NvidiaStressLoaderNCCL,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
}, j.append)
|
||||
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||
case "nvidia-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -737,10 +770,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
sizeMB, passes := 256, 1
|
||||
if t.params.StressMode {
|
||||
sizeMB, passes = 1024, 3
|
||||
}
|
||||
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
|
||||
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||
case "storage":
|
||||
if a == nil {
|
||||
@@ -996,6 +1027,9 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
@@ -1023,6 +1057,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
@@ -1127,10 +1164,13 @@ func (q *taskQueue) loadLocked() {
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskRunning {
|
||||
// The task was interrupted by a bee-web restart. Child processes
|
||||
// (e.g. bee-gpu-burn-worker) survive the restart in their own
|
||||
// process groups and cannot be cancelled retroactively. Mark the
|
||||
// task as failed so the user can decide whether to re-run it
|
||||
// rather than blindly re-launching duplicate workers.
|
||||
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
|
||||
// their own process groups. Kill any matching stale workers before
|
||||
// marking the task failed so the next GPU test does not inherit a
|
||||
// busy DCGM slot or duplicate workers.
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
_ = platform.KillTestWorkers()
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
|
||||
@@ -366,7 +366,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
|
||||
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -398,14 +398,14 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
}
|
||||
task := &Task{
|
||||
ID: "task-bench",
|
||||
Name: "NVIDIA Benchmark",
|
||||
Target: "nvidia-benchmark",
|
||||
Name: "NVIDIA Bee Bench Perf",
|
||||
Target: "nvidia-bench-perf",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||
ArtifactsDir: artifactsDir,
|
||||
}
|
||||
ensureTaskReportPaths(task)
|
||||
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
||||
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
|
||||
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -420,7 +420,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
}
|
||||
html := string(body)
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Perf Results`,
|
||||
`Composite score for this benchmark task.`,
|
||||
`GPU 0`,
|
||||
`1176.25`,
|
||||
@@ -672,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
|
||||
var gotSizeMB, gotPasses int
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "mem-validate-1",
|
||||
Name: "Memory SAT",
|
||||
Target: "memory",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{StressMode: true},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runMemoryAcceptancePackCtx
|
||||
runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
|
||||
gotSizeMB = sizeMB
|
||||
gotPasses = passes
|
||||
return "/tmp/memory-validate.tar.gz", nil
|
||||
}
|
||||
defer func() { runMemoryAcceptancePackCtx = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotSizeMB != 512 || gotPasses != 1 {
|
||||
t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
|
||||
@@ -1,5 +1,34 @@
|
||||
# Benchmark clock calibration research
|
||||
|
||||
## Benchmark methodology versioning
|
||||
|
||||
Every benchmark methodology change must bump the benchmark version constant in
|
||||
source code by exactly `+1`.
|
||||
|
||||
Methodology change means any change that affects comparability of benchmark
|
||||
results, including for example:
|
||||
- phase durations or phase order
|
||||
- enabled/disabled precisions
|
||||
- fallback rules
|
||||
- normalization rules
|
||||
- score formulas or weights
|
||||
- degradation thresholds
|
||||
- power calibration logic
|
||||
- thermal/power penalty logic
|
||||
|
||||
Requirements:
|
||||
- benchmark version must be stored in source code as an explicit version
|
||||
constant, not inferred from git tag or build metadata
|
||||
- benchmark report must always print the benchmark version
|
||||
- `result.json` must always include the benchmark version
|
||||
- results from different benchmark versions must be treated as non-comparable by
|
||||
default
|
||||
|
||||
Purpose:
|
||||
- prevent accidental comparison of runs produced by different methodologies
|
||||
- make historical benchmark archives self-describing even when detached from git
|
||||
- force deliberate version bumps whenever scoring or execution semantics change
|
||||
|
||||
## Status
|
||||
In progress. Baseline data from production servers pending.
|
||||
|
||||
|
||||
@@ -15,6 +15,41 @@ This applies to:
|
||||
- `iso/builder/config/package-lists/*.list.chroot`
|
||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||
|
||||
## Bootloader sync rule
|
||||
|
||||
The ISO has two independent bootloader configs that must be kept in sync manually:
|
||||
|
||||
| File | Used by |
|
||||
|------|---------|
|
||||
| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
|
||||
| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
|
||||
|
||||
live-build does NOT derive one from the other. Any new boot entry, kernel parameter
|
||||
change, or new mode added to one file must be manually mirrored in the other.
|
||||
|
||||
**Canonical entry list** (both files must have all of these):
|
||||
|
||||
| Label | Key params |
|
||||
|-------|-----------|
|
||||
| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
|
||||
| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
|
||||
| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
|
||||
| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
|
||||
| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
|
||||
| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
|
||||
|
||||
**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
|
||||
```
|
||||
net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
|
||||
numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
nowatchdog nosoftlockup
|
||||
```
|
||||
(fail-safe is the exception — it deliberately uses minimal params.)
|
||||
|
||||
**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
|
||||
live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
|
||||
configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
|
||||
|
||||
## Memtest rule
|
||||
|
||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
DEBIAN_VERSION=12
|
||||
DEBIAN_KERNEL_ABI=auto
|
||||
NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUBLAS_VERSION=13.1.1.3-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.3-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=1.0.0
|
||||
MEMTEST_VERSION=6.10-4
|
||||
|
||||
@@ -23,9 +23,9 @@ lb config noauto \
|
||||
--bootloaders "grub-efi,syslinux" \
|
||||
--debian-installer none \
|
||||
--archive-areas "main contrib non-free non-free-firmware" \
|
||||
--mirror-bootstrap "https://deb.debian.org/debian" \
|
||||
--mirror-chroot "https://deb.debian.org/debian" \
|
||||
--mirror-binary "https://deb.debian.org/debian" \
|
||||
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
|
||||
--mirror-chroot "http://mirror.mephi.ru/debian/" \
|
||||
--mirror-binary "http://mirror.mephi.ru/debian/" \
|
||||
--security true \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
@@ -33,6 +33,7 @@ lb config noauto \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--debootstrap-options "--include=ca-certificates" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
@@ -35,6 +35,8 @@ typedef void *CUstream;
|
||||
#define MAX_STRESS_STREAMS 16
|
||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||
#define MAX_SINGLE_PRECISION_STREAMS 4
|
||||
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
|
||||
|
||||
static const char *ptx_source =
|
||||
".version 6.0\n"
|
||||
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
|
||||
return stream_count;
|
||||
}
|
||||
|
||||
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
|
||||
if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
|
||||
return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
return profile_budget_bytes;
|
||||
}
|
||||
|
||||
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||
if (!api->cuStreamDestroy) {
|
||||
return;
|
||||
@@ -642,6 +651,20 @@ static const struct profile_desc k_profiles[] = {
|
||||
CUDA_R_16F,
|
||||
CUBLAS_COMPUTE_32F_FAST_16F,
|
||||
},
|
||||
{
|
||||
"int8_tensor",
|
||||
"int8",
|
||||
75,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
128,
|
||||
CUDA_R_8I,
|
||||
CUDA_R_8I,
|
||||
CUDA_R_32I,
|
||||
CUDA_R_32I,
|
||||
CUBLAS_COMPUTE_32I,
|
||||
},
|
||||
{
|
||||
"fp8_e4m3",
|
||||
"fp8",
|
||||
@@ -690,6 +713,19 @@ static const struct profile_desc k_profiles[] = {
|
||||
|
||||
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
|
||||
|
||||
static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
|
||||
if (!(desc->enabled && cc >= desc->min_cc)) {
|
||||
return 0;
|
||||
}
|
||||
if (precision_filter != NULL) {
|
||||
return strcmp(desc->block_label, precision_filter) == 0;
|
||||
}
|
||||
/* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are
|
||||
* unstable on the current benchmark fleet and can abort the whole mixed
|
||||
* pass after earlier phases already collected useful telemetry. */
|
||||
return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
|
||||
}
|
||||
|
||||
static int load_cublaslt(struct cublaslt_api *api) {
|
||||
memset(api, 0, sizeof(*api));
|
||||
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
||||
@@ -760,10 +796,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
|
||||
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||
switch (type) {
|
||||
case CUDA_R_32F:
|
||||
case CUDA_R_32I:
|
||||
return (size_t)(elements * 4u);
|
||||
case CUDA_R_16F:
|
||||
case CUDA_R_16BF:
|
||||
return (size_t)(elements * 2u);
|
||||
case CUDA_R_8I:
|
||||
case CUDA_R_8F_E4M3:
|
||||
case CUDA_R_8F_E5M2:
|
||||
return (size_t)(elements);
|
||||
@@ -776,6 +814,16 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||
}
|
||||
}
|
||||
|
||||
static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
|
||||
if (desc->compute_type == CUBLAS_COMPUTE_32I) {
|
||||
return CUDA_R_32I;
|
||||
}
|
||||
if (desc->compute_type == CUBLAS_COMPUTE_64F) {
|
||||
return CUDA_R_64F;
|
||||
}
|
||||
return CUDA_R_32F;
|
||||
}
|
||||
|
||||
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
||||
uint64_t row_tiles = (rows + 127u) / 128u;
|
||||
uint64_t col_tiles = (cols + 63u) / 64u;
|
||||
@@ -882,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
CUstream stream,
|
||||
size_t profile_budget_bytes,
|
||||
struct prepared_profile *out) {
|
||||
memset(out, 0, sizeof(*out));
|
||||
out->desc = *desc;
|
||||
out->stream = stream;
|
||||
|
||||
size_t bytes_per_cell = 0;
|
||||
size_t attempt_budget = profile_budget_bytes;
|
||||
|
||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
||||
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
||||
@@ -895,105 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
|
||||
out->m = dim;
|
||||
out->n = dim;
|
||||
out->k = dim;
|
||||
while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
|
||||
memset(out, 0, sizeof(*out));
|
||||
out->desc = *desc;
|
||||
out->stream = stream;
|
||||
|
||||
size_t desired_workspace = profile_budget_bytes / 8u;
|
||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||
desired_workspace = 32u * 1024u * 1024u;
|
||||
}
|
||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
|
||||
out->m = dim;
|
||||
out->n = dim;
|
||||
out->k = dim;
|
||||
|
||||
size_t a_bytes = 0;
|
||||
size_t b_bytes = 0;
|
||||
size_t c_bytes = 0;
|
||||
size_t d_bytes = 0;
|
||||
size_t scale_bytes = 0;
|
||||
while (1) {
|
||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||
size_t desired_workspace = attempt_budget / 8u;
|
||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||
desired_workspace = 32u * 1024u * 1024u;
|
||||
}
|
||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||
|
||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||
if (matrix_bytes <= profile_budget_bytes) {
|
||||
size_t remaining = profile_budget_bytes - matrix_bytes;
|
||||
out->workspace_size = desired_workspace;
|
||||
if (out->workspace_size > remaining) {
|
||||
out->workspace_size = round_down_size(remaining, 256u);
|
||||
size_t a_bytes = 0;
|
||||
size_t b_bytes = 0;
|
||||
size_t c_bytes = 0;
|
||||
size_t d_bytes = 0;
|
||||
size_t scale_bytes = 0;
|
||||
while (1) {
|
||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||
|
||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||
if (matrix_bytes <= attempt_budget) {
|
||||
size_t remaining = attempt_budget - matrix_bytes;
|
||||
out->workspace_size = desired_workspace;
|
||||
if (out->workspace_size > remaining) {
|
||||
out->workspace_size = round_down_size(remaining, 256u);
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||
break;
|
||||
}
|
||||
out->m -= (uint64_t)desc->min_multiple;
|
||||
out->n = out->m;
|
||||
out->k = out->m;
|
||||
}
|
||||
if (out->m < (uint64_t)desc->min_multiple) {
|
||||
attempt_budget /= 2u;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||
return 0;
|
||||
}
|
||||
out->m -= (uint64_t)desc->min_multiple;
|
||||
out->n = out->m;
|
||||
out->k = out->m;
|
||||
}
|
||||
|
||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
cublasOperation_t transa = CUBLAS_OP_T;
|
||||
cublasOperation_t transb = CUBLAS_OP_N;
|
||||
if (!check_cublas("set TRANSA",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||
&transa,
|
||||
sizeof(transa))) ||
|
||||
!check_cublas("set TRANSB",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||
&transb,
|
||||
sizeof(transb)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (desc->needs_scalar_scale) {
|
||||
float one = 1.0f;
|
||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||
|
||||
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||
if (!check_cublas("set A scale ptr",
|
||||
|
||||
cublasOperation_t transa = CUBLAS_OP_T;
|
||||
cublasOperation_t transb = CUBLAS_OP_N;
|
||||
if (!check_cublas("set TRANSA",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||
&a_scale_ptr,
|
||||
sizeof(a_scale_ptr))) ||
|
||||
!check_cublas("set B scale ptr",
|
||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||
&transa,
|
||||
sizeof(transa))) ||
|
||||
!check_cublas("set TRANSB",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||
&b_scale_ptr,
|
||||
sizeof(b_scale_ptr)))) {
|
||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||
&transb,
|
||||
sizeof(transb)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (desc->needs_scalar_scale) {
|
||||
float one = 1.0f;
|
||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||
if (!check_cublas("set A scale ptr",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||
&a_scale_ptr,
|
||||
sizeof(a_scale_ptr))) ||
|
||||
!check_cublas("set B scale ptr",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||
&b_scale_ptr,
|
||||
sizeof(b_scale_ptr)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
||||
if (desc->needs_block_scale) {
|
||||
@@ -1033,78 +1089,94 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!check_cublas("create A layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||
!check_cublas("create B layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||
!check_cublas("create C layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||
!check_cublas("create D layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (out->workspace_size > 0) {
|
||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||
if (!check_cublas("create A layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||
!check_cublas("create B layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||
!check_cublas("create C layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||
!check_cublas("create D layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (out->workspace_size > 0) {
|
||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (!check_cublas("set workspace",
|
||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||
out->preference,
|
||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||
&out->workspace_size,
|
||||
sizeof(out->workspace_size)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int found = 0;
|
||||
if (check_cublas("heuristic",
|
||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||
out->op_desc,
|
||||
out->a_layout,
|
||||
out->b_layout,
|
||||
out->c_layout,
|
||||
out->d_layout,
|
||||
out->preference,
|
||||
1,
|
||||
&out->heuristic,
|
||||
&found)) &&
|
||||
found > 0) {
|
||||
out->ready = 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
destroy_profile(cublas, cuda, out);
|
||||
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
|
||||
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!check_cublas("set workspace",
|
||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||
out->preference,
|
||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||
&out->workspace_size,
|
||||
sizeof(out->workspace_size)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int found = 0;
|
||||
if (!check_cublas("heuristic",
|
||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||
out->op_desc,
|
||||
out->a_layout,
|
||||
out->b_layout,
|
||||
out->c_layout,
|
||||
out->d_layout,
|
||||
out->preference,
|
||||
1,
|
||||
&out->heuristic,
|
||||
&found))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
if (found <= 0) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
out->ready = 1;
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||
struct cublaslt_api *cublas,
|
||||
struct prepared_profile *profile) {
|
||||
int32_t alpha_i32 = 1;
|
||||
int32_t beta_i32 = 0;
|
||||
double alpha_f64 = 1.0;
|
||||
double beta_f64 = 0.0;
|
||||
float alpha = 1.0f;
|
||||
float beta = 0.0f;
|
||||
const void *alpha_ptr = α
|
||||
const void *beta_ptr = β
|
||||
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
|
||||
alpha_ptr = &alpha_i32;
|
||||
beta_ptr = &beta_i32;
|
||||
} else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
|
||||
alpha_ptr = &alpha_f64;
|
||||
beta_ptr = &beta_f64;
|
||||
}
|
||||
return check_cublas(profile->desc.name,
|
||||
cublas->cublasLtMatmul(handle,
|
||||
profile->op_desc,
|
||||
&alpha,
|
||||
alpha_ptr,
|
||||
(const void *)(uintptr_t)profile->a_dev,
|
||||
profile->a_layout,
|
||||
(const void *)(uintptr_t)profile->b_dev,
|
||||
profile->b_layout,
|
||||
&beta,
|
||||
beta_ptr,
|
||||
(const void *)(uintptr_t)profile->c_dev,
|
||||
profile->c_layout,
|
||||
(void *)(uintptr_t)profile->d_dev,
|
||||
@@ -1140,6 +1212,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
size_t requested_budget = 0;
|
||||
size_t total_budget = 0;
|
||||
size_t per_profile_budget = 0;
|
||||
int budget_profiles = 0;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||
@@ -1160,9 +1233,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count profiles matching the filter (for deciding what to run). */
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
|
||||
(precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
|
||||
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||
planned++;
|
||||
}
|
||||
}
|
||||
@@ -1173,18 +1246,42 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count all profiles active on this GPU regardless of filter.
|
||||
* Mixed phases still divide budget across the full precision set, while
|
||||
* single-precision benchmark phases dedicate budget only to active
|
||||
* profiles matching precision_filter. */
|
||||
int planned_total = 0;
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||
planned_total++;
|
||||
}
|
||||
}
|
||||
if (planned_total < planned) {
|
||||
planned_total = planned;
|
||||
}
|
||||
budget_profiles = planned_total;
|
||||
if (precision_filter != NULL) {
|
||||
budget_profiles = planned;
|
||||
}
|
||||
if (budget_profiles <= 0) {
|
||||
budget_profiles = planned_total;
|
||||
}
|
||||
|
||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||
cuda->cuStreamCreate &&
|
||||
cuda->cuStreamDestroy) {
|
||||
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
||||
stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
|
||||
}
|
||||
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
|
||||
stream_count = MAX_SINGLE_PRECISION_STREAMS;
|
||||
}
|
||||
if (stream_count > 1) {
|
||||
int created = 0;
|
||||
@@ -1197,18 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
}
|
||||
report->stream_count = stream_count;
|
||||
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
||||
per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
|
||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (precision_filter != NULL) {
|
||||
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
|
||||
}
|
||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
mp_count,
|
||||
budget_profiles,
|
||||
per_profile_budget / (1024u * 1024u));
|
||||
|
||||
for (int i = 0; i < profile_count; i++) {
|
||||
@@ -1221,10 +1322,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
desc->min_cc);
|
||||
continue;
|
||||
}
|
||||
if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
|
||||
if (!profile_allowed_for_run(desc, cc, precision_filter)) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=SKIPPED precision_filter\n",
|
||||
"%s=SKIPPED benchmark_disabled\n",
|
||||
desc->name);
|
||||
continue;
|
||||
}
|
||||
@@ -1345,11 +1446,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
#endif
|
||||
|
||||
static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
|
||||
printf("device=%s\n", report->device);
|
||||
printf("device_index=%d\n", device_index);
|
||||
printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
|
||||
printf("backend=%s\n", report->backend);
|
||||
printf("duration_s=%d\n", seconds);
|
||||
printf("buffer_mb=%d\n", report->buffer_mb);
|
||||
printf("streams=%d\n", report->stream_count);
|
||||
printf("iterations=%lu\n", report->iterations);
|
||||
printf("checksum=%llu\n", (unsigned long long)report->checksum);
|
||||
if (report->details[0] != '\0') {
|
||||
printf("%s", report->details);
|
||||
}
|
||||
printf("status=OK\n");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int seconds = 5;
|
||||
int size_mb = 64;
|
||||
int device_index = 0;
|
||||
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
|
||||
const char *precision_plan = NULL;
|
||||
const char *precision_plan_seconds = NULL;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||
seconds = atoi(argv[++i]);
|
||||
@@ -1359,9 +1478,13 @@ int main(int argc, char **argv) {
|
||||
device_index = atoi(argv[++i]);
|
||||
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
|
||||
precision_filter = argv[++i];
|
||||
} else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
|
||||
precision_plan = argv[++i];
|
||||
} else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
|
||||
precision_plan_seconds = argv[++i];
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n",
|
||||
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
|
||||
argv[0]);
|
||||
return 2;
|
||||
}
|
||||
@@ -1422,26 +1545,94 @@ int main(int argc, char **argv) {
|
||||
int ok = 0;
|
||||
|
||||
#if HAVE_CUBLASLT_HEADERS
|
||||
if (precision_plan != NULL && precision_plan[0] != '\0') {
|
||||
char *plan_copy = strdup(precision_plan);
|
||||
char *plan_seconds_copy = NULL;
|
||||
int phase_seconds[32] = {0};
|
||||
int phase_seconds_count = 0;
|
||||
int phase_ok = 0;
|
||||
if (plan_copy == NULL) {
|
||||
fprintf(stderr, "failed to allocate precision plan buffer\n");
|
||||
return 1;
|
||||
}
|
||||
if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
|
||||
plan_seconds_copy = strdup(precision_plan_seconds);
|
||||
if (plan_seconds_copy == NULL) {
|
||||
free(plan_copy);
|
||||
fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
|
||||
return 1;
|
||||
}
|
||||
for (char *sec_token = strtok(plan_seconds_copy, ",");
|
||||
sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
|
||||
sec_token = strtok(NULL, ",")) {
|
||||
while (*sec_token == ' ' || *sec_token == '\t') {
|
||||
sec_token++;
|
||||
}
|
||||
if (*sec_token == '\0') {
|
||||
continue;
|
||||
}
|
||||
phase_seconds[phase_seconds_count++] = atoi(sec_token);
|
||||
}
|
||||
}
|
||||
int phase_idx = 0;
|
||||
for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
|
||||
while (*token == ' ' || *token == '\t') {
|
||||
token++;
|
||||
}
|
||||
if (*token == '\0') {
|
||||
continue;
|
||||
}
|
||||
const char *phase_name = token;
|
||||
const char *phase_filter = token;
|
||||
if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
|
||||
phase_filter = NULL;
|
||||
}
|
||||
int phase_duration = seconds;
|
||||
if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
|
||||
phase_duration = phase_seconds[phase_idx];
|
||||
}
|
||||
printf("phase_begin=%s\n", phase_name);
|
||||
fflush(stdout);
|
||||
memset(&report, 0, sizeof(report));
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
|
||||
if (ok) {
|
||||
print_stress_report(&report, device_index, phase_duration);
|
||||
phase_ok = 1;
|
||||
} else {
|
||||
printf("phase_error=%s\n", phase_name);
|
||||
if (report.details[0] != '\0') {
|
||||
printf("%s", report.details);
|
||||
if (report.details[strlen(report.details) - 1] != '\n') {
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
printf("status=FAILED\n");
|
||||
}
|
||||
printf("phase_end=%s\n", phase_name);
|
||||
fflush(stdout);
|
||||
}
|
||||
free(plan_seconds_copy);
|
||||
free(plan_copy);
|
||||
return phase_ok ? 0 : 1;
|
||||
}
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
||||
#endif
|
||||
if (!ok) {
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
|
||||
if (precision_filter != NULL) {
|
||||
fprintf(stderr,
|
||||
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
|
||||
precision_filter,
|
||||
name,
|
||||
cc_major,
|
||||
cc_minor);
|
||||
return 1;
|
||||
}
|
||||
int ptx_mb = size_mb;
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("device=%s\n", report.device);
|
||||
printf("device_index=%d\n", device_index);
|
||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
||||
printf("backend=%s\n", report.backend);
|
||||
printf("duration_s=%d\n", seconds);
|
||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
||||
printf("streams=%d\n", report.stream_count);
|
||||
printf("iterations=%lu\n", report.iterations);
|
||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
||||
if (report.details[0] != '\0') {
|
||||
printf("%s", report.details);
|
||||
}
|
||||
printf("status=OK\n");
|
||||
print_stress_report(&report, device_index, seconds);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -161,6 +161,7 @@ run_variant() {
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-e BEE_REQUIRE_MEMTEST=1 \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||
@@ -175,6 +176,7 @@ run_variant() {
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-e BEE_REQUIRE_MEMTEST=1 \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||
|
||||
@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
export MEMTEST_VERSION
|
||||
export PATH="$PATH:/usr/local/go/bin"
|
||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||
|
||||
@@ -775,6 +776,7 @@ run_optional_step_sh() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
mkdir -p "${LOG_DIR}" 2>/dev/null || true
|
||||
step_log="${LOG_DIR}/${step_slug}.log"
|
||||
echo ""
|
||||
echo "=== optional step: ${step_name} ==="
|
||||
@@ -798,13 +800,14 @@ start_build_log
|
||||
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
||||
echo "=== refreshing apt index to detect current kernel ABI ==="
|
||||
apt-get update -qq
|
||||
apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
|
||||
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
||||
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
||||
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
||||
| head -1)
|
||||
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
||||
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
||||
echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
||||
@@ -873,6 +876,22 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||
fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||
if [ -n "$fp4_type_match" ]; then
|
||||
echo "fp4_header_symbol=present"
|
||||
echo "$fp4_type_match"
|
||||
else
|
||||
echo "fp4_header_symbol=missing"
|
||||
fi
|
||||
if [ -n "$fp4_scale_match" ]; then
|
||||
echo "fp4_scale_mode_symbol=present"
|
||||
echo "$fp4_scale_match"
|
||||
else
|
||||
echo "fp4_scale_mode_symbol=missing"
|
||||
fi
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
@@ -901,6 +920,12 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
else
|
||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||
fi
|
||||
echo "=== bee-gpu-burn compiled profile probe ==="
|
||||
if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
|
||||
echo "fp4_profile_string=present"
|
||||
else
|
||||
echo "fp4_profile_string=missing"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||
@@ -1237,6 +1262,7 @@ fi
|
||||
# --- substitute version placeholders in package list and archive ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
sed -i \
|
||||
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
@@ -1279,7 +1305,7 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||
export BEE_GPU_VENDOR_UPPER
|
||||
|
||||
cd "${LB_DIR}"
|
||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||
|
||||
@@ -16,6 +16,11 @@ menuentry "EASY-BEE" {
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — load to RAM (toram)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
@@ -26,6 +31,11 @@ submenu "EASY-BEE (advanced options) -->" {
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — KMS + GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — fail-safe" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
|
||||
menu default
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^graphics/KMS)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE (^load to RAM)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-gsp-off
|
||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-kms-gsp-off
|
||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^KMS, no nomodeset)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-kms-gsp-off
|
||||
menu label EASY-BEE (KMS, ^GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
|
||||
@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||
# Enable GPU-vendor specific services
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||
systemctl enable bee-nvidia.service
|
||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||
@@ -62,8 +63,10 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
#!/bin/sh
|
||||
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
||||
set -e
|
||||
echo "=== generating bee wallpaper ==="
|
||||
mkdir -p /usr/share/bee
|
||||
|
||||
python3 - <<'PYEOF'
|
||||
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||
import os
|
||||
|
||||
W, H = 1920, 1080
|
||||
|
||||
ASCII_ART = [
|
||||
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
||||
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
||||
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
||||
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
||||
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
||||
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
||||
]
|
||||
SUBTITLE = " Hardware Audit LiveCD"
|
||||
|
||||
FG = (0xF6, 0xD0, 0x47)
|
||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||
SHADOW = (0x5E, 0x47, 0x05)
|
||||
SUB = (0x96, 0x7A, 0x17)
|
||||
BG = (0x05, 0x05, 0x05)
|
||||
|
||||
MONO_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
||||
]
|
||||
SUB_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
||||
]
|
||||
|
||||
|
||||
def load_font(candidates, size):
|
||||
for path in candidates:
|
||||
if os.path.exists(path):
|
||||
return ImageFont.truetype(path, size)
|
||||
return ImageFont.load_default()
|
||||
|
||||
|
||||
def mono_metrics(font):
|
||||
probe = Image.new('L', (W, H), 0)
|
||||
draw = ImageDraw.Draw(probe)
|
||||
char_w = int(round(draw.textlength("M", font=font)))
|
||||
bb = draw.textbbox((0, 0), "Mg", font=font)
|
||||
char_h = bb[3] - bb[1]
|
||||
return char_w, char_h
|
||||
|
||||
|
||||
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
|
||||
width = max(len(line) for line in lines) * char_w
|
||||
height = len(lines) * char_h + line_gap * (len(lines) - 1)
|
||||
mask = Image.new('L', (width, height), 0)
|
||||
draw = ImageDraw.Draw(mask)
|
||||
for row, line in enumerate(lines):
|
||||
y = row * (char_h + line_gap)
|
||||
for col, ch in enumerate(line):
|
||||
if ch == ' ':
|
||||
continue
|
||||
x = col * char_w
|
||||
draw.text((x, y), ch, font=font, fill=255)
|
||||
return mask
|
||||
|
||||
|
||||
img = Image.new('RGB', (W, H), BG)
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Soft amber glow under the logo without depending on font rendering.
|
||||
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
||||
glow_draw = ImageDraw.Draw(glow)
|
||||
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
||||
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||
|
||||
TARGET_LOGO_W = 400
|
||||
max_chars = max(len(line) for line in ASCII_ART)
|
||||
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
||||
_probe_cw, _ = mono_metrics(_probe_font)
|
||||
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
||||
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
||||
char_w, char_h = mono_metrics(font_logo)
|
||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
||||
logo_w, logo_h = logo_mask.size
|
||||
logo_x = (W - logo_w) // 2
|
||||
logo_y = 380
|
||||
|
||||
sh_off = max(1, font_size_logo // 6)
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
||||
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||
|
||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||
sub_y = logo_y + logo_h + 48
|
||||
draw = ImageDraw.Draw(img)
|
||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||
|
||||
img = img.convert('RGB')
|
||||
|
||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||
PYEOF
|
||||
|
||||
echo "=== wallpaper done ==="
|
||||
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/sh
|
||||
# 9011-toram-rsync.hook.chroot
|
||||
#
|
||||
# Adds rsync to the initramfs so that live-boot's toram code takes the
|
||||
# rsync --progress path instead of the silent "cp -a" fallback.
|
||||
#
|
||||
# live-boot's 9990-toram-todisk.sh already contains:
|
||||
# if [ -x /bin/rsync ]; then
|
||||
# rsync -a --progress ... 1>/dev/console
|
||||
# else
|
||||
# cp -a ... # no output
|
||||
# fi
|
||||
#
|
||||
# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
|
||||
# which copies the binary + all shared-library dependencies into the initrd.
|
||||
|
||||
set -e
|
||||
|
||||
HOOK_DIR="/etc/initramfs-tools/hooks"
|
||||
HOOK="${HOOK_DIR}/bee-rsync"
|
||||
|
||||
mkdir -p "${HOOK_DIR}"
|
||||
|
||||
cat > "${HOOK}" << 'EOF'
|
||||
#!/bin/sh
|
||||
# initramfs hook: include rsync for live-boot toram progress output
|
||||
PREREQ=""
|
||||
prereqs() { echo "$PREREQ"; }
|
||||
case "$1" in prereqs) prereqs; exit 0 ;; esac
|
||||
|
||||
. /usr/share/initramfs-tools/hook-functions
|
||||
|
||||
if [ -x /usr/bin/rsync ]; then
|
||||
copy_exec /usr/bin/rsync /bin
|
||||
fi
|
||||
EOF
|
||||
|
||||
chmod +x "${HOOK}"
|
||||
|
||||
echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"
|
||||
|
||||
# Rebuild initramfs so the hook takes effect in the ISO's initrd.img
|
||||
KVER=$(ls /lib/modules | sort -V | tail -1)
|
||||
echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
|
||||
update-initramfs -u -k "${KVER}"
|
||||
echo "9011-toram-rsync: done"
|
||||
@@ -5,6 +5,8 @@ set -e
|
||||
|
||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||
|
||||
# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
|
||||
# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
|
||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||
BINARY_BOOT_DIR="binary/boot"
|
||||
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||
@@ -24,15 +26,23 @@ fail_or_warn() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
|
||||
# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
|
||||
# The template already has memtest entries hardcoded, so a missing config file
|
||||
# here is not an error; validate_iso_memtest() checks the final ISO instead.
|
||||
warn_only() {
|
||||
log "WARNING: $1"
|
||||
}
|
||||
|
||||
copy_memtest_file() {
|
||||
src="$1"
|
||||
base="$(basename "$src")"
|
||||
dst="${BINARY_BOOT_DIR}/${base}"
|
||||
dst_name="${2:-$(basename "$src")}"
|
||||
dst="${BINARY_BOOT_DIR}/${dst_name}"
|
||||
|
||||
[ -f "$src" ] || return 1
|
||||
mkdir -p "${BINARY_BOOT_DIR}"
|
||||
cp "$src" "$dst"
|
||||
log "copied ${base} from ${src}"
|
||||
log "copied ${dst_name} from ${src}"
|
||||
}
|
||||
|
||||
extract_memtest_from_deb() {
|
||||
@@ -41,14 +51,44 @@ extract_memtest_from_deb() {
|
||||
|
||||
log "extracting memtest payload from ${deb}"
|
||||
dpkg-deb -x "$deb" "$tmpdir"
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
if [ -f "${tmpdir}/boot/${f}" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/${f}"
|
||||
fi
|
||||
done
|
||||
|
||||
# EFI binary: both 5.x and 6.x use memtest86+x64.efi
|
||||
if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
|
||||
fi
|
||||
|
||||
# BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
|
||||
if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
|
||||
elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
|
||||
fi
|
||||
|
||||
rm -rf "$tmpdir"
|
||||
}
|
||||
|
||||
download_and_extract_memtest() {
|
||||
tmpdl="$(mktemp -d)"
|
||||
if [ -n "${MEMTEST_VERSION:-}" ]; then
|
||||
pkg_spec="memtest86+=${MEMTEST_VERSION}"
|
||||
else
|
||||
pkg_spec="memtest86+"
|
||||
fi
|
||||
log "downloading ${pkg_spec} from apt"
|
||||
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
|
||||
log "apt download failed, retrying after apt-get update"
|
||||
apt-get update -qq >/dev/null 2>&1 || true
|
||||
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
|
||||
fi
|
||||
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||
if [ -n "$deb" ]; then
|
||||
extract_memtest_from_deb "$deb"
|
||||
else
|
||||
log "apt download of memtest86+ failed"
|
||||
fi
|
||||
rm -rf "$tmpdl"
|
||||
}
|
||||
|
||||
ensure_memtest_binaries() {
|
||||
missing=0
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
|
||||
done
|
||||
[ "$missing" -eq 1 ] || return 0
|
||||
|
||||
# 1. Try files already placed by lb binary_memtest or chroot
|
||||
for root in chroot/boot /boot; do
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
||||
done
|
||||
# 6.x BIOS binary may lack x64 in name — copy with normalised name
|
||||
if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
|
||||
copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
|
||||
fi
|
||||
done
|
||||
|
||||
missing=0
|
||||
@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
|
||||
done
|
||||
[ "$missing" -eq 1 ] || return 0
|
||||
|
||||
# 2. Try apt package cache (may be empty if lb binary_memtest already purged)
|
||||
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
||||
[ -d "$root" ] || continue
|
||||
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
||||
@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
|
||||
break
|
||||
done
|
||||
|
||||
missing=0
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||
done
|
||||
[ "$missing" -eq 1 ] || return 0
|
||||
|
||||
# 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
|
||||
download_and_extract_memtest
|
||||
|
||||
missing=0
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
||||
@@ -88,7 +143,7 @@ ensure_memtest_binaries() {
|
||||
|
||||
ensure_grub_entry() {
|
||||
[ -f "$GRUB_CFG" ] || {
|
||||
fail_or_warn "missing ${GRUB_CFG}"
|
||||
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -114,7 +169,7 @@ EOF
|
||||
|
||||
ensure_isolinux_entry() {
|
||||
[ -f "$ISOLINUX_CFG" ] || {
|
||||
fail_or_warn "missing ${ISOLINUX_CFG}"
|
||||
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
|
||||
return 0
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||
# explicitly.
|
||||
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||
|
||||
@@ -3,6 +3,7 @@ dmidecode
|
||||
smartmontools
|
||||
nvme-cli
|
||||
pciutils
|
||||
rsync
|
||||
ipmitool
|
||||
util-linux
|
||||
e2fsprogs
|
||||
|
||||
@@ -7,10 +7,12 @@ SIZE_MB=0
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
PRECISION=""
|
||||
PRECISION_PLAN=""
|
||||
PRECISION_PLAN_SECONDS=""
|
||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision fp8|fp16|fp32|fp64|fp4]" >&2
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
@@ -32,6 +34,8 @@ while [ "$#" -gt 0 ]; do
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
--precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
|
||||
--precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;;
|
||||
--precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;;
|
||||
*) usage ;;
|
||||
esac
|
||||
done
|
||||
@@ -92,8 +96,12 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||
precision_arg=""
|
||||
[ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
|
||||
precision_plan_arg=""
|
||||
[ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}"
|
||||
precision_plan_seconds_arg=""
|
||||
[ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}"
|
||||
CUDA_VISIBLE_DEVICES="${id}" \
|
||||
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} >"${log}" 2>&1 &
|
||||
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 &
|
||||
pid=$!
|
||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||
|
||||
@@ -65,6 +65,9 @@ done
|
||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
||||
echo " The live medium may have been disconnected." >&2
|
||||
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
||||
echo " Then re-run bee-install." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -162,10 +165,59 @@ log " Mounted."
|
||||
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
||||
log " Source: $SQUASHFS"
|
||||
log " Target: $MOUNT_ROOT"
|
||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
||||
grep -E '^\[|^inod|^created|^extract' | \
|
||||
while read -r line; do log " $line"; done || true
|
||||
log " Unpack complete."
|
||||
|
||||
# unsquashfs does not support resume, so retry the entire unpack step if the
|
||||
# source medium disappears mid-copy (e.g. CD physically disconnected).
|
||||
UNPACK_ATTEMPTS=0
|
||||
UNPACK_MAX=5
|
||||
while true; do
|
||||
UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
|
||||
if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
|
||||
die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
|
||||
fi
|
||||
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
||||
|
||||
# Re-check squashfs is reachable before each attempt
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
log " SOURCE LOST: $SQUASHFS not found."
|
||||
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
||||
log " then press Enter here to retry."
|
||||
read -r _
|
||||
continue
|
||||
fi
|
||||
|
||||
# wipe partial unpack so unsquashfs starts clean
|
||||
if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
|
||||
log " Cleaning partial unpack from $MOUNT_ROOT ..."
|
||||
# keep the mount point itself but remove its contents
|
||||
find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
|
||||
fi
|
||||
|
||||
UNPACK_OK=0
|
||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
||||
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
||||
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
||||
|
||||
# Check squashfs is still reachable (gone = disc pulled during copy)
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
log " WARNING: source medium lost during unpack — will retry after remount."
|
||||
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
||||
read -r _
|
||||
continue
|
||||
fi
|
||||
|
||||
# Verify the unpack produced a usable root (presence of /etc is a basic check)
|
||||
if [ -d "${MOUNT_ROOT}/etc" ]; then
|
||||
log " Unpack complete."
|
||||
break
|
||||
else
|
||||
log " WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
|
||||
if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
|
||||
log " Retrying in 5 s ..."
|
||||
sleep 5
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
log "--- Step 6/7: Configuring installed system ---"
|
||||
|
||||
@@ -258,6 +258,22 @@ else
|
||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||
fi
|
||||
|
||||
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
|
||||
# systems CUDA/DCGM can report "system not yet initialized" until fabric
|
||||
# training completes under nvidia-fabricmanager.
|
||||
if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
|
||||
if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||
log "nvidia-fabricmanager restarted"
|
||||
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||
log "nvidia-fabricmanager started"
|
||||
else
|
||||
log "WARN: failed to start nvidia-fabricmanager.service"
|
||||
systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
|
||||
fi
|
||||
else
|
||||
log "WARN: nvidia-fabricmanager.service not installed"
|
||||
fi
|
||||
|
||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||
|
||||
@@ -9,9 +9,9 @@ xset s noblank
|
||||
|
||||
# Set desktop background.
|
||||
if [ -f /usr/share/bee/wallpaper.png ]; then
|
||||
feh --bg-fill /usr/share/bee/wallpaper.png
|
||||
feh --bg-center --image-bg '#000000' /usr/share/bee/wallpaper.png
|
||||
else
|
||||
xsetroot -solid '#f6c90e'
|
||||
xsetroot -solid '#000000'
|
||||
fi
|
||||
|
||||
tint2 &
|
||||
|
||||
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/bin/bash
|
||||
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
|
||||
#
|
||||
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
|
||||
# was lost and /run/live/medium/live/filesystem.squashfs is missing.
|
||||
#
|
||||
# Usage: bee-remount-medium [--wait]
|
||||
# --wait keep retrying every 5 seconds until the medium is found (useful
|
||||
# while physically reconnecting the device)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
MEDIUM_DIR="/run/live/medium"
|
||||
SQUASHFS_REL="live/filesystem.squashfs"
|
||||
WAIT_MODE=0
|
||||
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--wait|-w) WAIT_MODE=1 ;;
|
||||
--help|-h)
|
||||
echo "Usage: bee-remount-medium [--wait]"
|
||||
echo " Finds and remounts the live ISO medium to $MEDIUM_DIR"
|
||||
echo " --wait retry every 5 s until a medium with squashfs is found"
|
||||
exit 0 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||
die() { log "ERROR: $*" >&2; exit 1; }
|
||||
|
||||
# Return all candidate block devices (optical + removable USB mass storage)
|
||||
find_candidates() {
|
||||
# CD/DVD drives
|
||||
for dev in /dev/sr* /dev/scd*; do
|
||||
[ -b "$dev" ] && echo "$dev"
|
||||
done
|
||||
# USB/removable disks and partitions
|
||||
for dev in /dev/sd* /dev/vd*; do
|
||||
[ -b "$dev" ] || continue
|
||||
# Only whole disks or partitions — skip the same device we are running from
|
||||
local removable
|
||||
local base
|
||||
base=$(basename "$dev")
|
||||
removable=$(cat "/sys/block/${base%%[0-9]*}/removable" 2>/dev/null || echo 0)
|
||||
[ "$removable" = "1" ] && echo "$dev"
|
||||
done
|
||||
}
|
||||
|
||||
# Try to mount $1 to $MEDIUM_DIR and check for squashfs
|
||||
try_mount() {
|
||||
local dev="$1"
|
||||
local tmpdir
|
||||
tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
|
||||
if mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
|
||||
if [ -f "${tmpdir}/${SQUASHFS_REL}" ]; then
|
||||
# Unmount probe mount and mount properly onto live path
|
||||
umount "$tmpdir" 2>/dev/null || true
|
||||
rmdir "$tmpdir" 2>/dev/null || true
|
||||
# Unmount whatever is currently on MEDIUM_DIR (may be empty/stale)
|
||||
umount "$MEDIUM_DIR" 2>/dev/null || true
|
||||
mkdir -p "$MEDIUM_DIR"
|
||||
if mount -o ro "$dev" "$MEDIUM_DIR"; then
|
||||
log "Mounted $dev on $MEDIUM_DIR"
|
||||
return 0
|
||||
else
|
||||
log "Mount of $dev on $MEDIUM_DIR failed"
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
umount "$tmpdir" 2>/dev/null || true
|
||||
fi
|
||||
rmdir "$tmpdir" 2>/dev/null || true
|
||||
return 1
|
||||
}
|
||||
|
||||
attempt() {
|
||||
log "Scanning for ISO medium..."
|
||||
for dev in $(find_candidates); do
|
||||
log " Trying $dev ..."
|
||||
if try_mount "$dev"; then
|
||||
local sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
|
||||
log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
if [ "$WAIT_MODE" = "1" ]; then
|
||||
log "Waiting for live medium (press Ctrl+C to abort)..."
|
||||
while true; do
|
||||
if attempt; then
|
||||
exit 0
|
||||
fi
|
||||
log " Not found — retrying in 5 s (reconnect the disc now)"
|
||||
sleep 5
|
||||
done
|
||||
else
|
||||
attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
|
||||
fi
|
||||
BIN
iso/overlay/usr/share/bee/wallpaper.png
Normal file
BIN
iso/overlay/usr/share/bee/wallpaper.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
Reference in New Issue
Block a user