Compare commits
184 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fc9b446d2e | |||
|
|
ea68318744 | ||
|
|
518082c2e2 | ||
|
|
056dce0b98 | ||
|
|
24f2e65b6e | ||
|
|
7f27b9aa38 | ||
|
|
cf29131116 | ||
|
|
13e6324853 | ||
|
|
892ef6fb7d | ||
|
|
ce46a97975 | ||
|
|
258ecb3453 | ||
|
|
cbb0d1e522 | ||
|
|
bab941ccf1 | ||
|
|
b49c71a980 | ||
|
|
85d1acdaa3 | ||
|
|
a2d7513153 | ||
|
|
5b5d8609d3 | ||
|
|
e7442972d1 | ||
|
|
4c6daa1c5e | ||
|
|
e420888d71 | ||
|
|
8149360410 | ||
|
|
4262c5b798 | ||
|
|
b2e177af31 | ||
|
|
271dadda03 | ||
|
|
20766ccc76 | ||
|
|
966944d6d8 | ||
| ce6b1e0eb7 | |||
| 4066e842a9 | |||
| 7d2e904d14 | |||
| 2320925433 | |||
| e169a7722c | |||
| 74a3c65f64 | |||
| 884988cb2a | |||
| 963bc960ca | |||
| 4f6579e040 | |||
| dc07580adc | |||
|
|
87e78e230e | ||
|
|
805a3b277d | ||
|
|
5bc9bd7fb3 | ||
|
|
0939a647ea | ||
|
|
7640f20714 | ||
|
|
1593bf3e76 | ||
|
|
ae80d7711e | ||
|
|
ca78b9df65 | ||
|
|
5cafe63f33 | ||
|
|
b75e65bcb1 | ||
|
|
8d173175eb | ||
|
|
5cbde0448e | ||
|
|
49a09fde05 | ||
|
|
f3962422c8 | ||
|
|
ee36e3c711 | ||
|
|
cca3b21d35 | ||
|
|
75c33e073e | ||
| 7b4bcc745a | |||
| 42774d44a6 | |||
| 5dc022ddf8 | |||
| 6623e159f5 | |||
| bbd6d009f8 | |||
| 6c2b188ec9 | |||
| 14505ef24a | |||
| 4f20c9246d | |||
| eed157c2db | |||
| a2c8aea0df | |||
| b21f03cd26 | |||
| cac5b9c86e | |||
| b5d04ef045 | |||
| fcd64438ea | |||
| 0e39e7d960 | |||
|
|
58d6da0e4f | ||
|
|
7ce73e34a4 | ||
|
|
8a21809ade | ||
|
|
626763e31d | ||
|
|
0b8a2ff83f | ||
|
|
2c22b01fe3 | ||
|
|
ec89616585 | ||
|
|
c0dbbf96ad | ||
|
|
76484b123c | ||
|
|
8901596152 | ||
|
|
7c504e5056 | ||
|
|
333c44f3ba | ||
|
|
3bca821d3e | ||
|
|
3648e37a1e | ||
|
|
d109e08fab | ||
|
|
11d00b9442 | ||
|
|
6defa5ae15 | ||
|
|
c76658ed00 | ||
|
|
2163017a98 | ||
| 29179917c3 | |||
| be4b439804 | |||
| 749fc8a94d | |||
| 6112094d45 | |||
| e9a2bc9f9d | |||
|
|
7a8f884664 | ||
|
|
8bf8dfa45b | ||
|
|
6a22199aff | ||
|
|
ddb2bb5d1c | ||
|
|
aa284ae754 | ||
|
|
8512098174 | ||
|
|
6b5d22c194 | ||
|
|
a35e90a93e | ||
|
|
1ced81707f | ||
|
|
679aeb9947 | ||
|
|
647e99b697 | ||
|
|
4af997f436 | ||
|
|
6caace0cc0 | ||
|
|
5f0103635b | ||
|
|
84a2551dc0 | ||
|
|
1cfabc9230 | ||
|
|
5dc711de23 | ||
|
|
ab802719f8 | ||
|
|
a94e8007f8 | ||
| c69bf07b27 | |||
| b3cf8e3893 | |||
| 17118298bd | |||
| 65bcc9ce81 | |||
| 0cdfbc5875 | |||
| cf9b54b600 | |||
| 0bfb3fe954 | |||
| 3053cb0710 | |||
| 2038489961 | |||
| e35484013e | |||
| 2cdf034bb0 | |||
| b89580c24d | |||
| df1385d3d6 | |||
| f8cd9a7376 | |||
| d52ec67f8f | |||
| 61c7abaa80 | |||
| d60f7758ba | |||
| 52c3a24b76 | |||
| 028bb30333 | |||
| 7d64e5d215 | |||
| 51b721aeb3 | |||
| bac89bb6e5 | |||
| 7a618da1f9 | |||
| 64ae1c0ff0 | |||
| 49050ca717 | |||
| 5ba72ab315 | |||
| 63363e9629 | |||
|
|
5285c0d101 | ||
|
|
dca4afb8d0 | ||
|
|
b4280941f5 | ||
|
|
f74976ec4c | ||
|
|
18e24a9aa5 | ||
|
|
e306250da7 | ||
|
|
c5b2081ac9 | ||
| 434528083e | |||
| 30aa30cd67 | |||
| 4f76e1de21 | |||
| 3732e64a4a | |||
| 0d925299ff | |||
| a8d5e019a5 | |||
| 72ec086568 | |||
| 7a0b0934df | |||
| d8ca0dca2c | |||
| d90250f80a | |||
| 8d6eaef5de | |||
| 732bf4cbab | |||
| fa6d905a10 | |||
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 | ||
|
|
0317dc58fd | ||
|
|
1c5cb45698 | ||
|
|
090b92ca73 | ||
|
|
2dccbc010c | ||
| e84c69d360 | |||
| c80a39e7ac | |||
| a5e0261ff2 | |||
| ee422ede3c | |||
| d560b2fead | |||
| 3cf2e9c9dc | |||
| 19dbabd71d | |||
| a6a07f2626 | |||
| f87461ee4a | |||
| a636146dbd | |||
|
|
303de2df04 | ||
|
|
95124d228f | ||
|
|
54338dbae5 | ||
|
|
2be7ae6d28 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,5 +1,5 @@
|
||||
.env
|
||||
.DS_Store
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
audit/bee
|
||||
|
||||
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -63,14 +64,20 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||
return runExport(args[1:], stdout, stderr)
|
||||
case "preflight":
|
||||
return runPreflight(args[1:], stdout, stderr)
|
||||
case "install-to-ram":
|
||||
return runInstallToRAM(args[1:], stdout, stderr)
|
||||
case "support-bundle":
|
||||
return runSupportBundle(args[1:], stdout, stderr)
|
||||
case "web":
|
||||
return runWeb(args[1:], stdout, stderr)
|
||||
case "blackbox":
|
||||
return runBlackbox(args[1:], stdout, stderr)
|
||||
case "sat":
|
||||
return runSAT(args[1:], stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark(args[1:], stdout, stderr)
|
||||
case "bee-worker":
|
||||
return runBeeWorker(args[1:], stdout, stderr)
|
||||
case "version", "--version", "-version":
|
||||
fmt.Fprintln(stdout, Version)
|
||||
return 0
|
||||
@@ -85,11 +92,14 @@ func printRootUsage(w io.Writer) {
|
||||
fmt.Fprintln(w, `bee commands:
|
||||
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
||||
bee preflight --output stdout|file:<path>
|
||||
bee install-to-ram
|
||||
bee export --target <device>
|
||||
bee support-bundle --output stdout|file:<path>
|
||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||
bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
|
||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
|
||||
bee version
|
||||
bee help [command]`)
|
||||
}
|
||||
@@ -102,14 +112,20 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
return runExport([]string{"--help"}, stdout, stdout)
|
||||
case "preflight":
|
||||
return runPreflight([]string{"--help"}, stdout, stdout)
|
||||
case "install-to-ram":
|
||||
return runInstallToRAM([]string{"--help"}, stdout, stdout)
|
||||
case "support-bundle":
|
||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||
case "web":
|
||||
return runWeb([]string{"--help"}, stdout, stdout)
|
||||
case "blackbox":
|
||||
return runBlackbox([]string{"--help"}, stdout, stdout)
|
||||
case "sat":
|
||||
return runSAT([]string{"--help"}, stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||
case "bee-worker":
|
||||
return runBeeWorker([]string{"--help"}, stdout, stderr)
|
||||
case "version":
|
||||
fmt.Fprintln(stdout, "usage: bee version")
|
||||
return 0
|
||||
@@ -241,6 +257,32 @@ func runPreflight(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runInstallToRAM(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("install-to-ram", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintln(stderr, "usage: bee install-to-ram")
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
|
||||
application := app.New(platform.New())
|
||||
logLine := func(s string) { fmt.Fprintln(stdout, s) }
|
||||
if err := application.RunInstallToRAM(context.Background(), logLine); err != nil {
|
||||
slog.Error("run install-to-ram", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
@@ -335,6 +377,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBlackbox(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||
statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
|
||||
if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
|
||||
slog.Error("run blackbox", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
@@ -462,6 +531,28 @@ func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBeeWorker(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
|
||||
taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
|
||||
}
|
||||
|
||||
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
|
||||
@@ -5,22 +5,18 @@ go 1.25.0
|
||||
replace reanimator/chart => ../internal/chart
|
||||
|
||||
require (
|
||||
github.com/go-analyze/charts v0.5.26
|
||||
modernc.org/sqlite v1.48.0
|
||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
golang.org/x/image v0.24.0 // indirect
|
||||
golang.org/x/sys v0.42.0 // indirect
|
||||
modernc.org/libc v1.70.0 // indirect
|
||||
modernc.org/libc v1.72.0 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
modernc.org/sqlite v1.48.0 // indirect
|
||||
)
|
||||
|
||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
405
audit/internal/app/app_format.go
Normal file
405
audit/internal/app/app_format.go
Normal file
@@ -0,0 +1,405 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// hostnameOr returns the hostname reported by the OS, or fallback when the
// hostname cannot be read or is blank after trimming whitespace.
func hostnameOr(fallback string) string {
	name, err := os.Hostname()
	if err == nil && strings.TrimSpace(name) != "" {
		return name
	}
	return fallback
}
|
||||
|
||||
// sanitizeFilename replaces every rune other than ASCII letters, ASCII
// digits, '-', '_' and '.' with '-'. An empty input yields "unknown".
func sanitizeFilename(v string) string {
	cleaned := strings.Map(func(r rune) rune {
		isLetter := (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
		isDigit := r >= '0' && r <= '9'
		if isLetter || isDigit || r == '-' || r == '_' || r == '.' {
			return r
		}
		return '-'
	}, v)
	if cleaned == "" {
		return "unknown"
	}
	return cleaned
}
|
||||
|
||||
// bodyOr returns body with surrounding whitespace removed, or fallback when
// nothing remains after trimming.
func bodyOr(body, fallback string) string {
	if trimmed := strings.TrimSpace(body); trimmed != "" {
		return trimmed
	}
	return fallback
}
|
||||
|
||||
// trimPtr dereferences value and trims surrounding whitespace. A nil pointer
// yields the empty string.
func trimPtr(value *string) string {
	var text string
	if value != nil {
		text = strings.TrimSpace(*value)
	}
	return text
}
|
||||
|
||||
// joinSortedKeys returns the keys of values in ascending order, joined with
// "/". An empty or nil set yields the empty string.
func joinSortedKeys(values map[string]struct{}) string {
	names := make([]string, 0, len(values))
	for name := range values {
		names = append(names, name)
	}
	if len(names) == 0 {
		return ""
	}
	sort.Strings(names)
	return strings.Join(names, "/")
}
|
||||
|
||||
// humanizeMB renders a size given in MiB as a short human-readable string.
//
// Non-positive sizes yield "". Whole-GB values print without a decimal
// ("64 GB"), other sub-terabyte values print with one decimal ("1.5 GB"),
// and anything that would display as 1024 GB or more is printed in TB with
// one decimal ("1.0 TB").
func humanizeMB(totalMB int) string {
	if totalMB <= 0 {
		return ""
	}
	// With one decimal place, any value of at least 1023.95 GB rounds up to
	// "1024.0 GB". Switch to TB at that rounding boundary, rather than at
	// exactly 1024, so the GB branch never prints a four-digit value that
	// should have been shown as a terabyte.
	const tbThresholdGB = 1023.95

	gb := float64(totalMB) / 1024.0
	switch {
	case gb >= tbThresholdGB:
		return fmt.Sprintf("%.1f TB", gb/1024.0)
	case gb == float64(int64(gb)):
		return fmt.Sprintf("%.0f GB", gb)
	default:
		return fmt.Sprintf("%.1f GB", gb)
	}
}
|
||||
|
||||
// humanizeGB renders a size given in GB as a short human-readable string:
// "" for non-positive sizes, "<n> GB" below 1024, and "<x.y> TB" (1024 GB
// per TB, one decimal) from 1024 upward.
func humanizeGB(totalGB int) string {
	switch {
	case totalGB <= 0:
		return ""
	case totalGB < 1024:
		return fmt.Sprintf("%d GB", totalGB)
	default:
		return fmt.Sprintf("%.1f TB", float64(totalGB)/1024.0)
	}
}
|
||||
|
||||
// parseKeyValueSummary parses newline-separated "key=value" text into a map.
//
// Each line is trimmed and split at its first "="; key and value are trimmed
// again. Blank lines and lines without "=" are ignored. When a key repeats,
// the last occurrence wins. The returned map is never nil.
func parseKeyValueSummary(raw string) map[string]string {
	parsed := make(map[string]string)
	for _, entry := range strings.Split(raw, "\n") {
		// A blank line contains no "=", so the same check skips both blank
		// and malformed lines.
		name, val, found := strings.Cut(strings.TrimSpace(entry), "=")
		if !found {
			continue
		}
		parsed[strings.TrimSpace(name)] = strings.TrimSpace(val)
	}
	return parsed
}
|
||||
|
||||
// firstNonEmpty returns the first argument that is non-blank after trimming
// whitespace, in its trimmed form. It returns "" when every argument is
// blank or no arguments are given.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
// cleanSummaryKey strips a leading all-digit prefix and its "-" separator
// from key (for example "01-foo" becomes "foo"). Keys without a "-", keys
// starting with "-", and keys whose prefix contains any non-digit are
// returned unchanged.
func cleanSummaryKey(key string) string {
	prefix, rest, found := strings.Cut(key, "-")
	if !found || prefix == "" {
		return key
	}
	notDigit := func(r rune) bool { return r < '0' || r > '9' }
	if strings.IndexFunc(prefix, notDigit) >= 0 {
		return key
	}
	return rest
}
|
||||
|
||||
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
// Exclude Aspeed BMC VGA adapters (not compute GPUs).
|
||||
if dev.VendorID != nil && *dev.VendorID == collector.AspeedVendorID {
|
||||
return false
|
||||
}
|
||||
class := trimPtr(dev.DeviceClass)
|
||||
// AMD Instinct / Radeon compute GPUs always carry ProcessingAccelerator or DisplayController.
|
||||
// Do NOT match AMD vendor alone — CPU chipset PCIe devices share that vendor ID.
|
||||
if class == "VideoController" || class == "DisplayController" || class == "ProcessingAccelerator" {
|
||||
return true
|
||||
}
|
||||
// NVIDIA devices sometimes expose class values outside the standard GPU set.
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||
}
|
||||
|
||||
func formatSystemLine(board schema.HardwareBoard) string {
|
||||
model := strings.TrimSpace(strings.Join([]string{
|
||||
trimPtr(board.Manufacturer),
|
||||
trimPtr(board.ProductName),
|
||||
}, " "))
|
||||
serial := strings.TrimSpace(board.SerialNumber)
|
||||
switch {
|
||||
case model != "" && serial != "":
|
||||
return fmt.Sprintf("System: %s | S/N %s", model, serial)
|
||||
case model != "":
|
||||
return "System: " + model
|
||||
case serial != "":
|
||||
return "System S/N: " + serial
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func formatCPULine(cpus []schema.HardwareCPU) string {
|
||||
if len(cpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
modelCounts := map[string]int{}
|
||||
unknown := 0
|
||||
for _, cpu := range cpus {
|
||||
model := trimPtr(cpu.Model)
|
||||
if model == "" {
|
||||
unknown++
|
||||
continue
|
||||
}
|
||||
modelCounts[model]++
|
||||
}
|
||||
if len(modelCounts) == 1 && unknown == 0 {
|
||||
for model, count := range modelCounts {
|
||||
return fmt.Sprintf("CPU: %d x %s", count, model)
|
||||
}
|
||||
}
|
||||
parts := make([]string, 0, len(modelCounts)+1)
|
||||
if len(modelCounts) > 0 {
|
||||
keys := make([]string, 0, len(modelCounts))
|
||||
for key := range modelCounts {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key))
|
||||
}
|
||||
}
|
||||
if unknown > 0 {
|
||||
parts = append(parts, fmt.Sprintf("%d x unknown", unknown))
|
||||
}
|
||||
return "CPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatMemoryLine(dimms []schema.HardwareMemory) string {
|
||||
totalMB := 0
|
||||
present := 0
|
||||
types := map[string]struct{}{}
|
||||
for _, dimm := range dimms {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
if dimm.SizeMB == nil || *dimm.SizeMB <= 0 {
|
||||
continue
|
||||
}
|
||||
present++
|
||||
totalMB += *dimm.SizeMB
|
||||
if value := trimPtr(dimm.Type); value != "" {
|
||||
types[value] = struct{}{}
|
||||
}
|
||||
}
|
||||
if totalMB == 0 {
|
||||
return ""
|
||||
}
|
||||
typeText := joinSortedKeys(types)
|
||||
line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB))
|
||||
if typeText != "" {
|
||||
line += " " + typeText
|
||||
}
|
||||
if present > 0 {
|
||||
line += fmt.Sprintf(" (%d DIMMs)", present)
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatStorageLine(disks []schema.HardwareStorage) string {
|
||||
count := 0
|
||||
totalGB := 0
|
||||
for _, disk := range disks {
|
||||
if disk.Present != nil && !*disk.Present {
|
||||
continue
|
||||
}
|
||||
count++
|
||||
if disk.SizeGB != nil && *disk.SizeGB > 0 {
|
||||
totalGB += *disk.SizeGB
|
||||
}
|
||||
}
|
||||
if count == 0 {
|
||||
return ""
|
||||
}
|
||||
line := fmt.Sprintf("Storage: %d drives", count)
|
||||
if totalGB > 0 {
|
||||
line += fmt.Sprintf(" / %s", humanizeGB(totalGB))
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatGPULine(devices []schema.HardwarePCIeDevice) string {
|
||||
gpus := map[string]int{}
|
||||
for _, dev := range devices {
|
||||
if !isGPUDevice(dev) {
|
||||
continue
|
||||
}
|
||||
name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown")
|
||||
gpus[name]++
|
||||
}
|
||||
if len(gpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
keys := make([]string, 0, len(gpus))
|
||||
for key := range gpus {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
parts := make([]string, 0, len(keys))
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key))
|
||||
}
|
||||
return "GPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
||||
if list == nil {
|
||||
return ""
|
||||
}
|
||||
ifaces, err := list()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
seen := map[string]struct{}{}
|
||||
var ips []string
|
||||
for _, iface := range ifaces {
|
||||
for _, ip := range iface.IPv4 {
|
||||
ip = strings.TrimSpace(ip)
|
||||
if ip == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[ip]; ok {
|
||||
continue
|
||||
}
|
||||
seen[ip] = struct{}{}
|
||||
ips = append(ips, ip)
|
||||
}
|
||||
}
|
||||
if len(ips) == 0 {
|
||||
return ""
|
||||
}
|
||||
sort.Strings(ips)
|
||||
return "IP: " + strings.Join(ips, ", ")
|
||||
}
|
||||
|
||||
func formatSATDetail(raw string) string {
|
||||
var b strings.Builder
|
||||
kv := parseKeyValueSummary(raw)
|
||||
|
||||
if t, ok := kv["run_at_utc"]; ok {
|
||||
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||
}
|
||||
|
||||
lines := strings.Split(raw, "\n")
|
||||
var stepKeys []string
|
||||
seenStep := map[string]bool{}
|
||||
for _, line := range lines {
|
||||
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||
key := line[:idx]
|
||||
if !seenStep[key] && key != "overall" {
|
||||
seenStep[key] = true
|
||||
stepKeys = append(stepKeys, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, key := range stepKeys {
|
||||
status := kv[key+"_status"]
|
||||
display := cleanSummaryKey(key)
|
||||
switch status {
|
||||
case "OK":
|
||||
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||
case "FAILED":
|
||||
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||
case "UNSUPPORTED":
|
||||
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||
default:
|
||||
fmt.Fprintf(&b, "? %s\n", display)
|
||||
}
|
||||
}
|
||||
|
||||
if overall, ok := kv["overall_status"]; ok {
|
||||
ok2 := kv["job_ok"]
|
||||
failed := kv["job_failed"]
|
||||
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||
}
|
||||
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
func formatSATSummary(label, raw string) string {
|
||||
values := parseKeyValueSummary(raw)
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s:", label)
|
||||
if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" {
|
||||
fmt.Fprintf(&body, " %s", overall)
|
||||
}
|
||||
if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" {
|
||||
fmt.Fprintf(&body, " ok=%s", ok)
|
||||
}
|
||||
if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" {
|
||||
fmt.Fprintf(&body, " failed=%s", failed)
|
||||
}
|
||||
if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" {
|
||||
fmt.Fprintf(&body, " unsupported=%s", unsupported)
|
||||
}
|
||||
if devices := strings.TrimSpace(values["devices"]); devices != "" {
|
||||
fmt.Fprintf(&body, "\nDevices: %s", devices)
|
||||
}
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func latestSATSummaries() []string {
|
||||
patterns := []struct {
|
||||
label string
|
||||
prefix string
|
||||
}{
|
||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||
{label: "Memory SAT", prefix: "memory-"},
|
||||
{label: "Storage SAT", prefix: "storage-"},
|
||||
{label: "CPU SAT", prefix: "cpu-"},
|
||||
}
|
||||
var out []string
|
||||
for _, item := range patterns {
|
||||
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, formatSATSummary(item.label, string(raw)))
|
||||
}
|
||||
return out
|
||||
}
|
||||
76
audit/internal/app/app_install.go
Normal file
76
audit/internal/app/app_install.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||
return a.exports.ListRemovableTargets()
|
||||
}
|
||||
|
||||
func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) {
|
||||
if _, err := os.Stat(DefaultAuditJSONPath); err != nil {
|
||||
return "", err
|
||||
}
|
||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(tmpPath)
|
||||
return a.exports.ExportFileToTarget(tmpPath, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportLatestAudit(target)
|
||||
body := "Audit export failed."
|
||||
if err == nil {
|
||||
body = "Audit exported."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Audit exported to " + path
|
||||
}
|
||||
return ActionResult{Title: "Export audit", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) {
|
||||
archive, err := BuildSupportBundle(DefaultExportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(archive)
|
||||
return a.exports.ExportFileToTarget(archive, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle export failed."
|
||||
if err == nil {
|
||||
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||
return a.installer.ListInstallDisks()
|
||||
}
|
||||
|
||||
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||
}
|
||||
106
audit/internal/app/app_network.go
Normal file
106
audit/internal/app/app_network.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
|
||||
return a.network.ListInterfaces()
|
||||
}
|
||||
|
||||
func (a *App) DefaultRoute() string {
|
||||
return a.network.DefaultRoute()
|
||||
}
|
||||
|
||||
func (a *App) DHCPOne(iface string) (string, error) {
|
||||
return a.network.DHCPOne(iface)
|
||||
}
|
||||
|
||||
func (a *App) DHCPOneResult(iface string) (ActionResult, error) {
|
||||
body, err := a.network.DHCPOne(iface)
|
||||
return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) DHCPAll() (string, error) {
|
||||
return a.network.DHCPAll()
|
||||
}
|
||||
|
||||
func (a *App) DHCPAllResult() (ActionResult, error) {
|
||||
body, err := a.network.DHCPAll()
|
||||
return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||
return a.network.SetStaticIPv4(cfg)
|
||||
}
|
||||
|
||||
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||
return a.network.SetInterfaceState(iface, up)
|
||||
}
|
||||
|
||||
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||
return a.network.GetInterfaceState(iface)
|
||||
}
|
||||
|
||||
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return a.network.CaptureNetworkSnapshot()
|
||||
}
|
||||
|
||||
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||
body, err := a.network.SetStaticIPv4(cfg)
|
||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||
}
|
||||
|
||||
func (a *App) NetworkStatus() (ActionResult, error) {
|
||||
ifaces, err := a.network.ListInterfaces()
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Network status"}, err
|
||||
}
|
||||
if len(ifaces) == 0 {
|
||||
return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil
|
||||
}
|
||||
var body strings.Builder
|
||||
for _, iface := range ifaces {
|
||||
ipv4 := "(no IPv4)"
|
||||
if len(iface.IPv4) > 0 {
|
||||
ipv4 = strings.Join(iface.IPv4, ", ")
|
||||
}
|
||||
fmt.Fprintf(&body, "- %s: state=%s ip=%s\n", iface.Name, iface.State, ipv4)
|
||||
}
|
||||
if gw := a.network.DefaultRoute(); gw != "" {
|
||||
fmt.Fprintf(&body, "\nDefault route: %s\n", gw)
|
||||
}
|
||||
return ActionResult{Title: "Network status", Body: strings.TrimSpace(body.String())}, nil
|
||||
}
|
||||
|
||||
func (a *App) DefaultStaticIPv4FormFields(iface string) []string {
|
||||
return []string{
|
||||
"",
|
||||
"24",
|
||||
strings.TrimSpace(a.network.DefaultRoute()),
|
||||
"77.88.8.8 77.88.8.1 1.1.1.1 8.8.8.8",
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) ParseStaticIPv4Config(iface string, fields []string) platform.StaticIPv4Config {
|
||||
get := func(index int) string {
|
||||
if index >= 0 && index < len(fields) {
|
||||
return strings.TrimSpace(fields[index])
|
||||
}
|
||||
return ""
|
||||
}
|
||||
return platform.StaticIPv4Config{
|
||||
Interface: iface,
|
||||
Address: get(0),
|
||||
Prefix: get(1),
|
||||
Gateway: get(2),
|
||||
DNS: strings.Fields(get(3)),
|
||||
}
|
||||
}
|
||||
370
audit/internal/app/app_packs.go
Normal file
370
audit/internal/app/app_packs.go
Normal file
@@ -0,0 +1,370 @@
|
||||
package app
|
||||
|
||||
import (
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"bee/audit/internal/platform"
)
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
return a.sat.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||
out, err := a.sat.ResetNvidiaGPU(index)
|
||||
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPerfDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchAutotuneDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||
}
|
||||
|
||||
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
if logFunc != nil {
|
||||
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||
}
|
||||
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||
if err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUVendor() string {
|
||||
return a.sat.DetectGPUVendor()
|
||||
}
|
||||
|
||||
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
return a.sat.ListAMDGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||
}
|
||||
|
||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||
func formatFanStressResult(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
content := strings.TrimSpace(string(raw))
|
||||
kv := parseKeyValueSummary(content)
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(formatSATDetail(content))
|
||||
|
||||
// Append analysis section.
|
||||
var analysis []string
|
||||
if v, ok := kv["throttling_detected"]; ok {
|
||||
label := "NO"
|
||||
if v == "true" {
|
||||
label = "YES ← throttling detected during load"
|
||||
}
|
||||
analysis = append(analysis, "Throttling: "+label)
|
||||
}
|
||||
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||
analysis = append(analysis, "Fan response: "+v+"s")
|
||||
}
|
||||
|
||||
if len(analysis) > 0 {
|
||||
b.WriteString("\n\n=== Analysis ===\n")
|
||||
for _, line := range analysis {
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||
func satResultBody(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
return formatSATDetail(strings.TrimSpace(string(raw)))
|
||||
}
|
||||
67
audit/internal/app/app_services.go
Normal file
67
audit/internal/app/app_services.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListBeeServices() ([]string, error) {
|
||||
return a.services.ListBeeServices()
|
||||
}
|
||||
|
||||
func (a *App) ServiceState(name string) string {
|
||||
return a.services.ServiceState(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatus(name string) (string, error) {
|
||||
return a.services.ServiceStatus(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatusResult(name string) (ActionResult, error) {
|
||||
body, err := a.services.ServiceStatus(name)
|
||||
return ActionResult{Title: "service status: " + name, Body: bodyOr(body, "No status output.")}, err
|
||||
}
|
||||
|
||||
func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) {
|
||||
return a.services.ServiceDo(name, action)
|
||||
}
|
||||
|
||||
func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) {
|
||||
body, err := a.services.ServiceDo(name, action)
|
||||
return ActionResult{Title: "service " + string(action) + ": " + name, Body: bodyOr(body, "Action completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) TailFile(path string, lines int) string {
|
||||
return a.tools.TailFile(path, lines)
|
||||
}
|
||||
|
||||
func (a *App) CheckTools(names []string) []platform.ToolStatus {
|
||||
return a.tools.CheckTools(names)
|
||||
}
|
||||
|
||||
func (a *App) ToolCheckResult(names []string) ActionResult {
|
||||
if len(names) == 0 {
|
||||
return ActionResult{Title: "Required tools", Body: "No tools checked."}
|
||||
}
|
||||
var body strings.Builder
|
||||
for _, tool := range a.tools.CheckTools(names) {
|
||||
status := "MISSING"
|
||||
if tool.OK {
|
||||
status = "OK (" + tool.Path + ")"
|
||||
}
|
||||
fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status)
|
||||
}
|
||||
return ActionResult{Title: "Required tools", Body: strings.TrimSpace(body.String())}
|
||||
}
|
||||
|
||||
func (a *App) AuditLogTailResult() ActionResult {
|
||||
logTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40))
|
||||
jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20))
|
||||
body := strings.TrimSpace(logTail + "\n\n" + jsonTail)
|
||||
if body == "" {
|
||||
body = "No audit logs found."
|
||||
}
|
||||
return ActionResult{Title: "Audit log tail", Body: body}
|
||||
}
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -122,11 +123,14 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||
runNCCLFn func(string, []int) (string, error)
|
||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
@@ -154,6 +158,20 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPowerBenchFn != nil {
|
||||
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
|
||||
if f.runNvidiaAutotuneFn != nil {
|
||||
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaTargetedStressFn != nil {
|
||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||
@@ -279,10 +297,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNCCLFn != nil {
|
||||
return f.runNCCLFn(baseDir, gpuIndices)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotBaseDir string
|
||||
var gotGPUIndices []int
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||
gotBaseDir = baseDir
|
||||
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||
return "/tmp/nccl-tests.tar.gz", nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("RunNCCLTests error: %v", err)
|
||||
}
|
||||
if path != "/tmp/nccl-tests.tar.gz" {
|
||||
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||
}
|
||||
if gotBaseDir != "/tmp/sat" {
|
||||
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||
}
|
||||
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -767,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -794,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
var auditJSON string
|
||||
var manifest string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
@@ -810,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
}
|
||||
auditJSON = string(body)
|
||||
}
|
||||
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read manifest entry: %v", err)
|
||||
}
|
||||
manifest = string(body)
|
||||
}
|
||||
}
|
||||
|
||||
for _, want := range []string{
|
||||
@@ -853,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(manifest, "files:") {
|
||||
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
|
||||
}
|
||||
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
|
||||
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
|
||||
@@ -2,10 +2,29 @@ package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// readFileLimited loads the whole file at path into memory and returns an
// error instead of the data when the file holds more than maxBytes bytes.
// This guards against OOM on corrupted or unexpectedly large data files.
func readFileLimited(path string, maxBytes int64) ([]byte, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer file.Close()

	// Read at most one byte past the limit so an oversized file is detectable
	// without reading all of it.
	capped := &io.LimitedReader{R: file, N: maxBytes + 1}
	contents, readErr := io.ReadAll(capped)
	switch {
	case readErr != nil:
		return nil, readErr
	case int64(len(contents)) > maxBytes:
		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
	}
	return contents, nil
}
|
||||
|
||||
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||
|
||||
782
audit/internal/app/blackbox.go
Normal file
782
audit/internal/app/blackbox.go
Normal file
@@ -0,0 +1,782 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// Settings for the blackbox feature. Their use sites are outside this view,
// so the per-constant notes below are based on the names and should be
// confirmed against the code that reads them.
const (
	// Marker file name (presumably identifies an enrolled target — confirm).
	blackboxMarkerName = ".bee-blackbox"
	// Interval between discovery passes (presumably — confirm).
	blackboxDiscoverInterval = 2 * time.Second
	// Lower and upper bounds for the flush period (presumably — confirm).
	blackboxMinFlushPeriod = 1 * time.Second
	blackboxMaxFlushPeriod = 30 * time.Second
	// NOTE(review): looks like a count of fast cycles used during recovery —
	// confirm at the use site.
	blackboxRecoveryFastCount = 5
)
|
||||
|
||||
// DefaultBlackboxStatePath is the state file path RunBlackbox uses when its
// statePath argument is empty; it sits directly under DefaultExportDir.
var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
|
||||
|
||||
// Package-level function variables for command execution and the clock.
// NOTE(review): presumably kept as variables so tests can replace them —
// confirm against the test files.
var (
	// blackboxExecCommand defaults to exec.Command.
	blackboxExecCommand = exec.Command
	// blackboxNow returns the current time in UTC.
	blackboxNow = func() time.Time { return time.Now().UTC() }
)
|
||||
|
||||
// BlackboxMarker is the JSON document describing a blackbox enrollment.
// Host is omitted from the encoded form when empty.
type BlackboxMarker struct {
	Version      int    `json:"version"`
	EnrollmentID string `json:"enrollment_id"`
	CreatedAtUTC string `json:"created_at_utc"`
	Host         string `json:"host,omitempty"`
}
|
||||
|
||||
type BlackboxTargetStatus struct {
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
Device string `json:"device"`
|
||||
FS platform.RemovableTarget `json:"fs"`
|
||||
BootFolder string `json:"boot_folder"`
|
||||
Status string `json:"status"`
|
||||
LastSyncAtUTC string `json:"last_sync_at_utc,omitempty"`
|
||||
LastCycleDuration string `json:"last_cycle_duration,omitempty"`
|
||||
FlushPeriod string `json:"flush_period"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
Mountpoint string `json:"mountpoint,omitempty"`
|
||||
}
|
||||
|
||||
type BlackboxState struct {
|
||||
Status string `json:"status"`
|
||||
BootStartedAtUTC string `json:"boot_started_at_utc"`
|
||||
BootFolder string `json:"boot_folder"`
|
||||
UpdatedAtUTC string `json:"updated_at_utc"`
|
||||
Targets []BlackboxTargetStatus `json:"targets"`
|
||||
}
|
||||
|
||||
type blackboxRuntime struct {
|
||||
exportDir string
|
||||
statePath string
|
||||
system *platform.System
|
||||
bootStarted time.Time
|
||||
bootFolder string
|
||||
|
||||
mu sync.Mutex
|
||||
workers map[string]*blackboxWorker
|
||||
}
|
||||
|
||||
type discoveredBlackboxTarget struct {
|
||||
marker BlackboxMarker
|
||||
target platform.RemovableTarget
|
||||
seenMount string
|
||||
mountedByBee bool
|
||||
}
|
||||
|
||||
type blackboxWorker struct {
|
||||
runtime *blackboxRuntime
|
||||
enrollmentID string
|
||||
|
||||
mu sync.Mutex
|
||||
target platform.RemovableTarget
|
||||
marker BlackboxMarker
|
||||
mountpoint string
|
||||
mountedByBee bool
|
||||
status string
|
||||
lastSyncAt time.Time
|
||||
lastDuration time.Duration
|
||||
flushPeriod time.Duration
|
||||
lastError string
|
||||
fastCycles int
|
||||
stopCh chan struct{}
|
||||
stoppedCh chan struct{}
|
||||
}
|
||||
|
||||
func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
if exportDir == "" {
|
||||
exportDir = DefaultExportDir
|
||||
}
|
||||
statePath = strings.TrimSpace(statePath)
|
||||
if statePath == "" {
|
||||
statePath = DefaultBlackboxStatePath
|
||||
}
|
||||
if system == nil {
|
||||
system = platform.New()
|
||||
}
|
||||
bootStarted, err := bootStartedAtUTC()
|
||||
if err != nil {
|
||||
bootStarted = blackboxNow()
|
||||
}
|
||||
rt := &blackboxRuntime{
|
||||
exportDir: exportDir,
|
||||
statePath: statePath,
|
||||
system: system,
|
||||
bootStarted: bootStarted,
|
||||
bootFolder: SupportBundleBaseName(bootStarted),
|
||||
workers: make(map[string]*blackboxWorker),
|
||||
}
|
||||
_ = os.MkdirAll(filepath.Dir(statePath), 0755)
|
||||
rt.persistState()
|
||||
ticker := time.NewTicker(blackboxDiscoverInterval)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
rt.reconcile()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
rt.stopAll()
|
||||
return ctx.Err()
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func ReadBlackboxState(path string) (BlackboxState, error) {
|
||||
path = strings.TrimSpace(path)
|
||||
if path == "" {
|
||||
path = DefaultBlackboxStatePath
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return BlackboxState{}, err
|
||||
}
|
||||
var state BlackboxState
|
||||
if err := json.Unmarshal(raw, &state); err != nil {
|
||||
return BlackboxState{}, err
|
||||
}
|
||||
return state, nil
|
||||
}
|
||||
|
||||
func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
|
||||
target = sanitizeRemovableTarget(target)
|
||||
if target.Device == "" {
|
||||
return BlackboxMarker{}, fmt.Errorf("device is required")
|
||||
}
|
||||
mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
|
||||
if err != nil {
|
||||
return BlackboxMarker{}, err
|
||||
}
|
||||
defer func() {
|
||||
if mountedByBee {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
}()
|
||||
|
||||
marker, _, err := readBlackboxMarker(mountpoint)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return BlackboxMarker{}, err
|
||||
}
|
||||
if marker.EnrollmentID == "" {
|
||||
marker = BlackboxMarker{
|
||||
Version: 1,
|
||||
EnrollmentID: newBlackboxEnrollmentID(),
|
||||
CreatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||
Host: hostnameOr("unknown"),
|
||||
}
|
||||
}
|
||||
if err := writeBlackboxMarker(mountpoint, marker); err != nil {
|
||||
return BlackboxMarker{}, err
|
||||
}
|
||||
return marker, nil
|
||||
}
|
||||
|
||||
func DisableBlackboxTarget(device, enrollmentID string) error {
|
||||
device = strings.TrimSpace(device)
|
||||
enrollmentID = strings.TrimSpace(enrollmentID)
|
||||
if device == "" && enrollmentID == "" {
|
||||
return fmt.Errorf("device or enrollment_id is required")
|
||||
}
|
||||
system := platform.New()
|
||||
targets, err := system.ListRemovableTargets()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, target := range targets {
|
||||
target = sanitizeRemovableTarget(target)
|
||||
mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
|
||||
if mountErr != nil {
|
||||
continue
|
||||
}
|
||||
remove := false
|
||||
marker, _, err := readBlackboxMarker(mountpoint)
|
||||
if err == nil {
|
||||
if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
|
||||
remove = true
|
||||
}
|
||||
if device != "" && target.Device == device {
|
||||
remove = true
|
||||
}
|
||||
}
|
||||
if remove {
|
||||
err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
|
||||
}
|
||||
if mountedByBee {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
if remove {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return os.ErrNotExist
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) reconcile() {
|
||||
discovered, _ := rt.discoverMarkedTargets()
|
||||
|
||||
rt.mu.Lock()
|
||||
defer rt.mu.Unlock()
|
||||
|
||||
seen := make(map[string]struct{}, len(discovered))
|
||||
for _, found := range discovered {
|
||||
seen[found.marker.EnrollmentID] = struct{}{}
|
||||
worker, ok := rt.workers[found.marker.EnrollmentID]
|
||||
if !ok {
|
||||
worker = newBlackboxWorker(rt, found)
|
||||
rt.workers[found.marker.EnrollmentID] = worker
|
||||
go worker.run()
|
||||
continue
|
||||
}
|
||||
worker.update(found)
|
||||
}
|
||||
for id, worker := range rt.workers {
|
||||
if _, ok := seen[id]; ok {
|
||||
continue
|
||||
}
|
||||
worker.stop()
|
||||
delete(rt.workers, id)
|
||||
}
|
||||
rt.persistStateLocked()
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) stopAll() {
|
||||
rt.mu.Lock()
|
||||
workers := make([]*blackboxWorker, 0, len(rt.workers))
|
||||
for _, worker := range rt.workers {
|
||||
workers = append(workers, worker)
|
||||
}
|
||||
rt.workers = map[string]*blackboxWorker{}
|
||||
rt.persistStateLocked()
|
||||
rt.mu.Unlock()
|
||||
for _, worker := range workers {
|
||||
worker.stop()
|
||||
}
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
|
||||
targets, err := rt.system.ListRemovableTargets()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out []discoveredBlackboxTarget
|
||||
for _, rawTarget := range targets {
|
||||
target := sanitizeRemovableTarget(rawTarget)
|
||||
if target.Device == "" {
|
||||
continue
|
||||
}
|
||||
mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
marker, ok, err := readBlackboxMarker(mountpoint)
|
||||
if mountedByBee && !ok {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
if err != nil || !ok || marker.EnrollmentID == "" {
|
||||
continue
|
||||
}
|
||||
if mountedByBee {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
out = append(out, discoveredBlackboxTarget{
|
||||
marker: marker,
|
||||
target: target,
|
||||
seenMount: mountpoint,
|
||||
mountedByBee: mountedByBee,
|
||||
})
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
|
||||
})
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
|
||||
return &blackboxWorker{
|
||||
runtime: rt,
|
||||
enrollmentID: found.marker.EnrollmentID,
|
||||
target: found.target,
|
||||
marker: found.marker,
|
||||
flushPeriod: blackboxMinFlushPeriod,
|
||||
status: "running",
|
||||
stopCh: make(chan struct{}),
|
||||
stoppedCh: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) run() {
|
||||
defer close(w.stoppedCh)
|
||||
for {
|
||||
start := time.Now()
|
||||
err := w.syncCycle()
|
||||
duration := time.Since(start)
|
||||
w.finishCycle(duration, err)
|
||||
|
||||
wait := w.currentFlushPeriod()
|
||||
timer := time.NewTimer(wait)
|
||||
select {
|
||||
case <-w.stopCh:
|
||||
timer.Stop()
|
||||
w.cleanup()
|
||||
return
|
||||
case <-timer.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.target = found.target
|
||||
w.marker = found.marker
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) stop() {
|
||||
select {
|
||||
case <-w.stopCh:
|
||||
default:
|
||||
close(w.stopCh)
|
||||
}
|
||||
<-w.stoppedCh
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.flushPeriod
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
w.mu.Lock()
|
||||
w.lastDuration = duration
|
||||
if err != nil {
|
||||
w.status = "degraded"
|
||||
w.lastError = err.Error()
|
||||
w.fastCycles = 0
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
|
||||
} else {
|
||||
w.status = "running"
|
||||
w.lastSyncAt = blackboxNow()
|
||||
w.lastError = ""
|
||||
if duration <= w.flushPeriod/2 {
|
||||
w.fastCycles++
|
||||
} else {
|
||||
w.fastCycles = 0
|
||||
}
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
// persistState must be called without w.mu held: it acquires rt.mu then
|
||||
// each worker.mu inside persistStateLocked, so holding w.mu here would
|
||||
// cause a deadlock (w.mu → rt.mu → w.mu).
|
||||
w.runtime.persistState()
|
||||
}
|
||||
|
||||
func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
|
||||
if current <= 0 {
|
||||
current = blackboxMinFlushPeriod
|
||||
}
|
||||
if duration <= 0 {
|
||||
duration = current
|
||||
}
|
||||
next := current
|
||||
if duration > current {
|
||||
growA := time.Duration(float64(current) * 1.25)
|
||||
growB := time.Duration(float64(duration) * 1.25)
|
||||
if growB > growA {
|
||||
next = growB
|
||||
} else {
|
||||
next = growA
|
||||
}
|
||||
}
|
||||
if success && fastCycles >= blackboxRecoveryFastCount {
|
||||
next = time.Duration(float64(current) * 0.9)
|
||||
}
|
||||
if next < blackboxMinFlushPeriod {
|
||||
next = blackboxMinFlushPeriod
|
||||
}
|
||||
if next > blackboxMaxFlushPeriod {
|
||||
next = blackboxMaxFlushPeriod
|
||||
}
|
||||
return next
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) syncCycle() error {
|
||||
target, marker := w.snapshotTarget()
|
||||
mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
w.recordMountpoint(mountpoint, mountedByBee)
|
||||
|
||||
root := filepath.Join(mountpoint, w.runtime.bootFolder)
|
||||
if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := w.captureSnapshots(root); err != nil {
|
||||
return err
|
||||
}
|
||||
return syncFilesystem(root)
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) cleanup() {
|
||||
w.mu.Lock()
|
||||
mountpoint := w.mountpoint
|
||||
mountedByBee := w.mountedByBee
|
||||
w.mu.Unlock()
|
||||
if mountedByBee && mountpoint != "" {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.target, w.marker
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.mountpoint = mountpoint
|
||||
w.mountedByBee = mountedByBee
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) captureSnapshots(root string) error {
|
||||
if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, svc := range supportBundleServices {
|
||||
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) persistState() {
|
||||
rt.mu.Lock()
|
||||
defer rt.mu.Unlock()
|
||||
rt.persistStateLocked()
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) persistStateLocked() {
|
||||
state := BlackboxState{
|
||||
Status: "disabled",
|
||||
BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
|
||||
BootFolder: rt.bootFolder,
|
||||
UpdatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||
Targets: make([]BlackboxTargetStatus, 0, len(rt.workers)),
|
||||
}
|
||||
if len(rt.workers) > 0 {
|
||||
state.Status = "running"
|
||||
}
|
||||
for _, worker := range rt.workers {
|
||||
worker.mu.Lock()
|
||||
targetState := BlackboxTargetStatus{
|
||||
EnrollmentID: worker.enrollmentID,
|
||||
Device: worker.target.Device,
|
||||
FS: worker.target,
|
||||
BootFolder: rt.bootFolder,
|
||||
Status: worker.status,
|
||||
FlushPeriod: worker.flushPeriod.String(),
|
||||
LastError: worker.lastError,
|
||||
Mountpoint: worker.mountpoint,
|
||||
}
|
||||
if !worker.lastSyncAt.IsZero() {
|
||||
targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
|
||||
}
|
||||
if worker.lastDuration > 0 {
|
||||
targetState.LastCycleDuration = worker.lastDuration.String()
|
||||
}
|
||||
if worker.status == "degraded" {
|
||||
state.Status = "degraded"
|
||||
}
|
||||
worker.mu.Unlock()
|
||||
state.Targets = append(state.Targets, targetState)
|
||||
}
|
||||
sort.Slice(state.Targets, func(i, j int) bool {
|
||||
return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
|
||||
})
|
||||
_ = writeJSONAtomic(rt.statePath, state)
|
||||
}
|
||||
|
||||
// bootStartedAtUTC returns the boot time read from the "btime" line of
// /proc/stat, as a UTC time with second precision. It returns the read error
// when /proc/stat cannot be read, and a "boot time not found" error when
// there is no btime line or the first one is malformed.
func bootStartedAtUTC() (time.Time, error) {
	raw, err := os.ReadFile("/proc/stat")
	if err != nil {
		return time.Time{}, err
	}

	const prefix = "btime "
	for _, line := range strings.Split(string(raw), "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, prefix) {
			continue
		}
		// Only the first btime line is considered.
		if fields := strings.Fields(line); len(fields) == 2 {
			// The value is a count of seconds since the epoch; it is parsed
			// as a duration in seconds and converted back to whole seconds.
			if sinceEpoch, parseErr := time.ParseDuration(fields[1] + "s"); parseErr == nil {
				return time.Unix(int64(sinceEpoch/time.Second), 0).UTC(), nil
			}
		}
		break
	}
	return time.Time{}, fmt.Errorf("boot time not found")
}
|
||||
|
||||
// newBlackboxEnrollmentID returns a new enrollment ID of the form "bb-"
// followed by 16 hex digits taken from 8 random bytes. If the random source
// fails, it falls back to "bb-" followed by the current Unix time in
// nanoseconds.
func newBlackboxEnrollmentID() string {
	entropy := make([]byte, 8)
	if _, err := rand.Read(entropy); err != nil {
		return fmt.Sprintf("bb-%d", time.Now().UnixNano())
	}
	return "bb-" + hex.EncodeToString(entropy)
}
|
||||
|
||||
func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
|
||||
target.Device = strings.TrimSpace(target.Device)
|
||||
target.FSType = strings.TrimSpace(target.FSType)
|
||||
target.Size = strings.TrimSpace(target.Size)
|
||||
target.Label = strings.TrimSpace(target.Label)
|
||||
target.Model = strings.TrimSpace(target.Model)
|
||||
target.Mountpoint = strings.TrimSpace(target.Mountpoint)
|
||||
return target
|
||||
}
|
||||
|
||||
func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
|
||||
target = sanitizeRemovableTarget(target)
|
||||
if target.Mountpoint != "" {
|
||||
if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
|
||||
return target.Mountpoint, false, nil
|
||||
}
|
||||
}
|
||||
mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
return "", false, err
|
||||
}
|
||||
if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
return "", false, formatBlackboxMountTargetError(target, string(raw), err)
|
||||
}
|
||||
if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
|
||||
_ = unmountTarget(mountpoint)
|
||||
return "", false, err
|
||||
}
|
||||
return mountpoint, true, nil
|
||||
}
|
||||
|
||||
func unmountTarget(mountpoint string) error {
|
||||
_ = blackboxExecCommand("sync").Run()
|
||||
raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(raw))
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
|
||||
raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return BlackboxMarker{}, false, os.ErrNotExist
|
||||
}
|
||||
return BlackboxMarker{}, false, err
|
||||
}
|
||||
var marker BlackboxMarker
|
||||
if err := json.Unmarshal(raw, &marker); err != nil {
|
||||
return BlackboxMarker{}, false, err
|
||||
}
|
||||
return marker, true, nil
|
||||
}
|
||||
|
||||
func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
|
||||
if marker.Version == 0 {
|
||||
marker.Version = 1
|
||||
}
|
||||
return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
|
||||
}
|
||||
|
||||
func syncDirectoryTree(srcDir, dstDir string) error {
|
||||
seen := make(map[string]struct{})
|
||||
err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel, err := filepath.Rel(srcDir, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel = filepath.Clean(rel)
|
||||
if rel == "." {
|
||||
seen["."] = struct{}{}
|
||||
return os.MkdirAll(dstDir, 0755)
|
||||
}
|
||||
seen[rel] = struct{}{}
|
||||
dstPath := filepath.Join(dstDir, rel)
|
||||
if d.IsDir() {
|
||||
info, err := d.Info()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.MkdirAll(dstPath, info.Mode().Perm())
|
||||
}
|
||||
return copyFileIfChanged(path, dstPath)
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return removeMissingPaths(dstDir, seen)
|
||||
}
|
||||
|
||||
// removeMissingPaths deletes every entry below dstDir whose path relative to
// dstDir is not a key of seen. dstDir itself is never removed.
//
// After an unseen directory has been removed the walk is told to skip it
// (filepath.SkipDir). Returning nil instead would make WalkDir try to read
// the directory that was just deleted and abort the whole walk with a
// "no such file or directory" error. SkipDir is returned for directories
// only, because for a file it would skip the rest of the parent directory.
func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
	return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		rel, err := filepath.Rel(dstDir, path)
		if err != nil {
			return err
		}
		rel = filepath.Clean(rel)
		if rel == "." {
			return nil
		}
		if _, ok := seen[rel]; ok {
			return nil
		}
		if err := os.RemoveAll(path); err != nil {
			return err
		}
		if d.IsDir() {
			return filepath.SkipDir
		}
		return nil
	})
}
|
||||
|
||||
func copyFileIfChanged(src, dst string) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
return os.MkdirAll(dst, info.Mode().Perm())
|
||||
}
|
||||
srcData, err := os.ReadFile(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
|
||||
return nil
|
||||
}
|
||||
return writeFileAtomic(dst, srcData, info.Mode().Perm())
|
||||
}
|
||||
|
||||
func captureCommandAtomic(dst string, name string, args ...string) error {
|
||||
raw, err := blackboxExecCommand(name, args...).CombinedOutput()
|
||||
if len(raw) == 0 {
|
||||
if err != nil {
|
||||
raw = []byte(err.Error() + "\n")
|
||||
} else {
|
||||
raw = []byte("no output\n")
|
||||
}
|
||||
}
|
||||
return writeFileAtomic(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func writeJSONAtomic(path string, v any) error {
|
||||
raw, err := json.MarshalIndent(v, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
raw = append(raw, '\n')
|
||||
return writeFileAtomic(path, raw, 0644)
|
||||
}
|
||||
|
||||
func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
|
||||
return nil
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := f.Write(data); err != nil {
|
||||
_ = f.Close()
|
||||
return err
|
||||
}
|
||||
if err := f.Sync(); err != nil {
|
||||
_ = f.Close()
|
||||
return err
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
return err
|
||||
}
|
||||
return syncFilesystem(filepath.Dir(path))
|
||||
}
|
||||
|
||||
func syncFilesystem(path string) error {
|
||||
return blackboxExecCommand("sync").Run()
|
||||
}
|
||||
|
||||
// ensureWritableBlackboxMountpoint checks that files can be created below
// mountpoint by creating and removing a temporary probe file. A creation
// failure is wrapped as "target filesystem is not writable"; close and
// remove failures are returned unchanged.
func ensureWritableBlackboxMountpoint(mountpoint string) error {
	probe, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
	if err != nil {
		return fmt.Errorf("target filesystem is not writable: %w", err)
	}
	probePath := probe.Name()

	// The probe file is removed whether or not closing it succeeded; a close
	// error takes precedence over a remove error.
	closeErr := probe.Close()
	removeErr := os.Remove(probePath)
	if closeErr != nil {
		return closeErr
	}
	return removeErr
}
|
||||
|
||||
func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
|
||||
msg := strings.TrimSpace(raw)
|
||||
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||
}
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
52
audit/internal/app/blackbox_test.go
Normal file
52
audit/internal/app/blackbox_test.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
|
||||
current := 2 * time.Second
|
||||
got := adjustFlushPeriod(current, 4*time.Second, false, 0)
|
||||
if got <= current {
|
||||
t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
|
||||
current := 10 * time.Second
|
||||
got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
|
||||
if got >= current {
|
||||
t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
|
||||
}
|
||||
if got < blackboxMinFlushPeriod {
|
||||
t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadBlackboxState(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "blackbox-state.json")
|
||||
want := BlackboxState{
|
||||
Status: "running",
|
||||
BootStartedAtUTC: "2026-04-24T00:00:00Z",
|
||||
BootFolder: "boot-folder",
|
||||
UpdatedAtUTC: "2026-04-24T00:00:01Z",
|
||||
Targets: []BlackboxTargetStatus{{
|
||||
EnrollmentID: "bb-1",
|
||||
Device: "/dev/sdb1",
|
||||
Status: "running",
|
||||
FlushPeriod: "1s",
|
||||
}},
|
||||
}
|
||||
if err := writeJSONAtomic(path, want); err != nil {
|
||||
t.Fatalf("writeJSONAtomic: %v", err)
|
||||
}
|
||||
got, err := ReadBlackboxState(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadBlackboxState: %v", err)
|
||||
}
|
||||
if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
|
||||
t.Fatalf("state=%+v", got)
|
||||
}
|
||||
}
|
||||
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
data, err := readFileLimited(path, 10<<20)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -3,10 +3,11 @@ package app
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
@@ -313,17 +314,20 @@ func statusSeverity(status string) int {
|
||||
}
|
||||
|
||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||
return false
|
||||
}
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
class := strings.TrimSpace(*dev.DeviceClass)
|
||||
isGPUClass := strings.Contains(class, "Controller") || strings.Contains(class, "Accelerator") ||
|
||||
strings.Contains(class, "Display") || strings.Contains(class, "Video")
|
||||
if !isGPUClass {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||
switch vendor {
|
||||
case "amd":
|
||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.AMDVendorID
|
||||
case "nvidia":
|
||||
return strings.Contains(manufacturer, "nvidia")
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||
default:
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
@@ -46,10 +47,12 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
|
||||
class := "DisplayController"
|
||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||
amdVendorID := collector.AMDVendorID
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
VendorID: &amdVendorID,
|
||||
}},
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bee/audit/internal/platform"
|
||||
"compress/gzip"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -14,6 +15,7 @@ import (
|
||||
)
|
||||
|
||||
var supportBundleServices = []string{
|
||||
"bee-blackbox.service",
|
||||
"bee-audit.service",
|
||||
"bee-web.service",
|
||||
"bee-network.service",
|
||||
@@ -22,6 +24,10 @@ var supportBundleServices = []string{
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
"display-manager.service",
|
||||
"lightdm.service",
|
||||
"nvidia-dcgm.service",
|
||||
"nvidia-fabricmanager.service",
|
||||
}
|
||||
|
||||
var supportBundleCommands = []struct {
|
||||
@@ -40,14 +46,167 @@ var supportBundleCommands = []struct {
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/dmesg-gui-video-input.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'nvidia|drm|fb|framebuffer|vesa|efi|lightdm|Xorg|input|hid|usb|keyboard|mouse|virtual keyboard|virtual mouse|ami|aspeed|ast' || echo "no GUI/video/input kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/loginctl-sessions.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v loginctl >/dev/null 2>&1; then
|
||||
loginctl list-sessions 2>&1 || true
|
||||
else
|
||||
echo "loginctl not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/loginctl-seats.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v loginctl >/dev/null 2>&1; then
|
||||
loginctl list-seats 2>&1 || true
|
||||
echo
|
||||
for seat in $(loginctl list-seats --no-legend 2>/dev/null | awk '{print $1}'); do
|
||||
echo "=== $seat ==="
|
||||
loginctl seat-status "$seat" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
else
|
||||
echo "loginctl not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ps-gui.txt", cmd: []string{"sh", "-c", `
|
||||
ps -ef | grep -iE 'lightdm|Xorg|X$|openbox|chromium|chrome|xinit|xsession' | grep -v grep || echo "no GUI processes found"
|
||||
`}},
|
||||
{name: "system/lspci-video-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for dev in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== $dev ==="
|
||||
lspci -s "$dev" -vv 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no display-class PCI devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/proc-fb.txt", cmd: []string{"cat", "/proc/fb"}},
|
||||
{name: "system/drm-cards.txt", cmd: []string{"sh", "-c", `
|
||||
if [ -d /sys/class/drm ]; then
|
||||
for path in /sys/class/drm/card*; do
|
||||
[ -e "$path" ] || continue
|
||||
card=$(basename "$path")
|
||||
echo "=== $card ==="
|
||||
for f in status enabled dpms modes; do
|
||||
[ -r "$path/$f" ] && printf " %-8s %s\n" "$f" "$(cat "$path/$f" 2>/dev/null)"
|
||||
done
|
||||
device=$(readlink -f "$path/device" 2>/dev/null || true)
|
||||
[ -n "$device" ] && echo " device ${device##*/}"
|
||||
echo
|
||||
done
|
||||
else
|
||||
echo "/sys/class/drm not present"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/input-devices.txt", cmd: []string{"sh", "-c", `
|
||||
if [ -r /proc/bus/input/devices ]; then
|
||||
cat /proc/bus/input/devices
|
||||
else
|
||||
echo "/proc/bus/input/devices not readable"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/udevadm-input.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v udevadm >/dev/null 2>&1; then
|
||||
echo "udevadm not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for dev in /dev/input/event*; do
|
||||
[ -e "$dev" ] || continue
|
||||
found=1
|
||||
echo "=== $dev ==="
|
||||
udevadm info --query=all --name="$dev" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no /dev/input/event* devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/xinput-list.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v xinput >/dev/null 2>&1; then
|
||||
DISPLAY=:0 xinput --list 2>&1 || true
|
||||
else
|
||||
echo "xinput not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/libinput-list-devices.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v libinput >/dev/null 2>&1; then
|
||||
libinput list-devices 2>&1 || true
|
||||
else
|
||||
echo "libinput not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/systemctl-gui-units.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v systemctl >/dev/null 2>&1; then
|
||||
echo "systemctl not found"
|
||||
exit 0
|
||||
fi
|
||||
echo "=== unit files ==="
|
||||
systemctl list-unit-files --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
|
||||
echo
|
||||
echo "=== active units ==="
|
||||
systemctl list-units --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
|
||||
echo
|
||||
echo "=== failed units ==="
|
||||
systemctl --failed --no-pager 2>&1 | grep -iE 'lightdm|display-manager|Xorg' || echo "no failed GUI units"
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
nvidia-smi topo -m 2>&1 || true
|
||||
else
|
||||
echo "nvidia-smi not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v systemctl >/dev/null 2>&1; then
|
||||
echo "systemctl not found"
|
||||
exit 0
|
||||
fi
|
||||
echo "=== unit files ==="
|
||||
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== active units ==="
|
||||
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== failed units ==="
|
||||
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||
`}},
|
||||
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||
for candidate in \
|
||||
/usr/bin/nvidia-fabricmanager \
|
||||
/usr/bin/nv-fabricmanager \
|
||||
/usr/bin/nvidia-fabricmanagerd \
|
||||
/usr/bin/nvlsm; do
|
||||
if [ -e "$candidate" ]; then
|
||||
echo "=== $candidate ==="
|
||||
ls -l "$candidate" 2>&1 || true
|
||||
echo
|
||||
fi
|
||||
done
|
||||
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||
echo "no fabric manager binaries found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
@@ -195,6 +354,17 @@ var supportBundleOptionalFiles = []struct {
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
{name: "system/Xorg.0.log", src: "/var/log/Xorg.0.log"},
|
||||
{name: "system/Xorg.0.log.old", src: "/var/log/Xorg.0.log.old"},
|
||||
{name: "system/lightdm/lightdm.log", src: "/var/log/lightdm/lightdm.log"},
|
||||
{name: "system/lightdm/x-0.log", src: "/var/log/lightdm/x-0.log"},
|
||||
{name: "system/lightdm/x-0-greeter.log", src: "/var/log/lightdm/x-0-greeter.log"},
|
||||
{name: "system/home-bee-xsession-errors.log", src: "/home/bee/.xsession-errors"},
|
||||
{name: "system/home-bee-chromium-debug.log", src: "/tmp/bee-chrome/chrome_debug.log"},
|
||||
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||
@@ -212,11 +382,6 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
@@ -250,7 +415,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||
archiveName := SupportBundleBaseName(now) + ".tar.gz"
|
||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
@@ -258,6 +423,16 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
func SupportBundleBaseName(at time.Time) string {
|
||||
at = at.UTC()
|
||||
date := at.Format("2006-01-02")
|
||||
tod := at.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
|
||||
}
|
||||
|
||||
func LatestSupportBundlePath() (string, error) {
|
||||
return latestSupportBundlePath(os.TempDir())
|
||||
}
|
||||
@@ -381,6 +556,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
|
||||
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
|
||||
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
|
||||
if strings.TrimSpace(cfg.Reason) != "" {
|
||||
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&body, "\nfiles:\n")
|
||||
|
||||
var files []string
|
||||
|
||||
@@ -84,11 +84,10 @@ func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
}
|
||||
|
||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
return dev.VendorID != nil && *dev.VendorID == AMDVendorID && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
}
|
||||
|
||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||
|
||||
@@ -3,6 +3,7 @@ package collector
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
var execIpmitool = func(args ...string) (string, error) {
|
||||
out, err := exec.Command("ipmitool", args...).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
||||
|
||||
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
||||
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
||||
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
|
||||
func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
|
||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
||||
return nil
|
||||
}
|
||||
out, err := execIpmitool("mc", "info")
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
|
||||
raw, err := cmd.Output()
|
||||
if err != nil {
|
||||
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
version := parseBMCFirmwareRevision(out)
|
||||
version := parseBMCFirmwareRevision(string(raw))
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
@@ -177,15 +174,19 @@ func cleanDMIValue(v string) string {
|
||||
upper := strings.ToUpper(v)
|
||||
placeholders := []string{
|
||||
"TO BE FILLED BY O.E.M.",
|
||||
"TO BE FILLED BY O.E.M",
|
||||
"NOT SPECIFIED",
|
||||
"NOT SETTABLE",
|
||||
"NOT PRESENT",
|
||||
"NOT AVAILABLE",
|
||||
"UNKNOWN",
|
||||
"N/A",
|
||||
"NONE",
|
||||
"NULL",
|
||||
"DEFAULT STRING",
|
||||
"0",
|
||||
"0123456789",
|
||||
"1234567890",
|
||||
}
|
||||
for _, p := range placeholders {
|
||||
if upper == p {
|
||||
|
||||
@@ -84,6 +84,10 @@ func TestCleanDMIValue(t *testing.T) {
|
||||
{" Inspur ", "Inspur"},
|
||||
{"", ""},
|
||||
{"0", ""},
|
||||
{"0123456789", ""},
|
||||
{"1234567890", ""},
|
||||
{"Not Available", ""},
|
||||
{"To Be Filled By O.E.M", ""},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := cleanDMIValue(tt.input)
|
||||
@@ -109,6 +113,80 @@ func TestParseDMIFields(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBoard_Dell(t *testing.T) {
|
||||
type1 := mustReadFile(t, "testdata/dmidecode_type1_dell.txt")
|
||||
type2 := mustReadFile(t, "testdata/dmidecode_type2_dell.txt")
|
||||
|
||||
board := parseBoard(type1, type2)
|
||||
|
||||
if board.SerialNumber != "7SG9F63" {
|
||||
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "7SG9F63")
|
||||
}
|
||||
if board.Manufacturer == nil || *board.Manufacturer != "Dell Inc." {
|
||||
t.Errorf("manufacturer: got %v, want Dell Inc.", board.Manufacturer)
|
||||
}
|
||||
if board.ProductName == nil || *board.ProductName != "PowerEdge R740xd" {
|
||||
t.Errorf("product_name: got %v, want PowerEdge R740xd", board.ProductName)
|
||||
}
|
||||
// part number comes from type2 Product Name
|
||||
if board.PartNumber == nil || *board.PartNumber != "0F9N89" {
|
||||
t.Errorf("part_number: got %v, want 0F9N89", board.PartNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBoard_HPE(t *testing.T) {
|
||||
type1 := mustReadFile(t, "testdata/dmidecode_type1_hpe.txt")
|
||||
type2 := mustReadFile(t, "testdata/dmidecode_type2_hpe.txt")
|
||||
|
||||
board := parseBoard(type1, type2)
|
||||
|
||||
if board.SerialNumber != "CZJ9320CXN" {
|
||||
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "CZJ9320CXN")
|
||||
}
|
||||
if board.Manufacturer == nil || *board.Manufacturer != "HPE" {
|
||||
t.Errorf("manufacturer: got %v, want HPE", board.Manufacturer)
|
||||
}
|
||||
if board.ProductName == nil || *board.ProductName != "ProLiant DL380 Gen10" {
|
||||
t.Errorf("product_name: got %v, want ProLiant DL380 Gen10", board.ProductName)
|
||||
}
|
||||
if board.PartNumber == nil || *board.PartNumber != "ProLiant DL380 Gen10" {
|
||||
t.Errorf("part_number: got %v, want ProLiant DL380 Gen10", board.PartNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBoard_Supermicro_Placeholders(t *testing.T) {
|
||||
type1 := mustReadFile(t, "testdata/dmidecode_type1_supermicro.txt")
|
||||
type2 := mustReadFile(t, "testdata/dmidecode_type2_supermicro.txt")
|
||||
|
||||
board := parseBoard(type1, type2)
|
||||
|
||||
if board.SerialNumber != "S214726X2A36789" {
|
||||
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "S214726X2A36789")
|
||||
}
|
||||
if board.Manufacturer == nil || *board.Manufacturer != "Supermicro" {
|
||||
t.Errorf("manufacturer: got %v, want Supermicro", board.Manufacturer)
|
||||
}
|
||||
if board.ProductName == nil || *board.ProductName != "SYS-6028R-WTR" {
|
||||
t.Errorf("product_name: got %v, want SYS-6028R-WTR", board.ProductName)
|
||||
}
|
||||
// "X10DRW-i" is the real part number from type 2
|
||||
if board.PartNumber == nil || *board.PartNumber != "X10DRW-i" {
|
||||
t.Errorf("part_number: got %v, want X10DRW-i", board.PartNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBIOSFirmware_Dell(t *testing.T) {
|
||||
type0 := mustReadFile(t, "testdata/dmidecode_type0_dell.txt")
|
||||
fw := parseBIOSFirmware(type0)
|
||||
|
||||
if len(fw) != 1 {
|
||||
t.Fatalf("expected 1 firmware record, got %d", len(fw))
|
||||
}
|
||||
if fw[0].Version != "2.5.4" {
|
||||
t.Errorf("version: got %q, want 2.5.4", fw[0].Version)
|
||||
}
|
||||
}
|
||||
|
||||
func mustReadFile(t *testing.T, path string) string {
|
||||
t.Helper()
|
||||
b, err := os.ReadFile(path)
|
||||
|
||||
@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
board, biosFW := collectBoard()
|
||||
snap.Board = board
|
||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
|
||||
|
||||
snap.CPUs = collectCPUs()
|
||||
|
||||
@@ -34,19 +34,23 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
}
|
||||
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
bestEffortRescanHotplugStorage()
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||
snap.PowerSupplies = collectPSUs()
|
||||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
snap.Sensors = mergeIPMISensors(buildSensorsFromDoc(sensorDoc), collectIPMISensors())
|
||||
snap.EventLogs = append(collectIPMISEL(), collectDmesgErrors()...)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
// remaining collectors added in steps 1.8 – 1.10
|
||||
|
||||
129
audit/internal/collector/dmesg_events.go
Normal file
129
audit/internal/collector/dmesg_events.go
Normal file
@@ -0,0 +1,129 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// dmesgTimestampRE splits a dmesg line into its bracketed timestamp (group 1)
// and the message that follows it (group 2). Both timestamp flavours match:
//
//	dmesg -T output:  [Thu Jun 18 14:23:45 2026] message
//	dmesg without -T: [  123.456789] message
var dmesgTimestampRE = regexp.MustCompile(`^\[([^\]]+)\]\s*(.*)$`)

// dmesgErrorPatterns lists the keywords that mark a kernel message as an
// error or hardware problem worth capturing. All patterns are
// case-insensitive and anchored on word boundaries. Common inflected forms
// (plurals, past tense, "timed out", "hung") are included so that lines such
// as "detected stalls on CPUs" or "command timed out" are not dropped.
var dmesgErrorPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?i)\berr(ors?)?\b`),            // err, error, errors
	regexp.MustCompile(`(?i)\bfail(s|ed|ing|ures?)?\b`), // fail, fails, failed, failing, failure(s)
	regexp.MustCompile(`(?i)\bfaults?\b`),
	regexp.MustCompile(`(?i)\bwarn(ings?)?\b`),
	regexp.MustCompile(`(?i)\bAER\b`),
	regexp.MustCompile(`(?i)\bXid\b`),
	regexp.MustCompile(`(?i)\bNVRM\b`),
	regexp.MustCompile(`(?i)\bpanic\b`),
	regexp.MustCompile(`(?i)\bcorrected\b`),
	regexp.MustCompile(`(?i)\buncorrect`), // prefix match: uncorrected, uncorrectable
	regexp.MustCompile(`(?i)\bECC\b`),
	regexp.MustCompile(`(?i)\b(timeouts?|timed[ -]out)\b`),
	regexp.MustCompile(`(?i)\breset\b`),
	regexp.MustCompile(`(?i)\bdead\b`),
	regexp.MustCompile(`(?i)\b(hangs?|hung)\b`),
	regexp.MustCompile(`(?i)\bstall(s|ed)?\b`),
	regexp.MustCompile(`(?i)\bdisabled\b`),
}
|
||||
|
||||
// collectDmesgErrors runs `dmesg -T` (or `dmesg` without -T on failure) and
|
||||
// returns only lines that match known error/warning patterns.
|
||||
func collectDmesgErrors() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("dmesg", "-T").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
// Fallback: dmesg without human-readable timestamps
|
||||
out, err = exec.Command("dmesg").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
entries := parseDmesgErrors(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("dmesg: collected error entries", "count", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
func parseDmesgErrors(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
collectedAt := time.Now().UTC().Format(time.RFC3339)
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var timestamp, message string
|
||||
if m := dmesgTimestampRE.FindStringSubmatch(line); m != nil {
|
||||
timestamp = strings.TrimSpace(m[1])
|
||||
message = strings.TrimSpace(m[2])
|
||||
} else {
|
||||
message = line
|
||||
}
|
||||
|
||||
if message == "" {
|
||||
continue
|
||||
}
|
||||
if !matchesAny(message, dmesgErrorPatterns) {
|
||||
continue
|
||||
}
|
||||
|
||||
severity := dmesgSeverity(message)
|
||||
source := "dmesg"
|
||||
|
||||
var eventTime *string
|
||||
if timestamp != "" {
|
||||
t := timestamp
|
||||
eventTime = &t
|
||||
} else {
|
||||
eventTime = &collectedAt
|
||||
}
|
||||
|
||||
entries = append(entries, schema.HardwareEventLog{
|
||||
Source: source,
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
Message: message,
|
||||
})
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
// matchesAny reports whether s is matched by at least one of patterns.
// It returns false for an empty or nil pattern list.
func matchesAny(s string, patterns []*regexp.Regexp) bool {
	for i := range patterns {
		if patterns[i].MatchString(s) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func dmesgSeverity(msg string) string {
|
||||
lower := strings.ToLower(msg)
|
||||
switch {
|
||||
case strings.Contains(lower, "panic") ||
|
||||
strings.Contains(lower, "aer") ||
|
||||
strings.Contains(lower, "uncorrect") ||
|
||||
strings.Contains(lower, "xid") ||
|
||||
strings.Contains(lower, "nvrm"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "error") ||
|
||||
strings.Contains(lower, "fault") ||
|
||||
strings.Contains(lower, "fail") ||
|
||||
strings.Contains(lower, "dead") ||
|
||||
strings.Contains(lower, "hang"):
|
||||
return statusCritical
|
||||
default:
|
||||
return statusWarning
|
||||
}
|
||||
}
|
||||
92
audit/internal/collector/ipmi_profile.go
Normal file
92
audit/internal/collector/ipmi_profile.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package collector
|
||||
|
||||
// Package-level IPMI tuning profiles.
|
||||
//
|
||||
// Each profile is matched by board manufacturer (already known before PSU
|
||||
// collection runs). The profile drives two things:
|
||||
// - Per-command timeouts — prevents infinite hangs on slow BMCs.
|
||||
// - FRU early-exit — streaming parser stops reading once all PSU entries
|
||||
// are found, avoiding the tail of non-PSU FRU records.
|
||||
//
|
||||
// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ipmiProfile bundles the IPMI tuning parameters that apply to one or more
// board manufacturers.
type ipmiProfile struct {
	// name identifies the profile in log messages.
	name string

	// manufacturers holds lowercase substrings that are matched against the
	// board manufacturer string from dmidecode type 1.
	manufacturers []string

	// fruTimeout is the hard deadline for the whole `ipmitool fru print`
	// command. Zero means no timeout (not recommended).
	fruTimeout time.Duration

	// sdrTimeout is the hard deadline for `ipmitool sdr`.
	sdrTimeout time.Duration

	// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
	mcInfoTimeout time.Duration

	// fruEarlyExit tells the streaming FRU parser to stop reading once it has
	// found at least one PSU entry and the current block is complete. Useful
	// on servers with many non-PSU FRU devices.
	fruEarlyExit bool
}
|
||||
|
||||
// ipmiProfiles is the ordered list of profiles. First match wins.
|
||||
var ipmiProfiles = []ipmiProfile{
|
||||
{
|
||||
// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
|
||||
// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
|
||||
// full `fru print` scan takes ~108 s on a loaded BMC. Enable early
|
||||
// exit so collection stops once PSU records are found.
|
||||
name: "lenovo",
|
||||
manufacturers: []string{"lenovo"},
|
||||
fruTimeout: 90 * time.Second,
|
||||
sdrTimeout: 45 * time.Second,
|
||||
mcInfoTimeout: 15 * time.Second,
|
||||
fruEarlyExit: true,
|
||||
},
|
||||
{
|
||||
// HPE iLO-based servers (ProLiant DL/ML/BL).
|
||||
name: "hpe",
|
||||
manufacturers: []string{"hp", "hewlett packard"},
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
},
|
||||
{
|
||||
// Dell iDRAC-based servers.
|
||||
name: "dell",
|
||||
manufacturers: []string{"dell"},
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
},
|
||||
}
|
||||
|
||||
// defaultIPMIProfile is used when no vendor profile matches.
|
||||
var defaultIPMIProfile = ipmiProfile{
|
||||
name: "default",
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
}
|
||||
|
||||
// selectIPMIProfile returns the profile for the given board manufacturer.
|
||||
func selectIPMIProfile(manufacturer string) ipmiProfile {
|
||||
mfgLower := strings.ToLower(strings.TrimSpace(manufacturer))
|
||||
for _, p := range ipmiProfiles {
|
||||
for _, m := range p.manufacturers {
|
||||
if strings.Contains(mfgLower, m) {
|
||||
return p
|
||||
}
|
||||
}
|
||||
}
|
||||
return defaultIPMIProfile
|
||||
}
|
||||
90
audit/internal/collector/ipmi_sel.go
Normal file
90
audit/internal/collector/ipmi_sel.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectIPMISEL runs `ipmitool sel list` and returns parsed event log entries.
|
||||
// Returns nil if ipmitool is unavailable or the SEL is empty.
|
||||
func collectIPMISEL() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("ipmitool", "sel", "list").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
entries := parseIPMISELOutput(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sel: collected", "entries", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
// parseIPMISELOutput parses `ipmitool sel list` output.
|
||||
// Line format: ID | date | time | sensor | event description | direction
|
||||
// Example: 1 | 06/18/2026 | 14:23:45 | Temperature #0x30 | Upper Critical going high | Asserted
|
||||
func parseIPMISELOutput(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, "|", 6)
|
||||
if len(parts) < 5 {
|
||||
continue
|
||||
}
|
||||
id := strings.TrimSpace(parts[0])
|
||||
date := strings.TrimSpace(parts[1])
|
||||
timeStr := strings.TrimSpace(parts[2])
|
||||
sensor := strings.TrimSpace(parts[3])
|
||||
event := strings.TrimSpace(parts[4])
|
||||
direction := ""
|
||||
if len(parts) == 6 {
|
||||
direction = strings.TrimSpace(parts[5])
|
||||
}
|
||||
|
||||
var eventTime *string
|
||||
if date != "" && timeStr != "" {
|
||||
t := fmt.Sprintf("%s %s", date, timeStr)
|
||||
eventTime = &t
|
||||
}
|
||||
|
||||
message := event
|
||||
if direction != "" && strings.EqualFold(direction, "Deasserted") {
|
||||
message = event + " (Deasserted)"
|
||||
}
|
||||
|
||||
severity := ipmiSELSeverity(event)
|
||||
isActive := !strings.EqualFold(direction, "Deasserted")
|
||||
|
||||
entry := schema.HardwareEventLog{
|
||||
Source: "ipmi-sel",
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
MessageID: &id,
|
||||
Message: message,
|
||||
IsActive: &isActive,
|
||||
}
|
||||
if sensor != "" {
|
||||
entry.ComponentRef = &sensor
|
||||
}
|
||||
entries = append(entries, entry)
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func ipmiSELSeverity(event string) string {
|
||||
lower := strings.ToLower(event)
|
||||
switch {
|
||||
case strings.Contains(lower, "critical") || strings.Contains(lower, "non-recoverable"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "non-critical") || strings.Contains(lower, "warning") || strings.Contains(lower, "degraded"):
|
||||
return statusWarning
|
||||
default:
|
||||
return "info"
|
||||
}
|
||||
}
|
||||
216
audit/internal/collector/ipmi_sensors.go
Normal file
216
audit/internal/collector/ipmi_sensors.go
Normal file
@@ -0,0 +1,216 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectIPMISensors runs `ipmitool sensor` and returns parsed sensor readings.
|
||||
// Returns nil if ipmitool is unavailable or produces no output.
|
||||
func collectIPMISensors() *schema.HardwareSensors {
|
||||
out, err := exec.Command("ipmitool", "sensor").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
result := parseIPMISensorOutput(string(out))
|
||||
if result == nil {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sensors: collected",
|
||||
"fans", len(result.Fans),
|
||||
"temperatures", len(result.Temperatures),
|
||||
"power", len(result.Power),
|
||||
"other", len(result.Other),
|
||||
)
|
||||
return result
|
||||
}
|
||||
|
||||
// parseIPMISensorOutput parses `ipmitool sensor` text output.
|
||||
// Each line: name | value | unit | status | lnr | lcr | lnc | unc | ucr | unr
|
||||
func parseIPMISensorOutput(output string) *schema.HardwareSensors {
|
||||
result := &schema.HardwareSensors{}
|
||||
seen := map[string]struct{}{}
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 4 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
rawVal := strings.TrimSpace(parts[1])
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
status := strings.TrimSpace(parts[3])
|
||||
|
||||
if name == "" || rawVal == "na" || rawVal == "N/A" || rawVal == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
value, err := strconv.ParseFloat(rawVal, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
statusStr := normalizeIPMISensorStatus(status)
|
||||
|
||||
switch {
|
||||
case strings.EqualFold(unit, "RPM"):
|
||||
if duplicateSensor(seen, "fan", name) {
|
||||
continue
|
||||
}
|
||||
rpm := int(value)
|
||||
item := schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Fans = append(result.Fans, item)
|
||||
|
||||
case strings.EqualFold(unit, "degrees C") || strings.EqualFold(unit, "C"):
|
||||
if duplicateSensor(seen, "temp", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareTemperatureSensor{Name: name, Celsius: &value}
|
||||
if len(parts) >= 9 {
|
||||
if unc := parseIPMIThreshold(parts[7]); unc != nil {
|
||||
item.ThresholdWarningCelsius = unc
|
||||
}
|
||||
if ucr := parseIPMIThreshold(parts[8]); ucr != nil {
|
||||
item.ThresholdCriticalCelsius = ucr
|
||||
}
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
} else {
|
||||
item.Status = deriveTemperatureStatus(item.Celsius, item.ThresholdWarningCelsius, item.ThresholdCriticalCelsius)
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, item)
|
||||
|
||||
case strings.EqualFold(unit, "Volts") || strings.EqualFold(unit, "V"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, VoltageV: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Watts") || strings.EqualFold(unit, "W"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, PowerW: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Amps") || strings.EqualFold(unit, "A"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, CurrentA: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
default:
|
||||
if duplicateSensor(seen, "other", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Other = append(result.Other, item)
|
||||
}
|
||||
}
|
||||
|
||||
if len(result.Fans) == 0 && len(result.Temperatures) == 0 && len(result.Power) == 0 && len(result.Other) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// parseIPMIThreshold parses one threshold column of `ipmitool sensor`
// output. It returns nil for blank values, "na", "N/A" and anything that is
// not a valid float; otherwise it returns a pointer to the parsed number.
func parseIPMIThreshold(raw string) *float64 {
	switch trimmed := strings.TrimSpace(raw); trimmed {
	case "", "na", "N/A":
		return nil
	default:
		if parsed, err := strconv.ParseFloat(trimmed, 64); err == nil {
			return &parsed
		}
		return nil
	}
}
|
||||
|
||||
func normalizeIPMISensorStatus(s string) string {
|
||||
switch strings.ToLower(s) {
|
||||
case "ok":
|
||||
return statusOK
|
||||
case "cr", "ucr", "lcr":
|
||||
return statusCritical
|
||||
case "nc", "unc", "lnc", "nr", "unr", "lnr":
|
||||
return statusWarning
|
||||
case "ns", "na":
|
||||
return ""
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// mergeIPMISensors appends IPMI sensor entries into existing, skipping names already present.
|
||||
func mergeIPMISensors(existing, ipmi *schema.HardwareSensors) *schema.HardwareSensors {
|
||||
if ipmi == nil {
|
||||
return existing
|
||||
}
|
||||
if existing == nil {
|
||||
return ipmi
|
||||
}
|
||||
|
||||
existingNames := map[string]struct{}{}
|
||||
for _, s := range existing.Fans {
|
||||
existingNames["fan\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Temperatures {
|
||||
existingNames["temp\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Power {
|
||||
existingNames["power\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Other {
|
||||
existingNames["other\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
|
||||
for _, s := range ipmi.Fans {
|
||||
if _, ok := existingNames["fan\x00"+s.Name]; !ok {
|
||||
existing.Fans = append(existing.Fans, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Temperatures {
|
||||
if _, ok := existingNames["temp\x00"+s.Name]; !ok {
|
||||
existing.Temperatures = append(existing.Temperatures, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Power {
|
||||
if _, ok := existingNames["power\x00"+s.Name]; !ok {
|
||||
existing.Power = append(existing.Power, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Other {
|
||||
if _, ok := existingNames["other\x00"+s.Name]; !ok {
|
||||
existing.Other = append(existing.Other, s)
|
||||
}
|
||||
}
|
||||
return existing
|
||||
}
|
||||
87
audit/internal/collector/memory_test.go
Normal file
87
audit/internal/collector/memory_test.go
Normal file
@@ -0,0 +1,87 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseMemory_Mixed(t *testing.T) {
|
||||
out := mustReadFile(t, "testdata/dmidecode_type17_mixed.txt")
|
||||
dimms := parseMemory(out)
|
||||
|
||||
if len(dimms) != 3 {
|
||||
t.Fatalf("expected 3 DIMMs, got %d", len(dimms))
|
||||
}
|
||||
|
||||
// slot 0: populated, 16 GB Supermicro-style
|
||||
d0 := dimms[0]
|
||||
if d0.Present == nil || !*d0.Present {
|
||||
t.Errorf("dimm0: expected present=true")
|
||||
}
|
||||
if d0.SizeMB == nil || *d0.SizeMB != 16384 {
|
||||
t.Errorf("dimm0: size_mb=%v, want 16384", d0.SizeMB)
|
||||
}
|
||||
if d0.Slot == nil || *d0.Slot != "P1-DIMMA1" {
|
||||
t.Errorf("dimm0: slot=%v, want P1-DIMMA1", d0.Slot)
|
||||
}
|
||||
if d0.Location == nil || *d0.Location != "P0_Node0_Channel0_Dimm0" {
|
||||
t.Errorf("dimm0: location=%v, want P0_Node0_Channel0_Dimm0", d0.Location)
|
||||
}
|
||||
if d0.Manufacturer == nil || *d0.Manufacturer != "Micron" {
|
||||
t.Errorf("dimm0: manufacturer=%v, want Micron", d0.Manufacturer)
|
||||
}
|
||||
if d0.PartNumber == nil || *d0.PartNumber != "36ASF2G72PZ-2G1A2" {
|
||||
t.Errorf("dimm0: part_number=%v, want 36ASF2G72PZ-2G1A2", d0.PartNumber)
|
||||
}
|
||||
if d0.MaxSpeedMHz == nil || *d0.MaxSpeedMHz != 2133 {
|
||||
t.Errorf("dimm0: max_speed_mhz=%v, want 2133", d0.MaxSpeedMHz)
|
||||
}
|
||||
|
||||
// slot 1: empty
|
||||
d1 := dimms[1]
|
||||
if d1.Present == nil || *d1.Present {
|
||||
t.Errorf("dimm1: expected present=false")
|
||||
}
|
||||
if d1.Status == nil || *d1.Status != statusEmpty {
|
||||
t.Errorf("dimm1: status=%v, want %s", d1.Status, statusEmpty)
|
||||
}
|
||||
if d1.SizeMB != nil {
|
||||
t.Errorf("dimm1: size_mb should be nil for empty slot, got %v", d1.SizeMB)
|
||||
}
|
||||
|
||||
// slot 2: populated, 32768 MB Dell-style size
|
||||
d2 := dimms[2]
|
||||
if d2.Present == nil || !*d2.Present {
|
||||
t.Errorf("dimm2: expected present=true")
|
||||
}
|
||||
if d2.SizeMB == nil || *d2.SizeMB != 32768 {
|
||||
t.Errorf("dimm2: size_mb=%v, want 32768", d2.SizeMB)
|
||||
}
|
||||
if d2.Manufacturer == nil || *d2.Manufacturer != "Samsung" {
|
||||
t.Errorf("dimm2: manufacturer=%v, want Samsung", d2.Manufacturer)
|
||||
}
|
||||
if d2.CurrentSpeedMHz == nil || *d2.CurrentSpeedMHz != 2400 {
|
||||
t.Errorf("dimm2: current_speed_mhz=%v, want 2400", d2.CurrentSpeedMHz)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMemorySizeMB(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want int
|
||||
}{
|
||||
{"16 GB", 16384},
|
||||
{"32 GB", 32768},
|
||||
{"8 GB", 8192},
|
||||
{"16384 MB", 16384},
|
||||
{"32768 MB", 32768},
|
||||
{"No Module Installed", 0},
|
||||
{"0", 0},
|
||||
{"", 0},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := parseMemorySizeMB(tt.input)
|
||||
if got != tt.want {
|
||||
t.Errorf("parseMemorySizeMB(%q) = %d, want %d", tt.input, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
const mellanoxVendorID = 0x15b3
|
||||
const nicProbeTimeout = 2 * time.Second
|
||||
|
||||
var (
|
||||
@@ -80,16 +79,7 @@ func enrichPCIeWithMellanox(devs []schema.HardwarePCIeDevice) []schema.HardwareP
|
||||
}
|
||||
|
||||
func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == mellanoxVendorID {
|
||||
return true
|
||||
}
|
||||
if dev.Manufacturer != nil {
|
||||
m := strings.ToLower(*dev.Manufacturer)
|
||||
if strings.Contains(m, "mellanox") || strings.Contains(m, "nvidia networking") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
return dev.VendorID != nil && *dev.VendorID == MellanoxVendorID
|
||||
}
|
||||
|
||||
func queryMellanoxFromMstflint(bdf string) (firmware, serial string) {
|
||||
|
||||
@@ -55,7 +55,7 @@ func TestEnrichPCIeWithMellanox_mstflint(t *testing.T) {
|
||||
}
|
||||
netIfacesByBDF = func(string) []string { return nil }
|
||||
|
||||
vendorID := mellanoxVendorID
|
||||
vendorID := MellanoxVendorID
|
||||
bdf := "0000:18:00.0"
|
||||
manufacturer := "Mellanox Technologies"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
@@ -99,7 +99,7 @@ func TestEnrichPCIeWithMellanox_fallbackEthtool(t *testing.T) {
|
||||
return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil
|
||||
}
|
||||
|
||||
vendorID := mellanoxVendorID
|
||||
vendorID := MellanoxVendorID
|
||||
bdf := "0000:18:00.0"
|
||||
manufacturer := "NVIDIA Networking"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
|
||||
@@ -10,8 +10,6 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
Index int
|
||||
BDF string
|
||||
@@ -240,13 +238,7 @@ func normalizePCIeBDF(bdf string) string {
|
||||
}
|
||||
|
||||
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
|
||||
return true
|
||||
}
|
||||
if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
return dev.VendorID != nil && *dev.VendorID == NvidiaVendorID
|
||||
}
|
||||
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||
|
||||
@@ -57,7 +57,7 @@ func TestNormalizePCIeBDF(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
vendorID := nvidiaVendorID
|
||||
vendorID := NvidiaVendorID
|
||||
bdf := "0000:65:00.0"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
status := "OK"
|
||||
@@ -104,7 +104,7 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
||||
vendorID := nvidiaVendorID
|
||||
vendorID := NvidiaVendorID
|
||||
bdf := "0000:17:00.0"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
devices := []schema.HardwarePCIeDevice{
|
||||
|
||||
11
audit/internal/collector/pci_vendors.go
Normal file
11
audit/internal/collector/pci_vendors.go
Normal file
@@ -0,0 +1,11 @@
|
||||
package collector
|
||||
|
||||
// PCI vendor IDs used to classify hardware by the vendor field of a PCI
// function, listed in ascending numeric order.
// Source: https://pcisig.com / https://pci-ids.ucw.cz/
const (
	AMDVendorID      = 0x1002 // AMD
	NvidiaVendorID   = 0x10de // NVIDIA
	MellanoxVendorID = 0x15b3 // Mellanox
	AspeedVendorID   = 0x1a03 // ASPEED
	IntelVendorID    = 0x8086 // Intel
)
|
||||
@@ -4,7 +4,9 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@@ -124,35 +126,39 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
dev.Status = &status
|
||||
|
||||
// Slot is the BDF: "0000:00:02.0"
|
||||
if bdf := fields["Slot"]; bdf != "" {
|
||||
dev.Slot = &bdf
|
||||
dev.BDF = &bdf
|
||||
bdfStr := fields["Slot"]
|
||||
if bdfStr != "" {
|
||||
dev.Slot = &bdfStr
|
||||
dev.BDF = &bdfStr
|
||||
// parse vendor_id and device_id from sysfs
|
||||
vendorID, deviceID := readPCIIDs(bdf)
|
||||
vendorID, deviceID := readPCIIDs(bdfStr)
|
||||
if vendorID != 0 {
|
||||
dev.VendorID = &vendorID
|
||||
}
|
||||
if deviceID != 0 {
|
||||
dev.DeviceID = &deviceID
|
||||
}
|
||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||
if numaNode, ok := readPCINumaNode(bdfStr); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
if group, ok := readPCIIOMMUGroup(bdfStr); ok {
|
||||
dev.IOMMUGroup = &group
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
|
||||
if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok {
|
||||
dev.MaxLinkWidth = &width
|
||||
}
|
||||
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
|
||||
if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok {
|
||||
linkSpeed := normalizePCILinkSpeed(speed)
|
||||
if linkSpeed != "" {
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
}
|
||||
}
|
||||
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
|
||||
if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok {
|
||||
linkSpeed := normalizePCILinkSpeed(speed)
|
||||
if linkSpeed != "" {
|
||||
dev.MaxLinkSpeed = &linkSpeed
|
||||
@@ -173,12 +179,35 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// SVendor/SDevice available but not in schema — skip
|
||||
|
||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||
// Detect NVLink bridge mezzanine cards (CPU→HGX internal link).
|
||||
// These are Mellanox x2 devices with no host net interfaces and a DeviceName
|
||||
// containing "NVLINK". The targeted lspci call is only executed for the small
|
||||
// number of narrow-link Mellanox cards that pass the cheap pre-filter.
|
||||
if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) {
|
||||
markNVLinkBridge(&dev)
|
||||
}
|
||||
|
||||
// Warn (or Critical for NVLink bridges) if PCIe link is running below max.
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
return dev
|
||||
}
|
||||
|
||||
// readPCIIOMMUGroup returns the IOMMU group number of the PCI function at bdf.
//
// The number is the final path element of the sysfs symlink
// /sys/bus/pci/devices/<bdf>/iommu_group. The second result is false when the
// link cannot be read or its final element is not a decimal integer.
func readPCIIOMMUGroup(bdf string) (int, bool) {
	target, err := os.Readlink("/sys/bus/pci/devices/" + bdf + "/iommu_group")
	if err == nil {
		if group, convErr := strconv.Atoi(filepath.Base(target)); convErr == nil {
			return group, true
		}
	}
	return 0, false
}
|
||||
|
||||
// readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
|
||||
func readPCIIDs(bdf string) (vendorID, deviceID int) {
|
||||
base := "/sys/bus/pci/devices/" + bdf
|
||||
@@ -245,17 +274,37 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||
// speed is below the maximum negotiated speed supported by both ends.
|
||||
// applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is
|
||||
// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
|
||||
// get Critical because they are fixed internal connectors that must always train
|
||||
// to max speed — any downgrade signals a hardware fault.
|
||||
//
|
||||
// Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and
|
||||
// their link state has no operational impact. This covers management endpoints
|
||||
// (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never
|
||||
// activates but that lspci still reports with link stats.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
}
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
return
|
||||
}
|
||||
if dev.BDF != nil {
|
||||
if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 {
|
||||
return
|
||||
}
|
||||
}
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
|
||||
isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge"
|
||||
if isNVLinkBridge {
|
||||
crit := statusCritical
|
||||
dev.Status = &crit
|
||||
} else {
|
||||
warn := statusWarning
|
||||
dev.Status = &warn
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
@@ -0,0 +1,206 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
|
||||
|
||||
// isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like
|
||||
// NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces.
|
||||
// These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems.
|
||||
func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
|
||||
if !isMellanoxDevice(dev) {
|
||||
return false
|
||||
}
|
||||
if dev.LinkWidth == nil || *dev.LinkWidth > 2 {
|
||||
return false
|
||||
}
|
||||
if len(netIfacesByBDF(bdf)) > 0 {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// confirmNVLinkBridgeDeviceName runs `lspci -s <bdf> -v` and reports whether
// its output mentions "NVLINK" anywhere, compared case-insensitively. It
// returns false when the command fails. This is a targeted single-device call
// intended for devices that already passed isNVLinkBridgeCandidate.
func confirmNVLinkBridgeDeviceName(bdf string) bool {
	raw, err := exec.Command("lspci", "-s", bdf, "-v").Output()
	if err != nil {
		return false
	}
	// The needle contains no whitespace or newline, so searching the whole
	// output gives the same answer as searching it line by line.
	return strings.Contains(strings.ToUpper(string(raw)), "NVLINK")
}
|
||||
|
||||
// markNVLinkBridge overwrites device_class and adds telemetry flags on a detected
|
||||
// NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the
|
||||
// correct severity (Critical) is applied.
|
||||
func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
|
||||
class := "NVLinkBridge"
|
||||
dev.DeviceClass = &class
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
dev.Telemetry["nvlink_bridge"] = true
|
||||
}
|
||||
|
||||
// enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with
|
||||
// the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it
|
||||
// adds nvlink_topo_all_active and nvlink_topo_min_links to the telemetry, and
|
||||
// upgrades a degraded-link Warning to Critical when the fabric is also affected.
|
||||
func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
hasBridge := false
|
||||
for _, d := range devs {
|
||||
if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" {
|
||||
hasBridge = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasBridge {
|
||||
return devs
|
||||
}
|
||||
|
||||
topo, err := queryNVIDIANVLinkTopo()
|
||||
if err != nil {
|
||||
slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
|
||||
return devs
|
||||
}
|
||||
|
||||
for i := range devs {
|
||||
if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" {
|
||||
continue
|
||||
}
|
||||
if devs[i].Telemetry == nil {
|
||||
devs[i].Telemetry = map[string]any{}
|
||||
}
|
||||
devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive
|
||||
devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
|
||||
devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount
|
||||
|
||||
// If the bridge PCIe is already degraded AND the fabric is also degraded
|
||||
// (missing NVLink connections), escalate to Critical.
|
||||
if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive {
|
||||
devs[i].Telemetry["nvlink_fabric_affected"] = true
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info("nvlink-bridge: topo cross-reference applied",
|
||||
"gpu_count", topo.GPUCount,
|
||||
"all_active", topo.AllActive,
|
||||
"min_links", topo.MinNVLinks,
|
||||
)
|
||||
return devs
|
||||
}
|
||||
|
||||
// nvlinkTopoResult summarises the GPU NVLink connectivity matrix.
type nvlinkTopoResult struct {
	GPUCount   int  // number of GPU<N> columns in the matrix header
	AllActive  bool // true when at least one NV# cell was seen and none of them was NV0
	MinNVLinks int  // smallest NV# count seen in a GPU×GPU cell; 0 if NV0 or no NV# cell was seen
}

// queryNVIDIANVLinkTopo runs `nvidia-smi topo -m` and parses the NVLink matrix.
// The command's error is returned unchanged when it fails.
func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
	out, err := exec.Command("nvidia-smi", "topo", "-m").Output()
	if err != nil {
		return nvlinkTopoResult{}, err
	}
	return parseNVIDIATopologyMatrix(string(out)), nil
}

// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
// nvidia-smi topo -m matrix.
//
// Format (abbreviated):
//
//	      GPU0  GPU1 ... NIC0  NIC1  CPU Affinity  GPU NUMA ID
//	GPU0   X    NV18 ... NODE  NODE  0-23          N/A
//	GPU1  NV18   X   ... NODE  NODE  0-23          N/A
//	NIC0  NODE  NODE ...  X    PIX
//
// The header is the first line that starts with "GPU0". Only header fields of
// the exact form GPU<digits> count as GPU columns, so NIC/CPU columns and the
// words of multi-word columns such as "GPU NUMA ID" are ignored. A header with
// fewer than two GPU columns yields the zero result.
//
// Only GPU×GPU cells holding an NV# value are counted. X (self) and non-NV
// tokens (NODE, SYS, PHB, PIX) are skipped.
// NOTE(review): because non-NV tokens are skipped, a GPU pair connected only
// over PCIe does not clear AllActive — confirm this is intended.
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
	lines := strings.Split(raw, "\n")

	// Locate the header line and record which of its fields are GPU columns.
	headerIdx := -1
	var gpuColIndices []int // 0-based positions within the header fields
	for i, line := range lines {
		trimmed := strings.TrimSpace(line)
		if !strings.HasPrefix(trimmed, "GPU0") {
			continue
		}
		for j, col := range strings.Fields(trimmed) {
			if isTopoGPULabel(col) {
				gpuColIndices = append(gpuColIndices, j)
			}
		}
		if len(gpuColIndices) >= 2 {
			headerIdx = i
		}
		break // only the first GPU0 line is ever considered as the header
	}
	if headerIdx < 0 {
		return nvlinkTopoResult{}
	}

	minLinks := -1 // -1 = no NV# cell seen yet
	allActive := true

	for _, line := range lines[headerIdx+1:] {
		cells := strings.Fields(line)
		if len(cells) == 0 || !isTopoGPULabel(cells[0]) {
			continue
		}
		// cells[0] is the row label, so header field j maps to cells[j+1].
		for _, colIdx := range gpuColIndices {
			dataIdx := colIdx + 1
			if dataIdx >= len(cells) {
				continue
			}
			n, ok := topoNVLinkCount(cells[dataIdx])
			if !ok {
				continue
			}
			if n == 0 {
				allActive = false
			}
			if minLinks < 0 || n < minLinks {
				minLinks = n
			}
		}
	}

	if minLinks < 0 {
		minLinks = 0
	}

	return nvlinkTopoResult{
		GPUCount:   len(gpuColIndices),
		AllActive:  allActive && minLinks > 0,
		MinNVLinks: minLinks,
	}
}

// isTopoGPULabel reports whether s is "GPU" followed by one or more ASCII digits.
func isTopoGPULabel(s string) bool {
	return strings.HasPrefix(s, "GPU") && topoAllDigits(s[len("GPU"):])
}

// topoNVLinkCount parses an NV# matrix cell (case-insensitive "NV" followed
// only by ASCII digits) and returns the link count. ok is false for any other
// token or when the number does not fit in an int.
func topoNVLinkCount(cell string) (n int, ok bool) {
	if len(cell) < 3 || !strings.EqualFold(cell[:2], "NV") || !topoAllDigits(cell[2:]) {
		return 0, false
	}
	n, err := strconv.Atoi(cell[2:])
	if err != nil {
		return 0, false
	}
	return n, true
}

// topoAllDigits reports whether s is non-empty and consists solely of ASCII digits.
func topoAllDigits(s string) bool {
	if s == "" {
		return false
	}
	for i := 0; i < len(s); i++ {
		if s[i] < '0' || s[i] > '9' {
			return false
		}
	}
	return true
}
|
||||
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
@@ -0,0 +1,124 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseNVIDIATopologyMatrix(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// Real-world B200 HGX output: 8 GPUs, all pairs connected via NV18.
|
||||
input := ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1
|
||||
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS
|
||||
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS
|
||||
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS
|
||||
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS
|
||||
NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX
|
||||
`
|
||||
got := parseNVIDIATopologyMatrix(input)
|
||||
|
||||
if got.GPUCount != 8 {
|
||||
t.Fatalf("GPUCount=%d want 8", got.GPUCount)
|
||||
}
|
||||
if !got.AllActive {
|
||||
t.Fatalf("AllActive=false want true")
|
||||
}
|
||||
if got.MinNVLinks != 18 {
|
||||
t.Fatalf("MinNVLinks=%d want 18", got.MinNVLinks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// GPU1-GPU3 pair shows NV12 (reduced) instead of NV18.
|
||||
input := ` GPU0 GPU1 GPU2 GPU3
|
||||
GPU0 X NV18 NV18 NV18
|
||||
GPU1 NV18 X NV18 NV12
|
||||
GPU2 NV18 NV18 X NV18
|
||||
GPU3 NV18 NV12 NV18 X
|
||||
`
|
||||
got := parseNVIDIATopologyMatrix(input)
|
||||
|
||||
if got.MinNVLinks != 12 {
|
||||
t.Fatalf("MinNVLinks=%d want 12", got.MinNVLinks)
|
||||
}
|
||||
if !got.AllActive {
|
||||
t.Fatalf("AllActive=false want true (12 links is still active)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// GPU0-GPU1 pair fully disconnected (NV0).
|
||||
input := ` GPU0 GPU1
|
||||
GPU0 X NV0
|
||||
GPU1 NV0 X
|
||||
`
|
||||
got := parseNVIDIATopologyMatrix(input)
|
||||
|
||||
if got.AllActive {
|
||||
t.Fatalf("AllActive=true want false (NV0 means no links)")
|
||||
}
|
||||
if got.MinNVLinks != 0 {
|
||||
t.Fatalf("MinNVLinks=%d want 0", got.MinNVLinks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := parseNVIDIATopologyMatrix("no gpus here")
|
||||
if got.GPUCount != 0 {
|
||||
t.Fatalf("GPUCount=%d want 0", got.GPUCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
bridgeClass := "NVLinkBridge"
|
||||
linkSpeed := "Gen3"
|
||||
maxLinkSpeed := "Gen4"
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
dev.DeviceClass = &bridgeClass
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||
s := statusOK
|
||||
dev.Status = &s
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
if dev.Status == nil || *dev.Status != statusCritical {
|
||||
t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
|
||||
}
|
||||
if dev.ErrorDescription == nil {
|
||||
t.Fatal("ErrorDescription nil, want degradation message")
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
regularClass := "NetworkController"
|
||||
linkSpeed := "Gen3"
|
||||
maxLinkSpeed := "Gen4"
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
dev.DeviceClass = ®ularClass
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||
s := statusOK
|
||||
dev.Status = &s
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
if dev.Status == nil || *dev.Status != statusWarning {
|
||||
t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,8 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
@@ -10,16 +12,29 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectPSUs() []schema.HardwarePowerSupply {
|
||||
func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
|
||||
defer fruCancel()
|
||||
|
||||
if profile.fruEarlyExit {
|
||||
psus = collectFRUEarlyExit(fruCtx)
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
|
||||
if out, err := cmd.Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
sdrData := map[int]psuSDR{}
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
|
||||
defer sdrCancel()
|
||||
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
|
||||
if sdrOut, err := cmd.Output(); err == nil {
|
||||
sdrData = parsePSUSDR(string(sdrOut))
|
||||
if len(psus) == 0 {
|
||||
psus = synthesizePSUsFromSDR(sdrData)
|
||||
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
|
||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||
return nil
|
||||
}
|
||||
slog.Info("psu: collected", "count", len(psus))
|
||||
slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
|
||||
return psus
|
||||
}
|
||||
|
||||
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
|
||||
// as soon as it has found all PSU blocks and the next block is not a PSU.
|
||||
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
|
||||
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
|
||||
pipe, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
slog.Info("psu: fru pipe unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
slog.Info("psu: fru start failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
var currentBlock strings.Builder
|
||||
slot := 0
|
||||
psuFound := false
|
||||
stoppedEarly := false
|
||||
|
||||
scanner := bufio.NewScanner(pipe)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
if strings.HasPrefix(line, "FRU Device Description") {
|
||||
if currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
psuFound = true
|
||||
slot++
|
||||
}
|
||||
currentBlock.Reset()
|
||||
}
|
||||
// Stop once we've collected PSUs and hit a non-PSU block header.
|
||||
if psuFound && !isPSUHeader(strings.ToLower(line)) {
|
||||
stoppedEarly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
currentBlock.WriteString(line)
|
||||
currentBlock.WriteByte('\n')
|
||||
}
|
||||
|
||||
if !stoppedEarly && currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
}
|
||||
}
|
||||
|
||||
// Kill the process immediately on early exit rather than waiting for context timeout.
|
||||
if cmd.Process != nil {
|
||||
cmd.Process.Kill() //nolint:errcheck
|
||||
}
|
||||
cmd.Wait() //nolint:errcheck
|
||||
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
|
||||
return psus
|
||||
}
|
||||
|
||||
@@ -160,11 +234,57 @@ type psuSDR struct {
|
||||
}
|
||||
|
||||
var psuSlotPatterns = []*regexp.Regexp{
|
||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
|
||||
// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
|
||||
// does not fire after the digit; match explicitly with underscore terminator.
|
||||
regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
|
||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
|
||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), // Bay 1
|
||||
// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
|
||||
// Must be last: "power supply N" is already caught by the pattern above.
|
||||
regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
|
||||
}
|
||||
|
||||
// isPSUInputPower reports whether name, a lower-cased SDR sensor name, denotes
// an AC-input power reading. Sensor names seen across vendors:
//
//	MSI:     PSU1_POWER_IN, PSU1_PIN
//	MLT:     PSU1_PIN
//	xFusion: (matched via default fallback — no explicit keyword)
//	HPE:     PS1 Input Power, PS1 Input Watts
//
// Matching is a plain substring test, so name must already be lower-case.
func isPSUInputPower(name string) bool {
	// "power_in" also covers "_power_in", so the latter is not listed.
	keywords := [...]string{
		"input power",
		"input watts",
		"_pin",
		" pin",
		"power_in",
	}
	for _, kw := range keywords {
		if strings.Contains(name, kw) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// isPSUOutputPower reports whether name, a lower-cased SDR sensor name,
// denotes a DC-output power reading. Sensor names seen across vendors:
//
//	MSI:     PSU1_POWER_OUT
//	MLT:     PSU1_POUT
//	xFusion: PS1 POut
//
// Names containing "power supply bay" or "psu bay" are also treated as output
// power. Matching is a plain substring test, so name must already be
// lower-case.
func isPSUOutputPower(name string) bool {
	// "power_out" also covers "_power_out", so the latter is not listed.
	keywords := [...]string{
		"output power",
		"output watts",
		"_pout",
		" pout",
		"power_out",
		"power supply bay",
		"psu bay",
	}
	for _, kw := range keywords {
		if strings.Contains(name, kw) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// parseBoundedFloat parses a numeric value from an SDR value field and
|
||||
// validates it is within (0, max]. Returns nil for zero, negative, or
|
||||
// out-of-range values — these indicate missing/off/fault sensor readings.
|
||||
func parseBoundedFloat(raw string, max float64) *float64 {
|
||||
v := parseFloatPtr(raw)
|
||||
if v == nil || *v <= 0 || *v > max {
|
||||
return nil
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
@@ -194,24 +314,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
|
||||
lowerName := strings.ToLower(name)
|
||||
switch {
|
||||
case strings.Contains(lowerName, "input power"):
|
||||
entry.inputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "output power"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case isPSUInputPower(lowerName):
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
case isPSUOutputPower(lowerName):
|
||||
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||
entry.inputVoltage = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "temp"):
|
||||
entry.temperatureC = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||
entry.healthPct = parsePercentPtr(value)
|
||||
default:
|
||||
// Generic PSU power reading: sensor matched a slot pattern but carries
|
||||
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
||||
// AC input if the value looks like wattage and no better data is set yet.
|
||||
if entry.inputPowerW == nil {
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
}
|
||||
}
|
||||
out[slot] = entry
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// PSUSlotPower carries the SDR power readings of a single PSU slot.
// PSUSlotsFromSDR keys these values by the slot's 0-based index rendered as a
// string, matching HardwarePowerSupply.Slot in the audit schema.
// All three fields are omitted from JSON when unset.
type PSUSlotPower struct {
	// InputW is the input power reading; nil when none was recorded.
	InputW *float64 `json:"input_w,omitempty"`
	// OutputW is the output power reading; nil when none was recorded.
	OutputW *float64 `json:"output_w,omitempty"`
	// Status is the status text recorded for the slot, if any.
	Status string `json:"status,omitempty"`
}
|
||||
|
||||
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
||||
// using the same battle-tested slot patterns as the hardware audit collector.
|
||||
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
||||
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
||||
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
||||
sdr := parsePSUSDR(sdrOutput)
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]PSUSlotPower, len(sdr))
|
||||
for slot, entry := range sdr {
|
||||
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
||||
out[key] = PSUSlotPower{
|
||||
InputW: entry.inputPowerW,
|
||||
OutputW: entry.outputPowerW,
|
||||
Status: entry.status,
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
|
||||
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
{name: "PWS1 Status", want: 1},
|
||||
{name: "Power Supply Bay 8", want: 8},
|
||||
{name: "PS 6 Input Power", want: 6},
|
||||
// MSI underscore format — \b does not fire between digit and '_'
|
||||
{name: "PSU1_POWER_IN", want: 1},
|
||||
{name: "PSU2_POWER_OUT", want: 2},
|
||||
{name: "PSU4_STATUS", want: 4},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePSUSDRMSIFormat(t *testing.T) {
|
||||
t.Parallel()
|
||||
raw := `
|
||||
PSU1_STATUS | F1h | ok
|
||||
PSU1_POWER_OUT | 928 Watts | ok
|
||||
PSU1_POWER_IN | 976 Watts | ok
|
||||
PSU2_STATUS | F2h | ok
|
||||
PSU2_POWER_OUT | 944 Watts | ok
|
||||
PSU2_POWER_IN | 992 Watts | ok
|
||||
`
|
||||
got := parsePSUSDR(raw)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d want 2", len(got))
|
||||
}
|
||||
if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
|
||||
t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
|
||||
}
|
||||
if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
|
||||
t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
|
||||
}
|
||||
if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
|
||||
t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
|
||||
return arrays
|
||||
}
|
||||
|
||||
// collectVROCLicense runs mdadm --detail-platform and extracts the License field.
|
||||
// Returns nil when VROC is absent or the platform does not report a license.
|
||||
func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
|
||||
if !hasVROCController(pcie) {
|
||||
return nil
|
||||
}
|
||||
out, err := raidToolQuery("mdadm", "--detail-platform")
|
||||
if err != nil {
|
||||
slog.Info("vroc: mdadm --detail-platform unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
return parseMDAdmPlatformLicense(string(out))
|
||||
}
|
||||
|
||||
// parseMDAdmPlatformLicense scans raw line by line for the first line that
// starts (case-insensitively, after trimming) with "license" and has a
// non-empty value after its first colon. That value is returned trimmed and
// lower-cased; nil is returned when no such line exists.
func parseMDAdmPlatformLicense(raw string) *string {
	for _, line := range strings.Split(raw, "\n") {
		field := strings.TrimSpace(line)
		if !strings.HasPrefix(strings.ToLower(field), "license") {
			continue
		}
		_, rest, hasColon := strings.Cut(field, ":")
		if !hasColon {
			continue
		}
		// An empty value does not end the search; later lines may still match.
		if license := strings.ToLower(strings.TrimSpace(rest)); license != "" {
			return &license
		}
	}
	return nil
}
|
||||
|
||||
func queryDeviceSerial(devPath string) string {
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
var ctrl nvmeIDCtrl
|
||||
|
||||
@@ -58,7 +58,6 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||
|
||||
for _, chip := range chips {
|
||||
features := doc[chip]
|
||||
location := sensorLocation(chip)
|
||||
|
||||
keys := make([]string, 0, len(features))
|
||||
for key := range features {
|
||||
@@ -80,25 +79,25 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||
}
|
||||
switch classifySensorFeature(feature) {
|
||||
case "fan":
|
||||
item := buildFanSensor(name, location, feature)
|
||||
item := buildFanSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Fans = append(result.Fans, *item)
|
||||
case "temp":
|
||||
item := buildTempSensor(name, location, feature)
|
||||
item := buildTempSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, *item)
|
||||
case "power":
|
||||
item := buildPowerSensor(name, location, feature)
|
||||
item := buildPowerSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Power = append(result.Power, *item)
|
||||
default:
|
||||
item := buildOtherSensor(name, location, feature)
|
||||
item := buildOtherSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
||||
continue
|
||||
}
|
||||
@@ -128,14 +127,6 @@ func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// sensorLocation returns a pointer to the whitespace-trimmed chip name, or nil
// when nothing remains after trimming.
func sensorLocation(chip string) *string {
	if trimmed := strings.TrimSpace(chip); trimmed != "" {
		return &trimmed
	}
	return nil
}
|
||||
|
||||
func classifySensorFeature(feature map[string]any) string {
|
||||
for key := range feature {
|
||||
switch {
|
||||
@@ -154,24 +145,24 @@ func classifySensorFeature(feature map[string]any) string {
|
||||
return "other"
|
||||
}
|
||||
|
||||
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
|
||||
func buildFanSensor(name string, feature map[string]any) *schema.HardwareFanSensor {
|
||||
rpm, ok := firstFeatureInt(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
|
||||
item := &schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||
if status := sensorStatusFromFeature(feature); status != nil {
|
||||
item.Status = status
|
||||
}
|
||||
return item
|
||||
}
|
||||
|
||||
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||
func buildTempSensor(name string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||
celsius, ok := firstFeatureFloat(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
|
||||
item := &schema.HardwareTemperatureSensor{Name: name, Celsius: &celsius}
|
||||
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
||||
item.ThresholdWarningCelsius = &warning
|
||||
}
|
||||
@@ -186,8 +177,8 @@ func buildTempSensor(name string, location *string, feature map[string]any) *sch
|
||||
return item
|
||||
}
|
||||
|
||||
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||
item := &schema.HardwarePowerSensor{Name: name, Location: location}
|
||||
func buildPowerSensor(name string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||
item := &schema.HardwarePowerSensor{Name: name}
|
||||
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
||||
item.PowerW = &v
|
||||
}
|
||||
@@ -206,12 +197,12 @@ func buildPowerSensor(name string, location *string, feature map[string]any) *sc
|
||||
return item
|
||||
}
|
||||
|
||||
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||
func buildOtherSensor(name string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||
value, unit, ok := firstGenericSensorValue(feature)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
|
||||
item := &schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
|
||||
@@ -4,12 +4,70 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Hotplug rescan targets and the functions used to reach them. They are
// package-level variables so they can be substituted (for example by tests).
var (
	pciRescanPath      = "/sys/bus/pci/rescan"
	scsiHostScanGlob   = "/sys/class/scsi_host/host*/scan"
	hotplugWriteFile   = os.WriteFile
	hotplugExecCommand = exec.Command
	hotplugGlob        = filepath.Glob
)

// Patterns used to extract block geometry from tool output.
var (
	// nvmeLBAFCompactRE matches the "(in use)" line of the compact
	// "lbaf N : ms:<bytes> lbads:<shift>" listing.
	nvmeLBAFCompactRE = regexp.MustCompile(`(?im)^\s*lbaf\s+\d+\s*:\s*ms:(\d+)\s+lbads:(\d+).*?\(in use\)\s*$`)
	// nvmeLBAFVerboseRE matches the "(in use)" line of the verbose
	// "LBA Format N : Metadata Size: ... - Data Size: ..." listing.
	nvmeLBAFVerboseRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+\d+\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*?\(in use\)\s*$`)
	// sgReadcapBlockRE captures the logical block length in bytes.
	sgReadcapBlockRE = regexp.MustCompile(`(?im)logical block length\s*=\s*(\d+)\s+bytes`)
	// sgReadcapProtRE reports whether the output contains prot_en=1.
	sgReadcapProtRE = regexp.MustCompile(`(?im)prot_en\s*=\s*1`)
)
|
||||
|
||||
func bestEffortRescanHotplugStorage() {
|
||||
if err := hotplugWriteFile(pciRescanPath, []byte("1\n"), 0644); err != nil {
|
||||
slog.Info("storage: pci rescan skipped", "path", pciRescanPath, "err", err)
|
||||
} else {
|
||||
slog.Info("storage: triggered pci rescan for hotplug discovery")
|
||||
}
|
||||
|
||||
hostPaths, err := hotplugGlob(scsiHostScanGlob)
|
||||
if err != nil {
|
||||
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
||||
} else {
|
||||
for _, path := range hostPaths {
|
||||
// SAS HBAs (e.g. smartpqi) block indefinitely in sas_user_scan when
|
||||
// written to — SAS topology is discovered by the driver itself.
|
||||
// Detect via two methods: (1) sas_host class registration, and
|
||||
// (2) driver proc_name — smartpqi uses scsi_transport_sas but does
|
||||
// not register a sas_host object, so (1) alone misses it.
|
||||
host := filepath.Base(filepath.Dir(path))
|
||||
if _, err := os.Stat("/sys/class/sas_host/" + host); err == nil {
|
||||
slog.Info("storage: scsi host scan skipped (SAS host)", "path", path)
|
||||
continue
|
||||
}
|
||||
if procName, err := os.ReadFile("/sys/class/scsi_host/" + host + "/proc_name"); err == nil {
|
||||
switch strings.TrimSpace(string(procName)) {
|
||||
case "smartpqi", "hpsa":
|
||||
slog.Info("storage: scsi host scan skipped (SAS transport driver)",
|
||||
"path", path, "driver", strings.TrimSpace(string(procName)))
|
||||
continue
|
||||
}
|
||||
}
|
||||
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
||||
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
||||
continue
|
||||
}
|
||||
slog.Info("storage: triggered scsi host scan", "path", path)
|
||||
}
|
||||
}
|
||||
|
||||
out, err := hotplugExecCommand("udevadm", "settle", "--timeout=10").CombinedOutput()
|
||||
if err != nil {
|
||||
slog.Info("storage: udev settle after hotplug rescan failed", "err", err, "output", strings.TrimSpace(string(out)))
|
||||
}
|
||||
}
|
||||
|
||||
func collectStorage() []schema.HardwareStorage {
|
||||
devs := discoverStorageDevices()
|
||||
result := make([]schema.HardwareStorage, 0, len(devs))
|
||||
@@ -26,15 +84,41 @@ func collectStorage() []schema.HardwareStorage {
|
||||
return result
|
||||
}
|
||||
|
||||
// jsonInt64 accepts both a bare JSON number and a JSON-quoted number string.
// lsblk -J emits LOG-SEC / PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12)
// but older versions emit them as strings. This type handles both.
type jsonInt64 int64

// UnmarshalJSON decodes data into j and never returns an error, so a single
// odd field cannot make the enclosing document fail to decode.
//
// Accepted forms are a bare or quoted integer (512, "512") and a bare or quoted
// number in float or exponent notation (512.0, "1e3"), which is truncated
// toward zero. JSON null sets j to zero. Anything else — other JSON types,
// non-numeric strings, values outside the int64 range — leaves j untouched.
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
	text := strings.TrimSpace(string(data))
	if text == "null" {
		*j = 0
		return nil
	}
	if strings.HasPrefix(text, `"`) {
		// Quoted form: let encoding/json undo the string escaping first.
		var quoted string
		if err := json.Unmarshal(data, &quoted); err != nil {
			return nil
		}
		text = strings.TrimSpace(quoted)
	}
	if n, err := strconv.ParseInt(text, 10, 64); err == nil {
		*j = jsonInt64(n)
		return nil
	}
	// Fall back to float syntax; the range guard also rejects ±Inf and NaN, and
	// keeps the float→int conversion well-defined.
	if f, err := strconv.ParseFloat(text, 64); err == nil && f >= -(1<<63) && f < 1<<63 {
		*j = jsonInt64(f)
	}
	return nil
}
|
||||
|
||||
// lsblkDevice is a minimal lsblk JSON record.
|
||||
type lsblkDevice struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
LogSec jsonInt64 `json:"log-sec"`
|
||||
PhySec jsonInt64 `json:"phy-sec"`
|
||||
}
|
||||
|
||||
type lsblkRoot struct {
|
||||
@@ -101,7 +185,7 @@ func isVirtualHDiskModel(model string) bool {
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL,LOG-SEC,PHY-SEC").Output()
|
||||
if err != nil {
|
||||
slog.Warn("storage: lsblk failed", "err", err)
|
||||
return nil
|
||||
@@ -208,6 +292,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
s := schema.HardwareStorage{Present: &present}
|
||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||
applyStorageBlockGeometry(&s, dev)
|
||||
|
||||
tran := strings.ToLower(dev.Tran)
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -250,6 +335,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
|
||||
var info smartctlInfo
|
||||
var raw map[string]any
|
||||
_ = json.Unmarshal(out, &raw)
|
||||
if err := json.Unmarshal(out, &info); err == nil {
|
||||
if v := cleanDMIValue(info.ModelName); v != "" {
|
||||
s.Model = &v
|
||||
@@ -302,8 +389,11 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
value := float64(attr.Raw.Value)
|
||||
s.LifeRemainingPct = &value
|
||||
case 241:
|
||||
value := attr.Raw.Value
|
||||
value := smartLBAsToBytes(attr.Raw.Value)
|
||||
s.WrittenBytes = &value
|
||||
case 242:
|
||||
value := smartLBAsToBytes(attr.Raw.Value)
|
||||
s.ReadBytes = &value
|
||||
case 197:
|
||||
pending = attr.Raw.Value
|
||||
s.CurrentPendingSectors = &pending
|
||||
@@ -321,6 +411,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
offlineUncorrectable: uncorrectable,
|
||||
lifeRemainingPct: lifeRemaining,
|
||||
}
|
||||
applySCSISmartctlTelemetry(&s, raw, &status)
|
||||
applySCSIProtectionBlockGeometry(&s, devPath)
|
||||
setStorageHealthStatus(&s, status)
|
||||
return s
|
||||
}
|
||||
@@ -332,20 +424,23 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
|
||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
||||
// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"),
|
||||
// so all numeric fields use jsonInt64 which accepts both bare numbers and
|
||||
// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose.
|
||||
type nvmeSmartLog struct {
|
||||
CriticalWarning int `json:"critical_warning"`
|
||||
PercentageUsed int `json:"percentage_used"`
|
||||
AvailableSpare int `json:"available_spare"`
|
||||
SpareThreshold int `json:"spare_thresh"`
|
||||
Temperature int64 `json:"temperature"`
|
||||
PowerOnHours int64 `json:"power_on_hours"`
|
||||
PowerCycles int64 `json:"power_cycles"`
|
||||
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead int64 `json:"data_units_read"`
|
||||
DataUnitsWritten int64 `json:"data_units_written"`
|
||||
ControllerBusy int64 `json:"controller_busy_time"`
|
||||
MediaErrors int64 `json:"media_errors"`
|
||||
NumErrLogEntries int64 `json:"num_err_log_entries"`
|
||||
CriticalWarning jsonInt64 `json:"critical_warning"`
|
||||
PercentageUsed jsonInt64 `json:"percent_used"`
|
||||
AvailableSpare jsonInt64 `json:"avail_spare"`
|
||||
SpareThreshold jsonInt64 `json:"spare_thresh"`
|
||||
Temperature jsonInt64 `json:"temperature"`
|
||||
PowerOnHours jsonInt64 `json:"power_on_hours"`
|
||||
PowerCycles jsonInt64 `json:"power_cycles"`
|
||||
UnsafeShutdowns jsonInt64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead jsonInt64 `json:"data_units_read"`
|
||||
DataUnitsWritten jsonInt64 `json:"data_units_written"`
|
||||
ControllerBusy jsonInt64 `json:"controller_busy_time"`
|
||||
MediaErrors jsonInt64 `json:"media_errors"`
|
||||
NumErrLogEntries jsonInt64 `json:"num_err_log_entries"`
|
||||
}
|
||||
|
||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
||||
@@ -368,6 +463,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
Interface: &iface,
|
||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||
}
|
||||
applyStorageBlockGeometry(&s, dev)
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
||||
@@ -402,19 +498,23 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
}
|
||||
}
|
||||
applyNVMeBlockGeometry(&s, devPath)
|
||||
|
||||
// smart-log: wear telemetry
|
||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||
var log nvmeSmartLog
|
||||
if json.Unmarshal(out, &log) == nil {
|
||||
if log.PowerOnHours > 0 {
|
||||
s.PowerOnHours = &log.PowerOnHours
|
||||
v := int64(log.PowerOnHours)
|
||||
s.PowerOnHours = &v
|
||||
}
|
||||
if log.PowerCycles > 0 {
|
||||
s.PowerCycles = &log.PowerCycles
|
||||
v := int64(log.PowerCycles)
|
||||
s.PowerCycles = &v
|
||||
}
|
||||
if log.UnsafeShutdowns > 0 {
|
||||
s.UnsafeShutdowns = &log.UnsafeShutdowns
|
||||
v := int64(log.UnsafeShutdowns)
|
||||
s.UnsafeShutdowns = &v
|
||||
}
|
||||
if log.PercentageUsed > 0 {
|
||||
v := float64(log.PercentageUsed)
|
||||
@@ -423,11 +523,11 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
s.LifeRemainingPct = &remaining
|
||||
}
|
||||
if log.DataUnitsWritten > 0 {
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
|
||||
v := nvmeDataUnitsToBytes(int64(log.DataUnitsWritten))
|
||||
s.WrittenBytes = &v
|
||||
}
|
||||
if log.DataUnitsRead > 0 {
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
|
||||
v := nvmeDataUnitsToBytes(int64(log.DataUnitsRead))
|
||||
s.ReadBytes = &v
|
||||
}
|
||||
if log.AvailableSpare > 0 {
|
||||
@@ -435,23 +535,25 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
s.AvailableSparePct = &v
|
||||
}
|
||||
if log.MediaErrors > 0 {
|
||||
s.MediaErrors = &log.MediaErrors
|
||||
v := int64(log.MediaErrors)
|
||||
s.MediaErrors = &v
|
||||
}
|
||||
if log.NumErrLogEntries > 0 {
|
||||
s.ErrorLogEntries = &log.NumErrLogEntries
|
||||
v := int64(log.NumErrLogEntries)
|
||||
s.ErrorLogEntries = &v
|
||||
}
|
||||
if log.Temperature > 0 {
|
||||
v := float64(log.Temperature - 273)
|
||||
s.TemperatureC = &v
|
||||
}
|
||||
setStorageHealthStatus(&s, storageHealthStatus{
|
||||
criticalWarning: log.CriticalWarning,
|
||||
criticalWarning: int(log.CriticalWarning),
|
||||
percentageUsed: int64(log.PercentageUsed),
|
||||
availableSpare: int64(log.AvailableSpare),
|
||||
spareThreshold: int64(log.SpareThreshold),
|
||||
unsafeShutdowns: log.UnsafeShutdowns,
|
||||
mediaErrors: log.MediaErrors,
|
||||
errorLogEntries: log.NumErrLogEntries,
|
||||
unsafeShutdowns: int64(log.UnsafeShutdowns),
|
||||
mediaErrors: int64(log.MediaErrors),
|
||||
errorLogEntries: int64(log.NumErrLogEntries),
|
||||
})
|
||||
return s
|
||||
}
|
||||
@@ -477,6 +579,251 @@ func nvmeDataUnitsToBytes(units int64) int64 {
|
||||
return units * 512000
|
||||
}
|
||||
|
||||
// smartLBAsToBytes converts a SMART LBA counter to bytes using a fixed
// 512 bytes per LBA. Zero and negative counts yield 0. Counts too large for
// the product to fit in an int64 saturate at the maximum int64 instead of
// wrapping around to a bogus (possibly zero or negative) byte total.
func smartLBAsToBytes(lbas int64) int64 {
	const (
		bytesPerLBA = 512
		maxInt64    = 1<<63 - 1
	)
	if lbas <= 0 {
		return 0
	}
	if lbas > maxInt64/bytesPerLBA {
		return maxInt64
	}
	return lbas * bytesPerLBA
}
|
||||
|
||||
func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, status *storageHealthStatus) {
|
||||
if s == nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:power_on_time.hours",
|
||||
"path:accumulated_power_on_time.hours",
|
||||
"path:power_on_time.hour",
|
||||
"path:accumulated_power_on_time.hour",
|
||||
); ok && v > 0 && s.PowerOnHours == nil {
|
||||
s.PowerOnHours = &v
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:power_cycle_count",
|
||||
"path:start_stop_cycle_count",
|
||||
"path:accumulated_start_stop_cycles",
|
||||
); ok && v > 0 && s.PowerCycles == nil {
|
||||
s.PowerCycles = &v
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:scsi_grown_defect_list",
|
||||
"path:grown_defect_list",
|
||||
); ok && v > 0 && s.ReallocatedSectors == nil {
|
||||
s.ReallocatedSectors = &v
|
||||
if status != nil && status.reallocatedSectors == 0 {
|
||||
status.reallocatedSectors = v
|
||||
}
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:percentage_used_endurance_indicator",
|
||||
"path:scsi_percentage_used_endurance_indicator",
|
||||
); ok && v > 0 {
|
||||
if s.LifeUsedPct == nil {
|
||||
fv := float64(v)
|
||||
s.LifeUsedPct = &fv
|
||||
}
|
||||
if s.LifeRemainingPct == nil && v <= 100 {
|
||||
remaining := float64(100 - v)
|
||||
s.LifeRemainingPct = &remaining
|
||||
if status != nil && status.lifeRemainingPct == 0 {
|
||||
status.lifeRemainingPct = int64(remaining)
|
||||
}
|
||||
}
|
||||
}
|
||||
blockSize, hasBlockSize := firstInt64(raw,
|
||||
"path:logical_block_size",
|
||||
"path:block_size",
|
||||
"path:user_capacity.block_size",
|
||||
)
|
||||
if hasBlockSize && blockSize > 0 {
|
||||
if s.LogicalBlockSizeBytes == nil {
|
||||
s.LogicalBlockSizeBytes = &blockSize
|
||||
}
|
||||
if s.MetadataBytesPerBlock == nil {
|
||||
zero := int64(0)
|
||||
s.MetadataBytesPerBlock = &zero
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
s.Telemetry["logical_block_size_bytes"] = *s.LogicalBlockSizeBytes
|
||||
s.Telemetry["metadata_bytes_per_block"] = *s.MetadataBytesPerBlock
|
||||
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:logical_blocks_written",
|
||||
"path:total_lbas_written",
|
||||
); ok && v > 0 && s.WrittenBytes == nil {
|
||||
bytes := v * blockSize
|
||||
s.WrittenBytes = &bytes
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:logical_blocks_read",
|
||||
"path:total_lbas_read",
|
||||
); ok && v > 0 && s.ReadBytes == nil {
|
||||
bytes := v * blockSize
|
||||
s.ReadBytes = &bytes
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
logical := int64(dev.LogSec)
|
||||
physical := int64(dev.PhySec)
|
||||
if logical <= 0 && physical <= 0 {
|
||||
return
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
if logical > 0 {
|
||||
s.LogicalBlockSizeBytes = &logical
|
||||
s.Telemetry["logical_block_size_bytes"] = logical
|
||||
if s.MetadataBytesPerBlock == nil {
|
||||
zero := int64(0)
|
||||
s.MetadataBytesPerBlock = &zero
|
||||
s.Telemetry["metadata_bytes_per_block"] = zero
|
||||
}
|
||||
}
|
||||
if physical > 0 {
|
||||
s.PhysicalBlockSizeBytes = &physical
|
||||
s.Telemetry["physical_block_size_bytes"] = physical
|
||||
}
|
||||
if s.LogicalBlockSizeBytes != nil && s.MetadataBytesPerBlock != nil {
|
||||
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||
}
|
||||
}
|
||||
|
||||
func applyNVMeBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||
return
|
||||
}
|
||||
out, err := exec.Command("nvme", "id-ns", devPath, "-H").CombinedOutput()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(string(out))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
func applySCSIProtectionBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||
return
|
||||
}
|
||||
out, err := exec.Command("sg_readcap", "-l", devPath).CombinedOutput()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(string(out))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
func setStorageBlockGeometry(s *schema.HardwareStorage, dataBytes, metadataBytes int64) {
|
||||
if s == nil || dataBytes <= 0 || metadataBytes < 0 {
|
||||
return
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
s.LogicalBlockSizeBytes = &dataBytes
|
||||
s.MetadataBytesPerBlock = &metadataBytes
|
||||
s.Telemetry["logical_block_size_bytes"] = dataBytes
|
||||
s.Telemetry["metadata_bytes_per_block"] = metadataBytes
|
||||
s.Telemetry["block_format"] = formatBlockFormat(dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
// formatBlockFormat renders a block geometry as "<data>+<metadata>" in decimal,
// for example "512+8".
func formatBlockFormat(dataBytes, metadataBytes int64) string {
	buf := strconv.AppendInt(nil, dataBytes, 10)
	buf = append(buf, '+')
	buf = strconv.AppendInt(buf, metadataBytes, 10)
	return string(buf)
}
|
||||
|
||||
func parseNVMeBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||
if m := nvmeLBAFCompactRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||
lbads, errLBADS := strconv.ParseInt(m[2], 10, 64)
|
||||
if errMS == nil && errLBADS == nil && lbads >= 0 && lbads < 63 {
|
||||
return 1 << lbads, ms, true
|
||||
}
|
||||
}
|
||||
if m := nvmeLBAFVerboseRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||
ds, errDS := strconv.ParseInt(m[2], 10, 64)
|
||||
if errMS == nil && errDS == nil && ds > 0 {
|
||||
return ds, ms, true
|
||||
}
|
||||
}
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
func parseSCSIBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||
m := sgReadcapBlockRE.FindStringSubmatch(raw)
|
||||
if len(m) != 2 {
|
||||
return 0, 0, false
|
||||
}
|
||||
blockBytes, err := strconv.ParseInt(m[1], 10, 64)
|
||||
if err != nil || blockBytes <= 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
if sgReadcapProtRE.MatchString(raw) {
|
||||
return blockBytes, 8, true
|
||||
}
|
||||
return blockBytes, 0, true
|
||||
}
|
||||
|
||||
func firstInt64(root map[string]any, candidates ...string) (int64, bool) {
|
||||
for _, candidate := range candidates {
|
||||
if !strings.HasPrefix(candidate, "path:") {
|
||||
continue
|
||||
}
|
||||
path := strings.TrimPrefix(candidate, "path:")
|
||||
if v, ok := nestedInt64(root, strings.Split(path, ".")); ok {
|
||||
return v, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// nestedInt64 walks root along path, treating every step as a key into a
// nested map[string]any, and converts the value found at the end to int64.
//
// Integer and floating-point values are converted directly (floats are
// truncated), json.Number and string values are parsed as base-10 integers
// (strings are trimmed first). The boolean is false when a key is missing, an
// intermediate value is not an object, parsing fails, or the final value has
// any other type.
func nestedInt64(root map[string]any, path []string) (int64, bool) {
	var node any = root
	for _, key := range path {
		object, isObject := node.(map[string]any)
		if !isObject {
			return 0, false
		}
		child, present := object[key]
		if !present {
			return 0, false
		}
		node = child
	}

	switch value := node.(type) {
	case int64:
		return value, true
	case int32:
		return int64(value), true
	case int:
		return int64(value), true
	case float32:
		return int64(value), true
	case float64:
		return int64(value), true
	case string:
		parsed, err := strconv.ParseInt(strings.TrimSpace(value), 10, 64)
		return parsed, err == nil
	case json.Number:
		parsed, err := value.Int64()
		return parsed, err == nil
	}
	return 0, false
}
|
||||
|
||||
type storageHealthStatus struct {
|
||||
hasOverall bool
|
||||
overallPassed bool
|
||||
|
||||
69
audit/internal/collector/storage_block_format_test.go
Normal file
69
audit/internal/collector/storage_block_format_test.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseNVMeBlockFormatCompact(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 0 {
|
||||
t.Fatalf("got %d+%d want 512+0", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVMeBlockFormatVerbose(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
LBA Format 0 : Metadata Size: 8 bytes - Data Size: 512 bytes - Relative Performance: 0 Better (in use)
|
||||
LBA Format 1 : Metadata Size: 0 bytes - Data Size: 4096 bytes - Relative Performance: 1 Best
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 8 {
|
||||
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSCSIBlockFormatWithProtection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
Read Capacity results:
|
||||
Protection: prot_en=1, p_type=1, p_i_exponent=0
|
||||
Logical block length=512 bytes
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 8 {
|
||||
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSCSIBlockFormatWithoutProtection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
Read Capacity results:
|
||||
Protection: prot_en=0, p_type=0, p_i_exponent=0
|
||||
Logical block length=4096 bytes
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 4096 || metadataBytes != 0 {
|
||||
t.Fatalf("got %d+%d want 4096+0", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,13 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -31,3 +38,130 @@ func TestParseStorageBytes(t *testing.T) {
|
||||
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// util-linux ≥ 2.37 emits LOG-SEC / PHY-SEC as bare JSON numbers.
|
||||
// Older versions emit quoted strings. Both must parse without error
|
||||
// so that the entire lsblkDevices() call does not return nil on Debian 12.
|
||||
cases := []struct {
|
||||
json string
|
||||
want int64
|
||||
}{
|
||||
{`512`, 512},
|
||||
{`4096`, 4096},
|
||||
{`"512"`, 512},
|
||||
{`"4096"`, 4096},
|
||||
{`null`, 0},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
var v jsonInt64
|
||||
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
||||
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
||||
}
|
||||
if int64(v) != tc.want {
|
||||
t.Fatalf("UnmarshalJSON(%s)=%d want %d", tc.json, int64(v), tc.want)
|
||||
}
|
||||
}
|
||||
|
||||
// Simulate the exact JSON shape that triggered the bug on Debian 12.
|
||||
input := []byte(`{
|
||||
"blockdevices": [
|
||||
{"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096},
|
||||
{"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096}
|
||||
]
|
||||
}`)
|
||||
var root lsblkRoot
|
||||
if err := json.Unmarshal(input, &root); err != nil {
|
||||
t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err)
|
||||
}
|
||||
if len(root.Blockdevices) != 2 {
|
||||
t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices))
|
||||
}
|
||||
if int64(root.Blockdevices[0].LogSec) != 512 {
|
||||
t.Fatalf("LogSec=%d want 512", root.Blockdevices[0].LogSec)
|
||||
}
|
||||
if int64(root.Blockdevices[0].PhySec) != 4096 {
|
||||
t.Fatalf("PhySec=%d want 4096", root.Blockdevices[0].PhySec)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
rescanPath := filepath.Join(tmp, "pci-rescan")
|
||||
scanDir := filepath.Join(tmp, "scsi_host")
|
||||
host0Path := filepath.Join(scanDir, "host0", "scan")
|
||||
host1Path := filepath.Join(scanDir, "host1", "scan")
|
||||
argsPath := filepath.Join(tmp, "udevadm-args")
|
||||
toolPath := filepath.Join(tmp, "udevadm")
|
||||
if err := os.MkdirAll(filepath.Dir(host0Path), 0755); err != nil {
|
||||
t.Fatalf("mkdir host0: %v", err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(host1Path), 0755); err != nil {
|
||||
t.Fatalf("mkdir host1: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(host0Path, nil, 0644); err != nil {
|
||||
t.Fatalf("touch host0 scan: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(host1Path, nil, 0644); err != nil {
|
||||
t.Fatalf("touch host1 scan: %v", err)
|
||||
}
|
||||
script := "#!/bin/sh\nprintf '%s' \"$*\" > \"" + argsPath + "\"\n"
|
||||
if err := os.WriteFile(toolPath, []byte(script), 0755); err != nil {
|
||||
t.Fatalf("write udevadm stub: %v", err)
|
||||
}
|
||||
|
||||
oldPath := os.Getenv("PATH")
|
||||
if err := os.Setenv("PATH", tmp+string(os.PathListSeparator)+oldPath); err != nil {
|
||||
t.Fatalf("set PATH: %v", err)
|
||||
}
|
||||
defer func() { _ = os.Setenv("PATH", oldPath) }()
|
||||
|
||||
oldRescanPath := pciRescanPath
|
||||
oldSCSIGlob := scsiHostScanGlob
|
||||
oldWriteFile := hotplugWriteFile
|
||||
oldExecCommand := hotplugExecCommand
|
||||
oldGlob := hotplugGlob
|
||||
pciRescanPath = rescanPath
|
||||
scsiHostScanGlob = filepath.Join(scanDir, "host*", "scan")
|
||||
hotplugWriteFile = os.WriteFile
|
||||
hotplugExecCommand = exec.Command
|
||||
hotplugGlob = filepath.Glob
|
||||
defer func() {
|
||||
pciRescanPath = oldRescanPath
|
||||
scsiHostScanGlob = oldSCSIGlob
|
||||
hotplugWriteFile = oldWriteFile
|
||||
hotplugExecCommand = oldExecCommand
|
||||
hotplugGlob = oldGlob
|
||||
}()
|
||||
|
||||
bestEffortRescanHotplugStorage()
|
||||
|
||||
raw, err := os.ReadFile(rescanPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read rescan file: %v", err)
|
||||
}
|
||||
if string(raw) != "1\n" {
|
||||
t.Fatalf("rescan payload=%q want %q", string(raw), "1\n")
|
||||
}
|
||||
for _, path := range []string{host0Path, host1Path} {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read scsi scan file %s: %v", path, err)
|
||||
}
|
||||
if string(raw) != "- - -\n" {
|
||||
t.Fatalf("scsi scan payload at %s =%q want %q", path, string(raw), "- - -\n")
|
||||
}
|
||||
}
|
||||
|
||||
args, err := os.ReadFile(argsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read udevadm args: %v", err)
|
||||
}
|
||||
if got := strings.TrimSpace(string(args)); got != "settle --timeout=10" {
|
||||
t.Fatalf("udevadm args=%q want %q", got, "settle --timeout=10")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,65 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
|
||||
// counters are quoted strings and field names differ from NVMe spec prose)
|
||||
// is correctly parsed into nvmeSmartLog.
|
||||
func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// Real nvme-cli output: counters are JSON strings, spare is "avail_spare",
|
||||
// percentage used is "percent_used".
|
||||
raw := `{
|
||||
"critical_warning": 0,
|
||||
"temperature": 310,
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 5,
|
||||
"percent_used": 0,
|
||||
"data_units_read": "10925415",
|
||||
"data_units_written": "8497672",
|
||||
"controller_busy_time": "305",
|
||||
"power_cycles": "53",
|
||||
"power_on_hours": "49",
|
||||
"unsafe_shutdowns": "22",
|
||||
"media_errors": "0",
|
||||
"num_err_log_entries": "0"
|
||||
}`
|
||||
var log nvmeSmartLog
|
||||
if err := json.Unmarshal([]byte(raw), &log); err != nil {
|
||||
t.Fatalf("json.Unmarshal failed: %v", err)
|
||||
}
|
||||
if log.PowerOnHours != 49 {
|
||||
t.Errorf("PowerOnHours=%d want 49", log.PowerOnHours)
|
||||
}
|
||||
if log.PowerCycles != 53 {
|
||||
t.Errorf("PowerCycles=%d want 53", log.PowerCycles)
|
||||
}
|
||||
if log.AvailableSpare != 100 {
|
||||
t.Errorf("AvailableSpare=%d want 100", log.AvailableSpare)
|
||||
}
|
||||
if log.SpareThreshold != 5 {
|
||||
t.Errorf("SpareThreshold=%d want 5", log.SpareThreshold)
|
||||
}
|
||||
if log.PercentageUsed != 0 {
|
||||
t.Errorf("PercentageUsed=%d want 0", log.PercentageUsed)
|
||||
}
|
||||
if log.Temperature != 310 {
|
||||
t.Errorf("Temperature=%d want 310", log.Temperature)
|
||||
}
|
||||
if log.MediaErrors != 0 {
|
||||
t.Errorf("MediaErrors=%d want 0", log.MediaErrors)
|
||||
}
|
||||
if log.UnsafeShutdowns != 22 {
|
||||
t.Errorf("UnsafeShutdowns=%d want 22", log.UnsafeShutdowns)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetStorageHealthStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
101
audit/internal/collector/storage_scsi_test.go
Normal file
101
audit/internal/collector/storage_scsi_test.go
Normal file
@@ -0,0 +1,101 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestApplySCSISmartctlTelemetry(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := map[string]any{
|
||||
"power_on_time": map[string]any{
|
||||
"hours": float64(32123),
|
||||
},
|
||||
"accumulated_start_stop_cycles": float64(17),
|
||||
"scsi_grown_defect_list": float64(4),
|
||||
"percentage_used_endurance_indicator": float64(12),
|
||||
"logical_block_size": float64(4096),
|
||||
"logical_blocks_written": float64(1000),
|
||||
"logical_blocks_read": float64(2000),
|
||||
}
|
||||
|
||||
var disk schema.HardwareStorage
|
||||
status := storageHealthStatus{}
|
||||
applySCSISmartctlTelemetry(&disk, raw, &status)
|
||||
|
||||
if disk.PowerOnHours == nil || *disk.PowerOnHours != 32123 {
|
||||
t.Fatalf("power_on_hours=%v want 32123", disk.PowerOnHours)
|
||||
}
|
||||
if disk.PowerCycles == nil || *disk.PowerCycles != 17 {
|
||||
t.Fatalf("power_cycles=%v want 17", disk.PowerCycles)
|
||||
}
|
||||
if disk.ReallocatedSectors == nil || *disk.ReallocatedSectors != 4 {
|
||||
t.Fatalf("reallocated=%v want 4", disk.ReallocatedSectors)
|
||||
}
|
||||
if disk.WrittenBytes == nil || *disk.WrittenBytes != 4096000 {
|
||||
t.Fatalf("written_bytes=%v want 4096000", disk.WrittenBytes)
|
||||
}
|
||||
if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 {
|
||||
t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes)
|
||||
}
|
||||
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 4096 {
|
||||
t.Fatalf("logical_block_size_bytes=%v want 4096", disk.LogicalBlockSizeBytes)
|
||||
}
|
||||
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||
}
|
||||
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 {
|
||||
t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct)
|
||||
}
|
||||
if disk.LifeRemainingPct == nil || *disk.LifeRemainingPct != 88 {
|
||||
t.Fatalf("life_remaining_pct=%v want 88", disk.LifeRemainingPct)
|
||||
}
|
||||
if status.reallocatedSectors != 4 {
|
||||
t.Fatalf("status.reallocated=%d want 4", status.reallocatedSectors)
|
||||
}
|
||||
if status.lifeRemainingPct != 88 {
|
||||
t.Fatalf("status.life_remaining_pct=%d want 88", status.lifeRemainingPct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
powerOnHours := int64(10)
|
||||
writtenBytes := int64(20)
|
||||
lifeRemaining := 30.0
|
||||
disk := schema.HardwareStorage{
|
||||
PowerOnHours: &powerOnHours,
|
||||
WrittenBytes: &writtenBytes,
|
||||
LifeRemainingPct: &lifeRemaining,
|
||||
}
|
||||
raw := map[string]any{
|
||||
"power_on_time": map[string]any{"hours": float64(999)},
|
||||
"logical_block_size": float64(512),
|
||||
"logical_blocks_written": float64(999),
|
||||
"percentage_used_endurance_indicator": float64(50),
|
||||
}
|
||||
|
||||
applySCSISmartctlTelemetry(&disk, raw, nil)
|
||||
|
||||
if *disk.PowerOnHours != 10 {
|
||||
t.Fatalf("power_on_hours overwritten: got %d want 10", *disk.PowerOnHours)
|
||||
}
|
||||
if *disk.WrittenBytes != 20 {
|
||||
t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
|
||||
}
|
||||
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 512 {
|
||||
t.Fatalf("logical_block_size_bytes=%v want 512", disk.LogicalBlockSizeBytes)
|
||||
}
|
||||
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||
}
|
||||
if *disk.LifeRemainingPct != 30 {
|
||||
t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
|
||||
}
|
||||
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 50 {
|
||||
t.Fatalf("life_used_pct=%v want 50", disk.LifeUsedPct)
|
||||
}
|
||||
}
|
||||
25
audit/internal/collector/storage_telemetry_test.go
Normal file
25
audit/internal/collector/storage_telemetry_test.go
Normal file
@@ -0,0 +1,25 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestSmartLBAsToBytes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
lbas int64
|
||||
want int64
|
||||
}{
|
||||
{name: "zero", lbas: 0, want: 0},
|
||||
{name: "single lba", lbas: 1, want: 512},
|
||||
{name: "multiple lbas", lbas: 2048, want: 1048576},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := smartLBAsToBytes(tt.lbas); got != tt.want {
|
||||
t.Fatalf("smartLBAsToBytes(%d)=%d want %d", tt.lbas, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
27
audit/internal/collector/testdata/dmidecode_type0_dell.txt
vendored
Normal file
27
audit/internal/collector/testdata/dmidecode_type0_dell.txt
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
# dmidecode 3.2
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x0000, DMI type 0, 26 bytes
|
||||
BIOS Information
|
||||
Vendor: Dell Inc.
|
||||
Version: 2.5.4
|
||||
Release Date: 01/13/2020
|
||||
Address: 0xF0000
|
||||
Runtime Size: 64 kB
|
||||
ROM Size: 32 MB
|
||||
Characteristics:
|
||||
ISA is supported
|
||||
PCI is supported
|
||||
PNP is supported
|
||||
BIOS is upgradeable
|
||||
BIOS shadowing is allowed
|
||||
Boot from CD is supported
|
||||
Selectable boot is supported
|
||||
EDD is supported
|
||||
ACPI is supported
|
||||
USB legacy is supported
|
||||
BIOS boot specification is supported
|
||||
Targeted content distribution is supported
|
||||
UEFI is supported
|
||||
BIOS Revision: 2.5
|
||||
59
audit/internal/collector/testdata/dmidecode_type17_mixed.txt
vendored
Normal file
59
audit/internal/collector/testdata/dmidecode_type17_mixed.txt
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
# dmidecode 3.1
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 2.8 present.
|
||||
|
||||
Handle 0x0026, DMI type 17, 40 bytes
|
||||
Memory Device
|
||||
Array Handle: 0x0025
|
||||
Error Information Handle: Not Provided
|
||||
Total Width: 72 bits
|
||||
Data Width: 64 bits
|
||||
Size: 16 GB
|
||||
Form Factor: DIMM
|
||||
Set: None
|
||||
Locator: P1-DIMMA1
|
||||
Bank Locator: P0_Node0_Channel0_Dimm0
|
||||
Type: DDR4
|
||||
Type Detail: Synchronous
|
||||
Speed: 2133 MT/s
|
||||
Manufacturer: Micron
|
||||
Serial Number: 1A2B3C4D
|
||||
Asset Tag: Not Specified
|
||||
Part Number: 36ASF2G72PZ-2G1A2
|
||||
Rank: 2
|
||||
Configured Memory Speed: 2133 MT/s
|
||||
|
||||
Handle 0x0027, DMI type 17, 40 bytes
|
||||
Memory Device
|
||||
Array Handle: 0x0025
|
||||
Error Information Handle: Not Provided
|
||||
Total Width: Unknown
|
||||
Data Width: Unknown
|
||||
Size: No Module Installed
|
||||
Form Factor: DIMM
|
||||
Set: None
|
||||
Locator: P1-DIMMA2
|
||||
Bank Locator: P0_Node0_Channel0_Dimm1
|
||||
Type: DDR4
|
||||
Type Detail: Synchronous
|
||||
|
||||
Handle 0x0028, DMI type 17, 84 bytes
|
||||
Memory Device
|
||||
Array Handle: 0x0025
|
||||
Error Information Handle: Not Provided
|
||||
Total Width: 72 bits
|
||||
Data Width: 64 bits
|
||||
Size: 32768 MB
|
||||
Form Factor: DIMM
|
||||
Set: 1
|
||||
Locator: A1
|
||||
Bank Locator: Not Specified
|
||||
Type: DDR4
|
||||
Type Detail: Synchronous Registered (Buffered)
|
||||
Speed: 2933 MT/s
|
||||
Manufacturer: Samsung
|
||||
Serial Number: 5E6F7A8B
|
||||
Asset Tag: Not Specified
|
||||
Part Number: M393A4K40CB2-CVF
|
||||
Rank: 2
|
||||
Configured Memory Speed: 2400 MT/s
|
||||
14
audit/internal/collector/testdata/dmidecode_type1_dell.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_dell.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# dmidecode 3.2
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x0100, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: Dell Inc.
|
||||
Product Name: PowerEdge R740xd
|
||||
Version: Not Specified
|
||||
Serial Number: 7SG9F63
|
||||
UUID: b1c2d3e4-f5a6-7890-bcde-f12345678901
|
||||
Wake-up Type: Power Switch
|
||||
SKU Number: SKU=NotProvided;ModelName=PowerEdge R740xd
|
||||
Family: PowerEdge
|
||||
14
audit/internal/collector/testdata/dmidecode_type1_hpe.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_hpe.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# dmidecode 3.3
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x008E, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: HPE
|
||||
Product Name: ProLiant DL380 Gen10
|
||||
Version: Not Specified
|
||||
Serial Number: CZJ9320CXN
|
||||
UUID: c2d3e4f5-a6b7-8901-cdef-012345678902
|
||||
Wake-up Type: Power Switch
|
||||
SKU Number: 868703-B21
|
||||
Family: ProLiant
|
||||
14
audit/internal/collector/testdata/dmidecode_type1_supermicro.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_supermicro.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# dmidecode 3.1
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 2.8 present.
|
||||
|
||||
Handle 0x0001, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: Supermicro
|
||||
Product Name: SYS-6028R-WTR
|
||||
Version: 0123456789
|
||||
Serial Number: S214726X2A36789
|
||||
UUID: d3e4f5a6-b7c8-9012-def0-123456789003
|
||||
Wake-up Type: Power Switch
|
||||
SKU Number: Default string
|
||||
Family: Default string
|
||||
10
audit/internal/collector/testdata/dmidecode_type2_dell.txt
vendored
Normal file
10
audit/internal/collector/testdata/dmidecode_type2_dell.txt
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
# dmidecode 3.2
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x0200, DMI type 2, 8 bytes
|
||||
Base Board Information
|
||||
Manufacturer: Dell Inc.
|
||||
Product Name: 0F9N89
|
||||
Version: A00
|
||||
Serial Number: 7SG9F63
|
||||
19
audit/internal/collector/testdata/dmidecode_type2_hpe.txt
vendored
Normal file
19
audit/internal/collector/testdata/dmidecode_type2_hpe.txt
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
# dmidecode 3.3
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x00A4, DMI type 2, 15 bytes
|
||||
Base Board Information
|
||||
Manufacturer: HPE
|
||||
Product Name: ProLiant DL380 Gen10
|
||||
Version: Not Specified
|
||||
Serial Number: CZJ9320CXN
|
||||
Asset Tag: CZJ9320CXN
|
||||
Features:
|
||||
Board is a hosting board
|
||||
Board is removable
|
||||
Board is replaceable
|
||||
Location In Chassis: Not Specified
|
||||
Chassis Handle: 0x0000
|
||||
Type: Motherboard
|
||||
Contained Object Handles: 0
|
||||
18
audit/internal/collector/testdata/dmidecode_type2_supermicro.txt
vendored
Normal file
18
audit/internal/collector/testdata/dmidecode_type2_supermicro.txt
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
# dmidecode 3.1
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 2.8 present.
|
||||
|
||||
Handle 0x0002, DMI type 2, 15 bytes
|
||||
Base Board Information
|
||||
Manufacturer: Supermicro
|
||||
Product Name: X10DRW-i
|
||||
Version: 1.02
|
||||
Serial Number: S214726X2A36789
|
||||
Asset Tag: Default string
|
||||
Features:
|
||||
Board is a hosting board
|
||||
Board is replaceable
|
||||
Location In Chassis: Default string
|
||||
Chassis Handle: 0x0003
|
||||
Type: Motherboard
|
||||
Contained Object Handles: 0
|
||||
@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMDAdmPlatformLicense(t *testing.T) {
|
||||
premium := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.3.0.1138
|
||||
RAID Levels : raid0 raid1 raid5 raid10
|
||||
Total Disks : 4
|
||||
License : Premium
|
||||
`
|
||||
got := parseMDAdmPlatformLicense(premium)
|
||||
if got == nil || *got != "premium" {
|
||||
t.Fatalf("expected 'premium', got %v", got)
|
||||
}
|
||||
|
||||
standard := `Platform : Intel(R) Virtual RAID on CPU
|
||||
License : Standard
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(standard)
|
||||
if got == nil || *got != "standard" {
|
||||
t.Fatalf("expected 'standard', got %v", got)
|
||||
}
|
||||
|
||||
noLicense := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.0.0
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(noLicense)
|
||||
if got != nil {
|
||||
t.Fatalf("expected nil, got %v", *got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasVROCController(t *testing.T) {
|
||||
intel := vendorIntel
|
||||
model := "Volume Management Device NVMe RAID Controller"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
@@ -0,0 +1,735 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Tunables for the benchmark power-source autotune procedure.
const (
	// benchmarkPowerAutotuneVersion is written into saved autotune configs
	// that do not already carry a positive version.
	benchmarkPowerAutotuneVersion = 1

	// benchmarkPowerAutotuneIdleSec and benchmarkPowerAutotuneLoadSec are
	// presumably the idle and load phase lengths in seconds (their use is
	// outside this part of the file; verify against the autotune driver).
	benchmarkPowerAutotuneIdleSec = 60
	benchmarkPowerAutotuneLoadSec = 90

	// benchmarkPowerAutotuneSampleInterval is the sampling period in seconds.
	// It must stay an untyped constant: it is multiplied by time.Second.
	benchmarkPowerAutotuneSampleInterval = 3

	// defaultBenchmarkPowerSourceConfigPath is the config location used when
	// no base directory is supplied.
	defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
)
|
||||
|
||||
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||
baseDir = strings.TrimSpace(baseDir)
|
||||
if baseDir == "" {
|
||||
return defaultBenchmarkPowerSourceConfigPath
|
||||
}
|
||||
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||
}
|
||||
|
||||
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var cfg BenchmarkPowerAutotuneConfig
|
||||
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||
}
|
||||
return &cfg, nil
|
||||
}
|
||||
|
||||
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return fmt.Errorf("empty autotune config path")
|
||||
}
|
||||
if cfg.Version <= 0 {
|
||||
cfg.Version = benchmarkPowerAutotuneVersion
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := json.MarshalIndent(cfg, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, path)
|
||||
}
|
||||
|
||||
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||
}
|
||||
|
||||
// ResetBenchmarkPowerAutotuneConfig deletes the autotune config at path.
//
// A blank path is an error. A file that does not exist counts as success;
// any other removal error is returned unchanged.
func ResetBenchmarkPowerAutotuneConfig(path string) error {
	if strings.TrimSpace(path) == "" {
		return fmt.Errorf("empty autotune config path")
	}
	removeErr := os.Remove(path)
	if removeErr == nil || os.IsNotExist(removeErr) {
		return nil
	}
	return removeErr
}
|
||||
|
||||
func normalizeBenchmarkPowerSource(source string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||
case BenchmarkPowerSourceSDRPSUInput:
|
||||
return BenchmarkPowerSourceSDRPSUInput
|
||||
default:
|
||||
return BenchmarkPowerSourceDCMI
|
||||
}
|
||||
}
|
||||
|
||||
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
|
||||
cfg, err := LoadSystemPowerSourceConfig(exportDir)
|
||||
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
|
||||
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: true,
|
||||
SelectedSource: selected,
|
||||
EffectiveSource: selected,
|
||||
Mode: "autotuned",
|
||||
Reason: strings.TrimSpace(cfg.Reason),
|
||||
ConfiguredAt: cfg.UpdatedAt,
|
||||
}
|
||||
}
|
||||
|
||||
sources := sampleBenchmarkPowerSources()
|
||||
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: false,
|
||||
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
|
||||
Mode: "fallback",
|
||||
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
|
||||
}
|
||||
}
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: false,
|
||||
EffectiveSource: BenchmarkPowerSourceDCMI,
|
||||
Mode: "fallback",
|
||||
Reason: "autotune config not found; using temporary fallback source dcmi",
|
||||
}
|
||||
}
|
||||
|
||||
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||
decision := ResolveSystemPowerDecision(exportDir)
|
||||
if decision.EffectiveSource != "" {
|
||||
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||
return value, decision, nil
|
||||
} else if decision.Configured {
|
||||
fallback := BenchmarkPowerSourceDCMI
|
||||
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||
}
|
||||
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||
decision.Mode = "degraded"
|
||||
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||
decision.EffectiveSource = fallback
|
||||
return fallbackValue, decision, nil
|
||||
}
|
||||
decision.Mode = "degraded"
|
||||
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||
return 0, decision, err
|
||||
}
|
||||
}
|
||||
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||
}
|
||||
|
||||
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||
switch normalizeBenchmarkPowerSource(source) {
|
||||
case BenchmarkPowerSourceSDRPSUInput:
|
||||
sdr := sampleIPMISDRPowerSensors()
|
||||
if sdr.PSUInW > 0 {
|
||||
return sdr.PSUInW, nil
|
||||
}
|
||||
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||
default:
|
||||
return queryIPMIServerPowerW()
|
||||
}
|
||||
}
|
||||
|
||||
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||
out := map[string]float64{}
|
||||
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||
out[BenchmarkPowerSourceDCMI] = w
|
||||
}
|
||||
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||
if durationSec <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
|
||||
if durationSec <= 0 {
|
||||
return nil
|
||||
}
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||
}
|
||||
close(stopCh)
|
||||
return <-doneCh
|
||||
}
|
||||
|
||||
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
|
||||
if intervalSec <= 0 {
|
||||
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||
}
|
||||
ch := make(chan []float64, 1)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
var samples []float64
|
||||
record := func() {
|
||||
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
|
||||
samples = append(samples, w)
|
||||
}
|
||||
}
|
||||
record()
|
||||
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
ch <- samples
|
||||
return
|
||||
case <-ticker.C:
|
||||
record()
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
// benchmarkPowerAutotuneSample is one telemetry row captured while autotuning
// the system power source.
type benchmarkPowerAutotuneSample struct {
	// ElapsedSec is the time, in seconds, since sampling of the phase began.
	ElapsedSec float64
	// GPUAvgUsagePct is the mean utilization across the sampled GPUs.
	GPUAvgUsagePct float64
	// CPUUsagePct is the CPU load at sampling time.
	CPUUsagePct float64
	// GPUSumPowerW is the summed power draw of the sampled GPUs.
	GPUSumPowerW float64
	// Sources maps a power source name to its positive reading; a source
	// that did not respond has no entry.
	Sources map[string]float64
}
|
||||
|
||||
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
|
||||
if durationSec <= 0 {
|
||||
return nil
|
||||
}
|
||||
var out []benchmarkPowerAutotuneSample
|
||||
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||
start := time.Now()
|
||||
for {
|
||||
if ctx.Err() != nil {
|
||||
return out
|
||||
}
|
||||
row := benchmarkPowerAutotuneSample{
|
||||
ElapsedSec: time.Since(start).Seconds(),
|
||||
CPUUsagePct: sampleCPULoadPct(),
|
||||
Sources: sampleBenchmarkPowerSources(),
|
||||
}
|
||||
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
|
||||
var usageSum float64
|
||||
for _, gpu := range gpuRows {
|
||||
row.GPUSumPowerW += gpu.PowerW
|
||||
usageSum += gpu.UsagePct
|
||||
}
|
||||
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
|
||||
}
|
||||
out = append(out, row)
|
||||
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
|
||||
if time.Now().After(deadline) {
|
||||
return out
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return out
|
||||
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||
if logFunc == nil {
|
||||
return
|
||||
}
|
||||
var sourceParts []string
|
||||
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||
} else {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||
}
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||
phase,
|
||||
sample.ElapsedSec,
|
||||
sample.GPUAvgUsagePct,
|
||||
sample.GPUSumPowerW,
|
||||
sample.CPUUsagePct,
|
||||
strings.Join(sourceParts, " "),
|
||||
))
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||
if logFunc == nil || len(samples) == 0 {
|
||||
return
|
||||
}
|
||||
var gpuUsage []float64
|
||||
var cpuUsage []float64
|
||||
var gpuPower []float64
|
||||
sourceBuckets := map[string][]float64{}
|
||||
for _, sample := range samples {
|
||||
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
var sourceParts []string
|
||||
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||
values := sourceBuckets[source]
|
||||
if len(values) == 0 {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||
continue
|
||||
}
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||
phase,
|
||||
len(samples),
|
||||
benchmarkMean(gpuUsage),
|
||||
benchmarkPercentile(gpuUsage, 95),
|
||||
benchmarkMean(gpuPower),
|
||||
benchmarkMean(cpuUsage),
|
||||
benchmarkPercentile(cpuUsage, 95),
|
||||
strings.Join(sourceParts, " "),
|
||||
))
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||
if logFunc == nil {
|
||||
return
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
if !candidate.Available {
|
||||
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||
continue
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||
candidate.Source,
|
||||
candidate.IdleAvgW,
|
||||
candidate.LoadAvgW,
|
||||
candidate.DeltaW,
|
||||
gpuDelta,
|
||||
candidate.RelativeError,
|
||||
candidate.Confidence*100,
|
||||
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||
))
|
||||
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
|
||||
result := &BenchmarkPowerAutotuneValidation{}
|
||||
if len(samples) == 0 {
|
||||
result.Reason = "no idle telemetry samples collected"
|
||||
return result
|
||||
}
|
||||
var gpuUsage []float64
|
||||
var cpuUsage []float64
|
||||
for _, sample := range samples {
|
||||
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||
if sample.CPUUsagePct > 0 {
|
||||
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||
}
|
||||
}
|
||||
result.GPUSamples = len(gpuUsage)
|
||||
result.CPUSamples = len(cpuUsage)
|
||||
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
|
||||
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
|
||||
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
|
||||
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
|
||||
switch {
|
||||
case result.GPUAvgUsagePct > 5:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
|
||||
case result.GPUP95UsagePct > 10:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
|
||||
case result.CPUAvgUsagePct > 20:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
|
||||
case result.CPUP95UsagePct > 35:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
|
||||
default:
|
||||
result.Valid = true
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
|
||||
idleBySource := map[string][]float64{}
|
||||
loadBySource := map[string][]float64{}
|
||||
var idleGPU []float64
|
||||
var loadGPU []float64
|
||||
for _, sample := range idle {
|
||||
idleGPU = append(idleGPU, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
idleBySource[source] = append(idleBySource[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, sample := range load {
|
||||
loadGPU = append(loadGPU, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
loadBySource[source] = append(loadBySource[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
idleGPUAvg := benchmarkMean(idleGPU)
|
||||
loadGPUAvg := benchmarkMean(loadGPU)
|
||||
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||
if gpuDelta <= 0 {
|
||||
gpuDelta = loadGPUAvg
|
||||
}
|
||||
|
||||
candidates := []BenchmarkPowerAutotuneCandidate{
|
||||
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
|
||||
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
|
||||
}
|
||||
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
|
||||
for _, candidate := range candidates {
|
||||
if candidate.Available && candidate.DeltaW > 0 {
|
||||
available = append(available, candidate)
|
||||
}
|
||||
}
|
||||
if len(available) == 0 {
|
||||
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
|
||||
}
|
||||
sort.Slice(available, func(i, j int) bool {
|
||||
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
|
||||
if available[i].Source != available[j].Source {
|
||||
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
|
||||
}
|
||||
}
|
||||
if available[i].RelativeError != available[j].RelativeError {
|
||||
return available[i].RelativeError < available[j].RelativeError
|
||||
}
|
||||
return available[i].Samples > available[j].Samples
|
||||
})
|
||||
selected := available[0]
|
||||
for idx := range candidates {
|
||||
if candidates[idx].Source == selected.Source {
|
||||
candidates[idx].Selected = true
|
||||
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
|
||||
}
|
||||
}
|
||||
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
|
||||
}
|
||||
|
||||
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||
candidate := BenchmarkPowerAutotuneCandidate{
|
||||
Source: source,
|
||||
Available: len(idle) > 0 && len(load) > 0,
|
||||
Samples: minInt(len(idle), len(load)),
|
||||
}
|
||||
if !candidate.Available {
|
||||
return candidate
|
||||
}
|
||||
candidate.IdleAvgW = benchmarkMean(idle)
|
||||
candidate.LoadAvgW = benchmarkMean(load)
|
||||
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||
if gpuDelta > 0 {
|
||||
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||
}
|
||||
return candidate
|
||||
}
|
||||
|
||||
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "status=%s\n", result.Status)
|
||||
fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
|
||||
fmt.Fprintf(&b, "profile=%s\n", result.Profile)
|
||||
fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
|
||||
fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
|
||||
fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
|
||||
if result.SelectedSource != "" {
|
||||
fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
|
||||
}
|
||||
if result.IdleValidation != nil {
|
||||
fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
|
||||
fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
|
||||
fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
|
||||
if result.IdleValidation.Reason != "" {
|
||||
fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
|
||||
}
|
||||
}
|
||||
for _, candidate := range result.Candidates {
|
||||
fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
|
||||
if candidate.Available {
|
||||
fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
|
||||
fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
|
||||
fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
|
||||
fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
|
||||
var b strings.Builder
|
||||
b.WriteString("# Bee Bench Power Source Autotune\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
|
||||
fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
|
||||
fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
|
||||
fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
|
||||
fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
|
||||
if result.SelectedSource != "" {
|
||||
fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
if result.IdleValidation != nil {
|
||||
b.WriteString("## Idle Validation\n\n")
|
||||
fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
|
||||
fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
|
||||
fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
|
||||
if result.IdleValidation.Reason != "" {
|
||||
fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
if len(result.Candidates) > 0 {
|
||||
b.WriteString("## Candidates\n\n")
|
||||
b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
|
||||
b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
|
||||
for _, candidate := range result.Candidates {
|
||||
if !candidate.Available {
|
||||
fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
|
||||
continue
|
||||
}
|
||||
selected := "no"
|
||||
if candidate.Selected {
|
||||
selected = "yes"
|
||||
}
|
||||
fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
|
||||
candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
|
||||
allDevices := joinIndexList(gpuIndices)
|
||||
switch strings.TrimSpace(strings.ToLower(kind)) {
|
||||
case "power-fit", "power", "nvidia-bench-power":
|
||||
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
|
||||
if err == nil {
|
||||
return cmd, "power-fit"
|
||||
}
|
||||
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
|
||||
default:
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", fmt.Sprintf("%d", durationSec),
|
||||
"--devices", allDevices,
|
||||
}
|
||||
if sizeMB > 0 {
|
||||
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
|
||||
}
|
||||
return cmd, "performance"
|
||||
}
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
if logFunc == nil {
|
||||
logFunc = func(string) {}
|
||||
}
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = "/var/log/bee-bench/autotune"
|
||||
}
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||
}
|
||||
selected, err := resolveNvidiaGPUSelection(nil, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
|
||||
}
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "autotune-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
hostname, _ := os.Hostname()
|
||||
loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
|
||||
result := BenchmarkPowerAutotuneResult{
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Hostname: hostname,
|
||||
ServerModel: readServerModel(),
|
||||
BenchmarkKind: normalizedKind,
|
||||
Profile: opts.Profile,
|
||||
Status: "FAILED",
|
||||
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
|
||||
idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
|
||||
logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
|
||||
result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
|
||||
if result.IdleValidation == nil || !result.IdleValidation.Valid {
|
||||
if result.IdleValidation != nil {
|
||||
result.IdleValidationError = result.IdleValidation.Reason
|
||||
logFunc(result.IdleValidation.Reason)
|
||||
}
|
||||
result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, fmt.Errorf("%s", result.IdleValidationError)
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
|
||||
loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
|
||||
go func() {
|
||||
loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
|
||||
}()
|
||||
out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
|
||||
loadSamples := <-loadSamplesCh
|
||||
logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
|
||||
if runErr != nil {
|
||||
result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, fmt.Errorf("autotune load stage: %w", runErr)
|
||||
}
|
||||
|
||||
selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
|
||||
result.Candidates = candidates
|
||||
result.GPUPowerIdleW = idleGPUAvg
|
||||
result.GPUPowerLoadW = loadGPUAvg
|
||||
if chooseErr != nil {
|
||||
result.Notes = append(result.Notes, chooseErr.Error())
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, chooseErr
|
||||
}
|
||||
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||
if gpuDelta <= 0 {
|
||||
gpuDelta = loadGPUAvg
|
||||
}
|
||||
logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
|
||||
result.SelectedSource = selectedSource
|
||||
result.Status = "OK"
|
||||
var confidence float64
|
||||
selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
|
||||
for _, candidate := range candidates {
|
||||
if candidate.Selected {
|
||||
confidence = candidate.Confidence
|
||||
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||
selectionReason = candidate.SelectionNotes
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
cfg := BenchmarkPowerAutotuneConfig{
|
||||
Version: benchmarkPowerAutotuneVersion,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
SelectedSource: selectedSource,
|
||||
BenchmarkKind: normalizedKind,
|
||||
Profile: opts.Profile,
|
||||
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||
Confidence: confidence,
|
||||
Reason: selectionReason,
|
||||
}
|
||||
result.Config = &cfg
|
||||
configPath := BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
|
||||
result.Status = "FAILED"
|
||||
result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
|
||||
if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
return runDir, err
|
||||
}
|
||||
logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
|
||||
result.Notes = append(result.Notes, "saved autotune config to "+configPath)
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
|
||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal autotune result: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||||
return fmt.Errorf("write autotune result.json: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
|
||||
return fmt.Errorf("write autotune summary.txt: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
|
||||
return fmt.Errorf("write autotune report.md: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||
|
||||
// Blank-identifier reference to exec.ErrNotFound; the value itself is discarded.
// NOTE(review): presumably this only keeps the exec package import in use when
// nothing else in the file references it — confirm before removing.
var _ = exec.ErrNotFound
|
||||
@@ -48,7 +48,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||
}
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
if result.RampStep > 0 && result.RampTotal > 0 {
|
||||
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||||
@@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
if result.ScalabilityScore > 0 {
|
||||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||
}
|
||||
if result.PlatformPowerScore > 0 {
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||||
}
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
b.WriteString("\n")
|
||||
|
||||
@@ -81,69 +84,164 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Methodology ───────────────────────────────────────────────────────────
|
||||
b.WriteString("## Methodology\n\n")
|
||||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
|
||||
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
|
||||
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
|
||||
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
|
||||
b.WriteString("**Compute score** is derived from two phases:\n\n")
|
||||
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
|
||||
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
|
||||
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
|
||||
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
|
||||
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
|
||||
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
|
||||
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
|
||||
b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
|
||||
b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
|
||||
b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
|
||||
// ── Balanced Scorecard ────────────────────────────────────────────────────
|
||||
b.WriteString("## Balanced Scorecard\n\n")
|
||||
|
||||
// ── Scorecard table ───────────────────────────────────────────────────────
|
||||
b.WriteString("## Scorecard\n\n")
|
||||
b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||||
b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown GPU"
|
||||
// Perspective 1: Compatibility — hard stops
|
||||
b.WriteString("### 1. Compatibility\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
thermalThrottle := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
fanAtThrottle := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
ecc := "-"
|
||||
if gpu.ECC.Uncorrected > 0 {
|
||||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||
}
|
||||
compatStatus := "✓ OK"
|
||||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||
compatStatus = "⛔ HARD STOP"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
|
||||
}
|
||||
interconnect := "-"
|
||||
if gpu.Scores.InterconnectScore > 0 {
|
||||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
synthetic := "-"
|
||||
if gpu.Scores.SyntheticScore > 0 {
|
||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||
}
|
||||
mixed := "-"
|
||||
if gpu.Scores.MixedScore > 0 {
|
||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||
}
|
||||
mixedEff := "-"
|
||||
if gpu.Scores.MixedEfficiency > 0 {
|
||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
|
||||
gpu.Index, name,
|
||||
gpu.Status,
|
||||
gpu.Scores.CompositeScore,
|
||||
gpu.Scores.ComputeScore,
|
||||
synthetic,
|
||||
mixed,
|
||||
mixedEff,
|
||||
topsPerSM,
|
||||
gpu.Scores.PowerSustainScore,
|
||||
gpu.Scores.ThermalSustainScore,
|
||||
gpu.Scores.StabilityScore,
|
||||
interconnect,
|
||||
)
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 2: Thermal headroom
|
||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
}
|
||||
slowdownTemp := gpu.SlowdownTempC
|
||||
if slowdownTemp <= 0 {
|
||||
slowdownTemp = 80
|
||||
}
|
||||
headroom := gpu.Scores.TempHeadroomC
|
||||
thermalStatus := "✓ OK"
|
||||
switch {
|
||||
case headroom < 10:
|
||||
thermalStatus = "⛔ CRITICAL"
|
||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||
thermalStatus = "⚠ WARNING"
|
||||
}
|
||||
throttlePct := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
|
||||
fmt.Sprintf("%.0f°C", slowdownTemp),
|
||||
fmt.Sprintf("%.0f°C", shutdownTemp),
|
||||
fmt.Sprintf("%.1f°C", headroom),
|
||||
throttlePct,
|
||||
thermalStatus,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 3: Power delivery
|
||||
b.WriteString("### 3. Power Delivery\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
powerCap := "-"
|
||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||
}
|
||||
fanDuty := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
powerStatus := "✓ OK"
|
||||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||
powerStatus = "⚠ POWER LIMITED"
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
powerCap,
|
||||
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
|
||||
fanDuty,
|
||||
powerStatus,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 4: Performance
|
||||
b.WriteString("### 4. Performance\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
synthetic := "-"
|
||||
if gpu.Scores.SyntheticScore > 0 {
|
||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||
}
|
||||
mixed := "-"
|
||||
if gpu.Scores.MixedScore > 0 {
|
||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||
}
|
||||
mixedEff := "-"
|
||||
if gpu.Scores.MixedEfficiency > 0 {
|
||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
|
||||
synthetic, mixed, mixedEff, topsPerSM,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 5: Anomaly flags
|
||||
b.WriteString("### 5. Anomalies\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
eccCorr := "-"
|
||||
if gpu.ECC.Corrected > 0 {
|
||||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||
}
|
||||
syncBoost := "-"
|
||||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||
}
|
||||
powerVar := "OK"
|
||||
if gpu.Scores.PowerSustainScore < 70 {
|
||||
powerVar = "⚠ unstable"
|
||||
}
|
||||
thermalVar := "OK"
|
||||
if gpu.Scores.ThermalSustainScore < 70 {
|
||||
thermalVar = "⚠ unstable"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||
b.WriteString("## Per-GPU Details\n\n")
|
||||
@@ -170,25 +268,43 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
if gpu.PowerLimitW > 0 {
|
||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||
}
|
||||
if gpu.PowerLimitDerated {
|
||||
fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
|
||||
}
|
||||
if gpu.CalibratedPeakPowerW > 0 {
|
||||
if gpu.CalibratedPeakTempC > 0 {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||||
} else {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||||
}
|
||||
}
|
||||
if gpu.LockedGraphicsClockMHz > 0 {
|
||||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Steady-state telemetry
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||
b.WriteString("\n")
|
||||
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"", "Avg", "P95"},
|
||||
[][]string{
|
||||
{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
|
||||
{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
|
||||
{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
|
||||
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
|
||||
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||||
}
|
||||
|
||||
// Per-precision stability phases.
|
||||
if len(gpu.PrecisionSteady) > 0 {
|
||||
b.WriteString("**Per-precision stability:**\n\n")
|
||||
b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n")
|
||||
var precRows [][]string
|
||||
for _, p := range gpu.PrecisionSteady {
|
||||
eccCorr := "—"
|
||||
eccUncorr := "—"
|
||||
@@ -196,10 +312,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||||
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||||
}
|
||||
fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
||||
p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
||||
eccCorr, eccUncorr)
|
||||
status := p.Status
|
||||
if strings.TrimSpace(status) == "" {
|
||||
status = "OK"
|
||||
}
|
||||
precRows = append(precRows, []string{
|
||||
p.Precision, status,
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
|
||||
eccCorr, eccUncorr,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
// Legacy: show combined-window variance.
|
||||
@@ -222,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
// Precision results
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
b.WriteString("**Precision results:**\n\n")
|
||||
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
|
||||
var presRows [][]string
|
||||
for _, p := range gpu.PrecisionResults {
|
||||
if p.Supported {
|
||||
weightStr := fmt.Sprintf("×%.3g", p.Weight)
|
||||
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
|
||||
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
|
||||
presRows = append(presRows, []string{
|
||||
p.Name,
|
||||
fmt.Sprintf("%.2f", p.TeraOpsPerSec),
|
||||
fmt.Sprintf("×%.3g", p.Weight),
|
||||
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
|
||||
fmt.Sprintf("%d", p.Lanes),
|
||||
fmt.Sprintf("%d", p.Iterations),
|
||||
})
|
||||
} else {
|
||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
|
||||
presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
|
||||
}
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
@@ -253,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||
if result.Interconnect.Supported {
|
||||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Avg", "Max"},
|
||||
[][]string{
|
||||
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
|
||||
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Interconnect.Notes {
|
||||
@@ -266,20 +401,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||||
// ── Server Power ───────────────────────────────────────────────────────────
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server Power (IPMI)\n\n")
|
||||
title := "## Server Power\n\n"
|
||||
if sp.Source != "" {
|
||||
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
|
||||
}
|
||||
b.WriteString(title)
|
||||
if !sp.Available {
|
||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||
b.WriteString("Server power measurement unavailable.\n\n")
|
||||
} else {
|
||||
b.WriteString("| | Value |\n|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||||
spRows := [][]string{
|
||||
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||
{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
|
||||
{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
|
||||
{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
|
||||
}
|
||||
if sp.ReportingRatio > 0 {
|
||||
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
@@ -290,19 +431,33 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
}
|
||||
}
|
||||
|
||||
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||
if len(result.PSUIssues) > 0 {
|
||||
b.WriteString("## PSU Issues\n\n")
|
||||
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
|
||||
for _, issue := range result.PSUIssues {
|
||||
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||
if cooling := result.Cooling; cooling != nil {
|
||||
b.WriteString("## Cooling\n\n")
|
||||
if cooling.Available {
|
||||
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
||||
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
||||
dutyAvg, dutyP95 := "N/A", "N/A"
|
||||
if cooling.FanDutyCycleAvailable {
|
||||
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
||||
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
||||
} else {
|
||||
b.WriteString("| Average fan duty cycle | N/A |\n")
|
||||
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
||||
dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
|
||||
dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Value"},
|
||||
[][]string{
|
||||
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
|
||||
{"Average fan duty cycle", dutyAvg},
|
||||
{"P95 fan duty cycle", dutyP95},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||
@@ -315,6 +470,23 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Platform Scalability ──────────────────────────────────────────────────
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||
var scalRows [][]string
|
||||
for _, step := range result.PerformanceRampSteps {
|
||||
scalRows = append(scalRows, []string{
|
||||
fmt.Sprintf("%d", step.StepIndex),
|
||||
joinIndexList(step.GPUIndices),
|
||||
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
|
||||
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||
b.WriteString("## Raw Files\n\n")
|
||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||
@@ -364,6 +536,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64)
|
||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||
|
||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// fmtMDTable renders a markdown table with column widths padded so the table
// is readable as plain text without a markdown renderer.
//
// headers contains the column header strings; when it is empty the result is
// the empty string.
// rows contains the data rows. A row with fewer cells than headers is padded
// with empty cells; cells beyond len(headers) are ignored.
//
// Column widths are measured in runes, not bytes, so cells containing
// multi-byte characters such as "—", "×" or "−" line up with ASCII-only cells.
// Rune count is still only an approximation of display width: double-width or
// combining characters are not accounted for.
func fmtMDTable(headers []string, rows [][]string) string {
	ncols := len(headers)
	if ncols == 0 {
		return ""
	}

	// runeLen is the padding unit. len(s) would count UTF-8 bytes and leave
	// rows containing non-ASCII text short of padding.
	runeLen := func(s string) int { return len([]rune(s)) }

	// cellAt returns the i-th cell of row, or "" when the row is too short.
	cellAt := func(row []string, i int) string {
		if i < len(row) {
			return row[i]
		}
		return ""
	}

	// Compute max width per column across the header and every data row.
	widths := make([]int, ncols)
	for i, h := range headers {
		widths[i] = runeLen(h)
	}
	for _, row := range rows {
		for i := range widths {
			if w := runeLen(cellAt(row, i)); w > widths[i] {
				widths[i] = w
			}
		}
	}

	var b strings.Builder

	// writeRow emits one "| a | b |" line, padding each cell to its column width.
	writeRow := func(row []string) {
		b.WriteByte('|')
		for i, w := range widths {
			cell := cellAt(row, i)
			b.WriteByte(' ')
			b.WriteString(cell)
			b.WriteString(strings.Repeat(" ", w-runeLen(cell)))
			b.WriteString(" |")
		}
		b.WriteByte('\n')
	}

	// Header row.
	writeRow(headers)

	// Separator row: dashes span the cell plus its two padding spaces.
	b.WriteByte('|')
	for _, w := range widths {
		b.WriteString(strings.Repeat("-", w+2))
		b.WriteByte('|')
	}
	b.WriteByte('\n')

	// Data rows.
	for _, row := range rows {
		writeRow(row)
	}

	return b.String()
}
|
||||
@@ -1,8 +1,13 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
@@ -16,17 +21,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
{
|
||||
name: "default",
|
||||
profile: "",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "stability",
|
||||
profile: "stability",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "overnight",
|
||||
profile: "overnight",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -41,6 +46,222 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if len(labels) != 5 || len(phases) != 5 {
|
||||
t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
|
||||
}
|
||||
if basePhaseSec != 60 {
|
||||
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 300 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
|
||||
}
|
||||
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
|
||||
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 300 {
|
||||
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 3600 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 3600 {
|
||||
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 14400 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
phases := []benchmarkPlannedPhase{
|
||||
{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
|
||||
{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
|
||||
{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
|
||||
}
|
||||
rows := []GPUMetricRow{
|
||||
{ElapsedSec: 5},
|
||||
{ElapsedSec: 15},
|
||||
{ElapsedSec: 25},
|
||||
{ElapsedSec: 65},
|
||||
}
|
||||
got := splitBenchmarkRowsByPlannedPhase(rows, phases)
|
||||
if len(got["fp8"]) != 1 {
|
||||
t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
|
||||
}
|
||||
if len(got["fp16"]) != 1 {
|
||||
t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
|
||||
}
|
||||
if len(got["mixed"]) != 2 {
|
||||
t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
raw string
|
||||
wantStatus string
|
||||
}{
|
||||
{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
|
||||
{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
|
||||
{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
|
||||
if got != tc.wantStatus {
|
||||
t.Fatalf("status=%q want %q", got, tc.wantStatus)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
before := BenchmarkThrottleCounters{}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 1000 }
|
||||
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||
t.Fatal("unexpected reset call")
|
||||
return "", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
var logs []string
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||
logs = append(logs, line)
|
||||
})
|
||||
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||
t.Fatalf("logs=%q want substring %q", got, want)
|
||||
}
|
||||
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||
t.Fatalf("failed=%v want [0 2]", failed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
var calls []int
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
calls = append(calls, index)
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if len(failed) != 0 {
|
||||
t.Fatalf("failed=%v want no failures", failed)
|
||||
}
|
||||
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||
t.Fatalf("calls=%v want %s", calls, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
if index == 5 {
|
||||
return "busy\n", exec.ErrNotFound
|
||||
}
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||
t.Fatalf("failed=%v want %s", failed, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -56,6 +277,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
info benchmarkGPUInfo
|
||||
want int
|
||||
}{
|
||||
{
|
||||
name: "prefers default tdp over current derated limit",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 500,
|
||||
DefaultPowerLimitW: 600,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
want: 600,
|
||||
},
|
||||
{
|
||||
name: "caps default tdp to reported max limit",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 500,
|
||||
DefaultPowerLimitW: 700,
|
||||
MaxPowerLimitW: 650,
|
||||
},
|
||||
want: 650,
|
||||
},
|
||||
{
|
||||
name: "falls back to current limit when default missing",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 525,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
want: 525,
|
||||
},
|
||||
{
|
||||
name: "falls back to max limit when only that is known",
|
||||
info: benchmarkGPUInfo{
|
||||
MaxPowerLimitW: 575,
|
||||
},
|
||||
want: 575,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -65,8 +339,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
"[gpu 0] compute_capability=9.0",
|
||||
"[gpu 0] backend=cublasLt",
|
||||
"[gpu 0] duration_s=10",
|
||||
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
|
||||
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||
"[gpu 0] int8_tensor_iterations=80",
|
||||
"[gpu 0] fp16_tensor_iterations=200",
|
||||
"[gpu 0] fp8_e4m3_iterations=50",
|
||||
"[gpu 0] status=OK",
|
||||
@@ -79,15 +355,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
if got.ComputeCapability != "9.0" {
|
||||
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||
}
|
||||
if len(got.Profiles) != 2 {
|
||||
t.Fatalf("profiles=%d want 2", len(got.Profiles))
|
||||
if len(got.Profiles) != 3 {
|
||||
t.Fatalf("profiles=%d want 3", len(got.Profiles))
|
||||
}
|
||||
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||
}
|
||||
if got.Profiles[0].Category != "fp16_bf16" {
|
||||
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
|
||||
}
|
||||
if got.Profiles[1].Category != "fp8" {
|
||||
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||
}
|
||||
if got.Profiles[2].Category != "int8" {
|
||||
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
|
||||
}
|
||||
if got.Profiles[2].Weight != 0.25 {
|
||||
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
@@ -180,12 +465,40 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
||||
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
|
||||
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
|
||||
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
|
||||
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
|
||||
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
PrecisionResults: []BenchmarkPrecisionResult{
|
||||
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
|
||||
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
})
|
||||
|
||||
if score.SyntheticScore != 100 {
|
||||
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
|
||||
}
|
||||
if score.MixedScore != 50 {
|
||||
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Min Power Limit : 200.00 W
|
||||
Max Power Limit : 600.00 W
|
||||
Default Power Limit : 575.00 W
|
||||
Current Power Limit : 560.00 W
|
||||
Clocks
|
||||
Graphics : 2422 MHz
|
||||
Memory : 12481 MHz
|
||||
@@ -207,7 +520,7 @@ GPU 00000000:4F:00.0
|
||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
@@ -221,25 +534,49 @@ GPU 00000000:4F:00.0
|
||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].PowerLimitW != 560 {
|
||||
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
||||
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Min Power Limit : 100.00 W
|
||||
Max Power Limit : 900.00 W
|
||||
Max Clocks
|
||||
Graphics : 9999 MHz
|
||||
Memory : 9999 MHz
|
||||
`)
|
||||
// Already populated — must not be overwritten.
|
||||
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
||||
0: {
|
||||
Index: 0,
|
||||
BusID: "00000000:4E:00.0",
|
||||
MaxGraphicsClockMHz: 2430,
|
||||
MaxMemoryClockMHz: 12481,
|
||||
MinPowerLimitW: 200,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
|
||||
Available bool `json:"available"`
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
@@ -42,40 +43,151 @@ const (
|
||||
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||
)
|
||||
|
||||
// String identifiers for the power benchmark engines.
// NOTE(review): where these values are consumed is not visible in this chunk.
const (
	// BenchmarkPowerEngineDCGMProfTester is the "dcgmproftester" engine name.
	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"

	// BenchmarkPowerEngineTargetedPower is the "targeted_power" engine name.
	BenchmarkPowerEngineTargetedPower = "targeted_power"
)
|
||||
|
||||
// Estimated wall-clock durations, in seconds, for benchmark runs. The values
// are derived from real _v8 logs.
//
// Rule: whenever the profile phase durations in resolveBenchmarkProfile()
// change, re-measure from actual task logs and update these constants.
//
// Sources:
//   - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
//   - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
//   - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
//   - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
//   - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
const (
	// Performance Benchmark (bee-gpu-burn).
	// Each duration covers one full ramp-up run (ramp 1→N) or one single
	// parallel run. Sequential per-GPU mode scales approximately linearly.

	// ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s.
	BenchmarkEstimatedPerfStandardSec = 960
	// ~92 min; ramp-up 1-8 measured.
	BenchmarkEstimatedPerfStabilitySec = 5532
	BenchmarkEstimatedPerfOvernightSec = 8 * 3600

	// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
	// Each duration covers the full ramp-up run; individual steps vary with
	// convergence speed.

	// ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s.
	BenchmarkEstimatedPowerStandardSec = 2600
	// ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts.
	BenchmarkEstimatedPowerStabilitySec = 5400
	BenchmarkEstimatedPowerOvernightSec = 3 * 3600
)
|
||||
|
||||
// NvidiaBenchmarkOptions carries the caller-selected settings for an NVIDIA
// benchmark run: profile, GPU selection, NCCL toggle, server power source and
// ramp-up bookkeeping.
type NvidiaBenchmarkOptions struct {
	Profile           string
	SizeMB            int
	GPUIndices        []int
	ExcludeGPUIndices []int
	RunNCCL           bool
	ServerPowerSource string

	// ParallelGPUs runs all selected GPUs simultaneously instead of sequentially.
	ParallelGPUs bool

	// RampStep is the 1-based step index within a ramp-up run (0 = not a ramp-up).
	RampStep int

	// RampTotal is the total number of ramp-up steps in this run.
	RampTotal int

	// RampRunID is the identifier shared across all steps of the same ramp-up run.
	RampRunID string
}
|
||||
|
||||
// String identifiers for the server power measurement sources.
// NOTE(review): presumably the values accepted in ServerPowerSource and
// SelectedSource fields; confirm against the callers.
const (
	// BenchmarkPowerSourceDCMI is the "dcmi" source name.
	BenchmarkPowerSourceDCMI = "dcmi"

	// BenchmarkPowerSourceSDRPSUInput is the "sdr_psu_input" source name.
	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
)
|
||||
|
||||
// BenchmarkPowerAutotuneConfig is the JSON-serialisable record of a power
// source autotune outcome: the selected source together with the benchmark
// kind, profile, durations, sample interval, confidence and reason stored
// alongside it. Version, UpdatedAt and SelectedSource are always emitted;
// the remaining fields are omitted from JSON when empty.
type BenchmarkPowerAutotuneConfig struct {
	Version           int       `json:"version"`
	UpdatedAt         time.Time `json:"updated_at"`
	SelectedSource    string    `json:"selected_source"`
	BenchmarkKind     string    `json:"benchmark_kind,omitempty"`
	Profile           string    `json:"profile,omitempty"`
	IdleDurationSec   int       `json:"idle_duration_sec,omitempty"`
	LoadDurationSec   int       `json:"load_duration_sec,omitempty"`
	SampleIntervalSec int       `json:"sample_interval_sec,omitempty"`
	Confidence        float64   `json:"confidence,omitempty"`
	Reason            string    `json:"reason,omitempty"`
}
|
||||
|
||||
// SystemPowerSourceDecision describes which system power source is in effect:
// whether one is configured, the selected and effective source names, the
// decision mode and reason, and when it was configured.
type SystemPowerSourceDecision struct {
	Configured      bool   `json:"configured"`
	SelectedSource  string `json:"selected_source,omitempty"`
	EffectiveSource string `json:"effective_source,omitempty"`

	// Mode is one of: autotuned, fallback, degraded.
	Mode string `json:"mode,omitempty"`

	Reason       string    `json:"reason,omitempty"`
	ConfiguredAt time.Time `json:"configured_at,omitempty"`
}
|
||||
|
||||
type BenchmarkPowerAutotuneResult struct {
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||
Profile string `json:"profile,omitempty"`
|
||||
Status string `json:"status"`
|
||||
IdleDurationSec int `json:"idle_duration_sec"`
|
||||
LoadDurationSec int `json:"load_duration_sec"`
|
||||
SampleIntervalSec int `json:"sample_interval_sec"`
|
||||
SelectedSource string `json:"selected_source,omitempty"`
|
||||
IdleValidationError string `json:"idle_validation_error,omitempty"`
|
||||
IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
|
||||
GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"`
|
||||
GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"`
|
||||
Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPowerAutotuneValidation holds a validation verdict together with
// the average and P95 GPU and CPU usage percentages, the GPU and CPU sample
// counts, and a textual reason.
// NOTE(review): the thresholds that decide Valid are not visible in this chunk.
type BenchmarkPowerAutotuneValidation struct {
	Valid          bool    `json:"valid"`
	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
	GPUSamples     int     `json:"gpu_samples,omitempty"`
	CPUSamples     int     `json:"cpu_samples,omitempty"`
	Reason         string  `json:"reason,omitempty"`
}
|
||||
|
||||
// BenchmarkPowerAutotuneCandidate records one power source considered by the
// autotune: its idle and loaded average watts, their delta, sample count,
// relative error, confidence, and whether it was available and selected.
// Source and Available are always emitted in JSON; the other fields are
// omitted when empty.
type BenchmarkPowerAutotuneCandidate struct {
	Source         string  `json:"source"`
	IdleAvgW       float64 `json:"idle_avg_w,omitempty"`
	LoadAvgW       float64 `json:"load_avg_w,omitempty"`
	DeltaW         float64 `json:"delta_w,omitempty"`
	Samples        int     `json:"samples,omitempty"`
	RelativeError  float64 `json:"relative_error,omitempty"`
	Confidence     float64 `json:"confidence,omitempty"`
	Selected       bool    `json:"selected,omitempty"`
	Available      bool    `json:"available"`
	SelectionNotes string  `json:"selection_notes,omitempty"`
}
|
||||
|
||||
type NvidiaBenchmarkResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalization struct {
|
||||
@@ -104,13 +216,22 @@ type BenchmarkGPUResult struct {
|
||||
Backend string `json:"backend,omitempty"`
|
||||
Status string `json:"status"`
|
||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||
// Fallback: 80°C.
|
||||
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||
// dcgmi targeted_power calibration run before the main benchmark.
|
||||
// Used as the reference denominator for PowerSustainScore instead of
|
||||
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
|
||||
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
|
||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||
@@ -119,6 +240,7 @@ type BenchmarkGPUResult struct {
|
||||
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
|
||||
PrecisionFailures []string `json:"precision_failures,omitempty"`
|
||||
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||
// ECC error delta accumulated over the full benchmark (all phases combined).
|
||||
@@ -127,6 +249,9 @@ type BenchmarkGPUResult struct {
|
||||
Scores BenchmarkScorecard `json:"scores"`
|
||||
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning is non-empty when a thermal throttle event occurred with
|
||||
// a clock drop ≥20% while server fans were not at 100% duty cycle.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkTelemetrySummary struct {
|
||||
@@ -179,7 +304,7 @@ type BenchmarkPrecisionResult struct {
|
||||
Iterations uint64 `json:"iterations,omitempty"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
// Weight is the fp32-equivalence factor for this precision category.
|
||||
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
|
||||
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
|
||||
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||
Weight float64 `json:"weight,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
@@ -199,25 +324,87 @@ type BenchmarkScorecard struct {
|
||||
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// StabilityScore: fraction of steady-state time the GPU spent throttling
|
||||
// (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0.
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
|
||||
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||
|
||||
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||
// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
|
||||
// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
|
||||
TempHeadroomC float64 `json:"temp_headroom_c"`
|
||||
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
// ServerQualityScore (0–100) reflects server infrastructure quality independent
|
||||
// of GPU model. Combines throttle time, power variance, and temp variance.
|
||||
// Use this to compare servers with the same GPU, or to flag a bad server
|
||||
// that throttles an otherwise fast GPU.
|
||||
ServerQualityScore float64 `json:"server_quality_score"`
|
||||
// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
|
||||
// A throttling GPU will score lower here automatically — no quality multiplier.
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
// so benchmark and audit data can be correlated by slot.
//
// InputW and OutputW are pointers so that a missing reading (nil, omitted from
// JSON) stays distinguishable from a genuine 0 W reading.
type BenchmarkPSUSlotPower struct {
	InputW  *float64 `json:"input_w,omitempty"`  // AC wall input (PSUx_POWER_IN)
	OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
	Status  string   `json:"status,omitempty"`
}
|
||||
|
||||
// BenchmarkServerPower captures server-side power from multiple independent
|
||||
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
|
||||
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
|
||||
// covers only a subset of installed PSUs (partial coverage).
|
||||
//
|
||||
// Source legend:
|
||||
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
|
||||
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||
type BenchmarkServerPower struct {
|
||||
Available bool `json:"available"`
|
||||
IdleW float64 `json:"idle_w,omitempty"`
|
||||
LoadedW float64 `json:"loaded_w,omitempty"`
|
||||
DeltaW float64 `json:"delta_w,omitempty"`
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
Available bool `json:"available"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Mode string `json:"mode,omitempty"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
|
||||
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
|
||||
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
|
||||
|
||||
// PSU DC output sum — power delivered to server internals after conversion.
|
||||
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
|
||||
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
|
||||
|
||||
// Per-slot PSU readings at idle and at peak load.
|
||||
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
|
||||
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
|
||||
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
|
||||
|
||||
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
|
||||
// PCIe slot delivery only (excludes 16-pin connector power).
|
||||
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
|
||||
|
||||
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
|
||||
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
|
||||
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
|
||||
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||
@@ -225,13 +412,15 @@ type BenchmarkServerPower struct {
|
||||
// type runs at a time the PowerCVPct here is a genuine stability signal.
|
||||
type BenchmarkPrecisionSteadyPhase struct {
|
||||
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
|
||||
Status string `json:"status,omitempty"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
// ECC errors accumulated during this precision phase only.
|
||||
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
|
||||
// Any uncorrected = serious fault triggered by this precision workload.
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkInterconnectResult struct {
|
||||
@@ -245,3 +434,103 @@ type BenchmarkInterconnectResult struct {
|
||||
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||
// this server under full GPU load. Use for rack power planning.
|
||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||
// actual wall-power draw as seen by the server's power supply.
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchGPU struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||
// stably with all other GPUs running simultaneously at their own limits.
|
||||
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||
// additional derating.
|
||||
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// Telemetry holds the aggregated stats from the final converged calibration
|
||||
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||
// Fan state sampled at the end of single-card calibration.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||
NewGPUIndex int `json:"new_gpu_index"`
|
||||
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// PSU slot readings sampled at end of this ramp step.
|
||||
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||
// Fan state at end of this ramp step.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPerformanceRampStep holds per-step performance data for the
// scalability ramp-up phase of the performance benchmark.
type NvidiaPerformanceRampStep struct {
	// StepIndex and GPUIndices identify the step and the GPUs active in it.
	StepIndex  int   `json:"step_index"`
	GPUIndices []int `json:"gpu_indices"`
	// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
	// TOPS from dedicated single-precision phases) across all GPUs in this step.
	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
	TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`
	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
	// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
	ScalabilityPct float64  `json:"scalability_pct"`
	Status         string   `json:"status"`
	Notes          []string `json:"notes,omitempty"`
}
|
||||
|
||||
@@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
// PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL.
|
||||
// Severity is warning (not critical): correctable errors are hardware-recovered.
|
||||
{
|
||||
Name: "nvidia-aer-correctable",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*[Cc]orrect`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
@@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
// PCIe AER correctable from the root port — captures the reported device BDF
|
||||
// (second BDF in "pcieport X: AER: Correctable error received: Y").
|
||||
{
|
||||
Name: "pcie-aer-correctable",
|
||||
Re: mustPat(`(?i)pcieport.*AER:.*[Cc]orrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
|
||||
@@ -14,6 +14,8 @@ import (
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||
type GPUMetricRow struct {
|
||||
Stage string `json:"stage,omitempty"`
|
||||
StageStartSec float64 `json:"stage_start_sec,omitempty"`
|
||||
StageEndSec float64 `json:"stage_end_sec,omitempty"`
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
@@ -25,6 +27,7 @@ type GPUMetricRow struct {
|
||||
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
@@ -145,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||
for _, r := range rows {
|
||||
dutyAvail := 0
|
||||
if r.FanDutyCycleAvailable {
|
||||
dutyAvail = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
|
||||
dutyEstimated := 0
|
||||
if r.FanDutyCycleEstimated {
|
||||
dutyEstimated = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||
}
|
||||
return os.WriteFile(path, b.Bytes(), 0644)
|
||||
}
|
||||
@@ -509,11 +516,22 @@ func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
|
||||
if name == "" {
|
||||
name = "run"
|
||||
}
|
||||
start := row.StageStartSec
|
||||
end := row.StageEndSec
|
||||
if end <= start {
|
||||
start = row.ElapsedSec
|
||||
end = row.ElapsedSec
|
||||
}
|
||||
if len(spans) == 0 || spans[len(spans)-1].Name != name {
|
||||
spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec})
|
||||
spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
|
||||
continue
|
||||
}
|
||||
spans[len(spans)-1].End = row.ElapsedSec
|
||||
if start < spans[len(spans)-1].Start {
|
||||
spans[len(spans)-1].Start = start
|
||||
}
|
||||
if end > spans[len(spans)-1].End {
|
||||
spans[len(spans)-1].End = end
|
||||
}
|
||||
}
|
||||
for i := range spans {
|
||||
if spans[i].End <= spans[i].Start {
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@@ -18,7 +19,7 @@ type InstallDisk struct {
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
|
||||
const squashfsGlob = "/run/live/medium/live/*.squashfs"
|
||||
|
||||
// ListInstallDisks returns block devices suitable for installation.
|
||||
// Excludes the current live boot medium but includes USB drives.
|
||||
@@ -176,11 +177,22 @@ func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
func MinInstallBytes() int64 {
|
||||
fi, err := os.Stat(squashfsPath)
|
||||
if err != nil {
|
||||
files, err := filepath.Glob(squashfsGlob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return 0
|
||||
}
|
||||
return fi.Size() * 3 / 2
|
||||
var total int64
|
||||
for _, path := range files {
|
||||
fi, statErr := os.Stat(path)
|
||||
if statErr != nil {
|
||||
continue
|
||||
}
|
||||
total += fi.Size()
|
||||
}
|
||||
if total == 0 {
|
||||
return 0
|
||||
}
|
||||
return total * 3 / 2
|
||||
}
|
||||
|
||||
// toramActive returns true when the live system was booted with toram.
|
||||
@@ -222,12 +234,10 @@ func DiskWarnings(d InstallDisk) []string {
|
||||
humanBytes(min), humanBytes(d.SizeBytes)))
|
||||
}
|
||||
if toramActive() {
|
||||
sqFi, err := os.Stat(squashfsPath)
|
||||
if err == nil {
|
||||
free := freeMemBytes()
|
||||
if free > 0 && free < sqFi.Size()*2 {
|
||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||
}
|
||||
free := freeMemBytes()
|
||||
min := MinInstallBytes()
|
||||
if free > 0 && min > 0 && free < (min*4/3) {
|
||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||
}
|
||||
}
|
||||
return w
|
||||
|
||||
@@ -11,20 +11,27 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
	// installToRAMDir is the directory that holds the RAM copy of the live
	// medium's squashfs files.
	installToRAMDir = "/dev/shm/bee-live"
	// copyProgressLogStep is 100 MiB; presumably the byte interval between
	// copy-progress log messages — confirm at the use site.
	copyProgressLogStep int64 = 100 * 1024 * 1024
)

// The hooks below are package-level function variables rather than plain
// functions — presumably so tests can replace them; verify against the tests.
var (
	// liveMediumSquashfsGlob lists the squashfs files on the mounted live medium.
	liveMediumSquashfsGlob = func() ([]string, error) {
		return filepath.Glob("/run/live/medium/live/*.squashfs")
	}

	// runRemountMedium runs bee-remount-medium and returns its combined output.
	runRemountMedium = func() ([]byte, error) {
		return exec.Command("bee-remount-medium").CombinedOutput()
	}

	// umountLiveMedium unmounts /run/live/medium.
	umountLiveMedium = func() error {
		return exec.Command("umount", "/run/live/medium").Run()
	}

	// ejectDevice runs eject on the given device.
	ejectDevice = func(device string) error {
		return exec.Command("eject", device).Run()
	}
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
if fsType == "" {
|
||||
// No medium mount at all — fall back to toram kernel parameter.
|
||||
return toramActive()
|
||||
}
|
||||
if strings.EqualFold(fsType, "tmpfs") {
|
||||
return true
|
||||
}
|
||||
// When RunInstallToRAM copies squashfs to /dev/shm/bee-live but the bind
|
||||
// mount of /run/live/medium fails (common for CD-ROM boots), the medium
|
||||
// fstype still shows the CD-ROM type. Check whether the RAM copy exists.
|
||||
files, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
|
||||
return len(files) > 0
|
||||
return s.LiveMediaRAMState().InRAM
|
||||
}
|
||||
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
@@ -56,42 +63,163 @@ func (s *System) LiveBootSource() LiveBootSource {
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
|
||||
return evaluateLiveMediaRAMState(
|
||||
s.LiveBootSource(),
|
||||
toramActive(),
|
||||
globPaths("/run/live/medium/live/*.squashfs"),
|
||||
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
|
||||
)
|
||||
}
|
||||
|
||||
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
|
||||
state := LiveMediaRAMState{
|
||||
LiveBootSource: status,
|
||||
ToramActive: toram,
|
||||
CopyPresent: len(copiedSquashfs) > 0,
|
||||
}
|
||||
if status.InRAM {
|
||||
state.State = "in_ram"
|
||||
state.Status = "ok"
|
||||
state.CopyComplete = true
|
||||
state.Message = "Running from RAM — installation media can be safely disconnected."
|
||||
return state
|
||||
}
|
||||
|
||||
expected := pathBaseSet(sourceSquashfs)
|
||||
copied := pathBaseSet(copiedSquashfs)
|
||||
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
|
||||
|
||||
switch {
|
||||
case state.CopyComplete:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
|
||||
case state.CopyPresent:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
|
||||
case toram:
|
||||
state.State = "toram_failed"
|
||||
state.Status = "failed"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
|
||||
default:
|
||||
state.State = "not_in_ram"
|
||||
state.Status = "warning"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
|
||||
}
|
||||
return state
|
||||
}
|
||||
|
||||
// globPaths returns the paths matching pattern, discarding any error from
// filepath.Glob so callers can treat a failed glob as "no matches".
func globPaths(pattern string) []string {
	found, _ := filepath.Glob(pattern) // error deliberately ignored: best-effort lookup
	return found
}
|
||||
|
||||
// pathBaseSet returns the set of base file names of paths. Each base name is
// trimmed of surrounding whitespace; names that end up empty are left out.
// The result is never nil, even for empty input.
func pathBaseSet(paths []string) map[string]struct{} {
	names := make(map[string]struct{}, len(paths))
	for i := range paths {
		name := strings.TrimSpace(filepath.Base(paths[i]))
		if name == "" {
			continue
		}
		names[name] = struct{}{}
	}
	return names
}
|
||||
|
||||
// setContainsAll reports whether every name in want is also in have.
// An empty want is never considered satisfied and yields false.
func setContainsAll(have, want map[string]struct{}) bool {
	for required := range want {
		_, present := have[required]
		if !present {
			return false
		}
	}
	// Reached with nothing missing; an empty want still counts as false.
	return len(want) > 0
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
|
||||
log := func(msg string) {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
}
|
||||
|
||||
if s.IsLiveMediaInRAM() {
|
||||
state := s.LiveMediaRAMState()
|
||||
if state.InRAM {
|
||||
log("Already running from RAM — installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
if err != nil || len(squashfsFiles) == 0 {
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||
}
|
||||
squashfsFiles, sourceAvailable := ensureLiveMediumAvailable(log)
|
||||
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
dstDir := installToRAMDir
|
||||
|
||||
// If the source medium is unavailable, check whether a previous run already
|
||||
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||
// directly to the loop-rebind / bind-mount steps.
|
||||
if !sourceAvailable {
|
||||
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(copiedFiles) > 0 {
|
||||
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||
// Proceed to rebind with the already-copied files.
|
||||
for _, dst := range copiedFiles {
|
||||
base := filepath.Base(dst)
|
||||
// Re-associate the loop device that was originally backed by the
|
||||
// source file (now gone); find it by the old source path pattern.
|
||||
srcGuess := "/run/live/medium/live/" + base
|
||||
loopDev, lerr := findLoopForFile(srcGuess)
|
||||
if lerr != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||
continue
|
||||
}
|
||||
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
goto bindMedium
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry (or run bee-remount-medium as root)", dstDir)
|
||||
}
|
||||
|
||||
dstDir := "/dev/shm/bee-live"
|
||||
{
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
}
|
||||
|
||||
if state.CopyPresent {
|
||||
log("Removing stale partial RAM copy before retry...")
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr == nil {
|
||||
return
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
log("Removed incomplete RAM copy.")
|
||||
}()
|
||||
|
||||
for _, sf := range squashfsFiles {
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -117,6 +245,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
||||
}
|
||||
}
|
||||
|
||||
bindMedium:
|
||||
log("Copying remaining medium files...")
|
||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||
@@ -140,10 +269,83 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
||||
if status.InRAM {
|
||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||
}
|
||||
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||
detachInstallMedium(status, log)
|
||||
log("Done. Squashfs files are in RAM. Installation media has been detached when possible.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func tryRemountLiveMedium(log func(string)) error {
|
||||
output, err := runRemountMedium()
|
||||
trimmed := strings.TrimSpace(string(output))
|
||||
if err != nil {
|
||||
if trimmed != "" && log != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
log("bee-remount-medium: " + line)
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
if trimmed != "" && log != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
log("bee-remount-medium: " + line)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func ensureLiveMediumAvailable(log func(string)) ([]string, bool) {
|
||||
squashfsFiles, err := liveMediumSquashfsGlob()
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
if sourceAvailable {
|
||||
return squashfsFiles, true
|
||||
}
|
||||
|
||||
if log != nil {
|
||||
log("Live medium not mounted at /run/live/medium — attempting automatic remount scan...")
|
||||
}
|
||||
if remountErr := tryRemountLiveMedium(log); remountErr != nil {
|
||||
if log != nil {
|
||||
log(fmt.Sprintf("Automatic remount did not restore the live medium: %v", remountErr))
|
||||
}
|
||||
return squashfsFiles, false
|
||||
}
|
||||
|
||||
squashfsFiles, err = liveMediumSquashfsGlob()
|
||||
sourceAvailable = err == nil && len(squashfsFiles) > 0
|
||||
if sourceAvailable && log != nil {
|
||||
log("Live medium restored after remount scan.")
|
||||
}
|
||||
return squashfsFiles, sourceAvailable
|
||||
}
|
||||
|
||||
func detachInstallMedium(status LiveBootSource, log func(string)) {
|
||||
if log == nil {
|
||||
log = func(string) {}
|
||||
}
|
||||
|
||||
log("Detaching original installation medium...")
|
||||
if err := umountLiveMedium(); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not unmount /run/live/medium: %v", err))
|
||||
} else {
|
||||
log("Unmounted /run/live/medium.")
|
||||
}
|
||||
|
||||
device := strings.TrimSpace(status.Device)
|
||||
if device == "" {
|
||||
device = strings.TrimSpace(status.Source)
|
||||
}
|
||||
if device == "" || !strings.HasPrefix(device, "/dev/") {
|
||||
log("No block device identified for eject; skipping media eject.")
|
||||
return
|
||||
}
|
||||
|
||||
if err := ejectDevice(device); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not eject %s: %v", device, err))
|
||||
return
|
||||
}
|
||||
log(fmt.Sprintf("Ejected %s.", device))
|
||||
}
|
||||
|
||||
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||
if status.InRAM {
|
||||
return nil
|
||||
@@ -206,6 +408,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
defer out.Close()
|
||||
total := fi.Size()
|
||||
var copied int64
|
||||
var lastLogged int64
|
||||
buf := make([]byte, 4*1024*1024)
|
||||
for {
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -217,7 +420,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
return werr
|
||||
}
|
||||
copied += int64(n)
|
||||
if logFunc != nil && total > 0 {
|
||||
if shouldLogCopyProgress(copied, total, lastLogged) {
|
||||
lastLogged = copied
|
||||
pct := int(float64(copied) / float64(total) * 100)
|
||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||
}
|
||||
@@ -232,6 +436,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
return out.Sync()
|
||||
}
|
||||
|
||||
func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
|
||||
if total <= 0 || copied <= 0 {
|
||||
return false
|
||||
}
|
||||
if copied >= total {
|
||||
return copied > lastLogged
|
||||
}
|
||||
if copied < copyProgressLogStep {
|
||||
return false
|
||||
}
|
||||
return copied-lastLogged >= copyProgressLogStep
|
||||
}
|
||||
|
||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||
if ctx.Err() != nil {
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -58,3 +61,222 @@ func TestDescribeLiveBootSource(t *testing.T) {
|
||||
t.Fatalf("got %q want /run/live/medium", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("in_ram", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
|
||||
false,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("partial_copy_after_cancel", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
false,
|
||||
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
|
||||
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
|
||||
)
|
||||
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
if state.CopyComplete {
|
||||
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("toram_failed", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
true,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestShouldLogCopyProgress(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
total := int64(250 * 1024 * 1024)
|
||||
step := int64(100 * 1024 * 1024)
|
||||
|
||||
if shouldLogCopyProgress(step-1, total, 0) {
|
||||
t.Fatal("progress logged too early")
|
||||
}
|
||||
if !shouldLogCopyProgress(step, total, 0) {
|
||||
t.Fatal("expected log at first 100MB boundary")
|
||||
}
|
||||
if shouldLogCopyProgress(step+16*1024*1024, total, step) {
|
||||
t.Fatal("progress logged again before next 100MB")
|
||||
}
|
||||
if !shouldLogCopyProgress(2*step, total, step) {
|
||||
t.Fatal("expected log at second 100MB boundary")
|
||||
}
|
||||
if !shouldLogCopyProgress(total, total, 2*step) {
|
||||
t.Fatal("expected final completion log")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTryRemountLiveMedium(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
orig := runRemountMedium
|
||||
t.Cleanup(func() {
|
||||
runRemountMedium = orig
|
||||
})
|
||||
|
||||
t.Run("success", func(t *testing.T) {
|
||||
runRemountMedium = func() ([]byte, error) {
|
||||
return []byte("[10:57:31] Mounted /dev/sr1 on /run/live/medium\n"), nil
|
||||
}
|
||||
var logs []string
|
||||
if err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) }); err != nil {
|
||||
t.Fatalf("tryRemountLiveMedium() error = %v", err)
|
||||
}
|
||||
if len(logs) != 1 || logs[0] != "bee-remount-medium: [10:57:31] Mounted /dev/sr1 on /run/live/medium" {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("failure", func(t *testing.T) {
|
||||
runRemountMedium = func() ([]byte, error) {
|
||||
return []byte("must be run as root\n"), fmt.Errorf("exit status 1")
|
||||
}
|
||||
var logs []string
|
||||
err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) })
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if len(logs) != 1 || logs[0] != "bee-remount-medium: must be run as root" {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestEnsureLiveMediumAvailableRemountsSource(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
origGlob := liveMediumSquashfsGlob
|
||||
origRemount := runRemountMedium
|
||||
t.Cleanup(func() {
|
||||
liveMediumSquashfsGlob = origGlob
|
||||
runRemountMedium = origRemount
|
||||
})
|
||||
|
||||
callCount := 0
|
||||
liveMediumSquashfsGlob = func() ([]string, error) {
|
||||
callCount++
|
||||
if callCount == 1 {
|
||||
return nil, nil
|
||||
}
|
||||
return []string{"/run/live/medium/live/filesystem.squashfs"}, nil
|
||||
}
|
||||
runRemountMedium = func() ([]byte, error) {
|
||||
return []byte("Mounted /dev/sr1 on /run/live/medium\n"), nil
|
||||
}
|
||||
|
||||
var logs []string
|
||||
files, ok := ensureLiveMediumAvailable(func(msg string) { logs = append(logs, msg) })
|
||||
if !ok {
|
||||
t.Fatal("expected live medium to become available after remount")
|
||||
}
|
||||
if callCount < 2 {
|
||||
t.Fatalf("liveMediumSquashfsGlob called %d times, want at least 2", callCount)
|
||||
}
|
||||
if len(files) != 1 || files[0] != "/run/live/medium/live/filesystem.squashfs" {
|
||||
t.Fatalf("files=%v", files)
|
||||
}
|
||||
found := false
|
||||
for _, msg := range logs {
|
||||
if msg == "Live medium restored after remount scan." {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("expected remount success log, logs=%v", logs)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetachInstallMedium(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
origUmount := umountLiveMedium
|
||||
origEject := ejectDevice
|
||||
t.Cleanup(func() {
|
||||
umountLiveMedium = origUmount
|
||||
ejectDevice = origEject
|
||||
})
|
||||
|
||||
t.Run("success", func(t *testing.T) {
|
||||
var umountCalled bool
|
||||
var ejected string
|
||||
umountLiveMedium = func() error {
|
||||
umountCalled = true
|
||||
return nil
|
||||
}
|
||||
ejectDevice = func(device string) error {
|
||||
ejected = device
|
||||
return nil
|
||||
}
|
||||
var logs []string
|
||||
detachInstallMedium(LiveBootSource{Kind: "cdrom", Device: "/dev/sr1"}, func(msg string) { logs = append(logs, msg) })
|
||||
if !umountCalled {
|
||||
t.Fatal("expected umountLiveMedium to be called")
|
||||
}
|
||||
if ejected != "/dev/sr1" {
|
||||
t.Fatalf("ejected=%q want /dev/sr1", ejected)
|
||||
}
|
||||
if len(logs) < 3 {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("no device", func(t *testing.T) {
|
||||
umountLiveMedium = func() error { return nil }
|
||||
ejectDevice = func(device string) error {
|
||||
t.Fatalf("unexpected eject for %q", device)
|
||||
return nil
|
||||
}
|
||||
var logs []string
|
||||
detachInstallMedium(LiveBootSource{Kind: "ram", Source: "tmpfs"}, func(msg string) { logs = append(logs, msg) })
|
||||
found := false
|
||||
for _, msg := range logs {
|
||||
if msg == "No block device identified for eject; skipping media eject." {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("eject failure is warning only", func(t *testing.T) {
|
||||
umountLiveMedium = func() error { return nil }
|
||||
ejectDevice = func(device string) error { return fmt.Errorf("exit status 1") }
|
||||
var logs []string
|
||||
detachInstallMedium(LiveBootSource{Kind: "usb", Device: "/dev/sdb1"}, func(msg string) { logs = append(logs, msg) })
|
||||
found := false
|
||||
for _, msg := range logs {
|
||||
if msg == "Warning: could not eject /dev/sdb1: exit status 1" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||
@@ -15,6 +18,7 @@ var workerPatterns = []string{
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
"nvbandwidth",
|
||||
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||
"nvvs",
|
||||
@@ -30,7 +34,12 @@ type KilledProcess struct {
|
||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||
// SIGKILL to each one found. It returns a list of killed processes.
|
||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||
// The scan runs under a 5-second deadline to avoid blocking if the process
|
||||
// table is very large (e.g. after a stress test with thousands of children).
|
||||
func KillTestWorkers() []KilledProcess {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
entries, err := os.ReadDir("/proc")
|
||||
if err != nil {
|
||||
return nil
|
||||
@@ -38,6 +47,13 @@ func KillTestWorkers() []KilledProcess {
|
||||
|
||||
var killed []KilledProcess
|
||||
for _, e := range entries {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
|
||||
return killed
|
||||
default:
|
||||
}
|
||||
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
@@ -56,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
|
||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||
base = exe[idx+1:]
|
||||
}
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
break
|
||||
}
|
||||
if shouldKillWorkerProcess(exe, base) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
}
|
||||
}
|
||||
return killed
|
||||
}
|
||||
|
||||
func shouldKillWorkerProcess(exe, base string) bool {
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
39
audit/internal/platform/kill_workers_test.go
Normal file
39
audit/internal/platform/kill_workers_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestShouldKillWorkerProcess(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
exe string
|
||||
base string
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "nvbandwidth executable",
|
||||
exe: "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
|
||||
base: "nvbandwidth",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "dcgmi executable",
|
||||
exe: "/usr/bin/dcgmi",
|
||||
base: "dcgmi",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "unrelated process",
|
||||
exe: "/usr/bin/bash",
|
||||
base: "bash",
|
||||
want: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
|
||||
t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,10 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bee/audit/internal/collector"
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sort"
|
||||
@@ -14,13 +16,24 @@ import (
|
||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||
// collected for the web UI metrics page.
|
||||
type LiveMetricSample struct {
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
PowerSource string `json:"power_source,omitempty"`
|
||||
PowerMode string `json:"power_mode,omitempty"`
|
||||
PowerReason string `json:"power_reason,omitempty"`
|
||||
PSUs []PSUReading `json:"psus,omitempty"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
}
|
||||
|
||||
// PSUReading is the power reading of one power supply, identified by slot.
type PSUReading struct {
	// Slot is the numeric slot of the power supply.
	Slot int `json:"slot"`
	// Name is the display name of the power supply.
	Name string `json:"name"`
	// PowerW is the reported power in watts.
	PowerW float64 `json:"power_w"`
}
|
||||
|
||||
// TempReading is a named temperature sensor value.
|
||||
@@ -54,8 +67,17 @@ func SampleLiveMetrics() LiveMetricSample {
|
||||
}
|
||||
}
|
||||
|
||||
// System power — returns 0 if unavailable
|
||||
s.PowerW = sampleSystemPower()
|
||||
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||
s.PSUs = samplePSUPower()
|
||||
|
||||
// System power: use the global autotune-selected source when configured,
|
||||
// otherwise fall back to the historical heuristic and mark the mode.
|
||||
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
|
||||
s.PowerW = powerW
|
||||
s.PowerSource = decision.EffectiveSource
|
||||
s.PowerMode = decision.Mode
|
||||
s.PowerReason = decision.Reason
|
||||
}
|
||||
|
||||
// CPU load — from /proc/stat
|
||||
s.CPULoadPct = sampleCPULoadPct()
|
||||
@@ -326,3 +348,46 @@ func compactAmbientTempName(chip, name string) string {
|
||||
}
|
||||
return chip + " / " + name
|
||||
}
|
||||
|
||||
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
|
||||
// vendors where PSU sensors may not carry entity ID "10.N".
|
||||
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||
func samplePSUPower() []PSUReading {
|
||||
out, err := exec.Command("ipmitool", "sdr").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
slots := collector.PSUSlotsFromSDR(string(out))
|
||||
if len(slots) == 0 {
|
||||
return nil
|
||||
}
|
||||
// Collect slot keys and sort for stable output.
|
||||
keys := make([]int, 0, len(slots))
|
||||
for k := range slots {
|
||||
n, err := strconv.Atoi(k)
|
||||
if err == nil {
|
||||
keys = append(keys, n)
|
||||
}
|
||||
}
|
||||
sort.Ints(keys)
|
||||
psus := make([]PSUReading, 0, len(keys))
|
||||
for _, k := range keys {
|
||||
entry := slots[strconv.Itoa(k)]
|
||||
// Prefer AC input power; fall back to DC output power.
|
||||
var w float64
|
||||
if entry.InputW != nil && *entry.InputW > 0 {
|
||||
w = *entry.InputW
|
||||
} else if entry.OutputW != nil && *entry.OutputW > 0 {
|
||||
w = *entry.OutputW
|
||||
}
|
||||
if w <= 0 {
|
||||
continue
|
||||
}
|
||||
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
|
||||
}
|
||||
if len(psus) == 0 {
|
||||
return nil
|
||||
}
|
||||
return psus
|
||||
}
|
||||
|
||||
@@ -258,7 +258,7 @@ func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||
func interfaceAdminState(iface string) (bool, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||
if err != nil {
|
||||
return false, err
|
||||
return false, fmt.Errorf("ip link show dev %s: %w", iface, err)
|
||||
}
|
||||
return parseInterfaceAdminState(string(raw))
|
||||
}
|
||||
@@ -288,7 +288,7 @@ func interfaceIPv4Addrs(iface string) ([]string, error) {
|
||||
if errors.As(err, &exitErr) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("ip addr show dev %s: %w", iface, err)
|
||||
}
|
||||
var ipv4 []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
|
||||
51
audit/internal/platform/nvidia_recover.go
Normal file
51
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,51 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// nvidiaRecoverHelper is the path of the helper executed by runNvidiaRecover.
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"

// runNvidiaRecover runs nvidiaRecoverHelper with args through sudo and returns
// the combined stdout/stderr together with the command error.
//
// When systemd-run is found in PATH, the helper is started via
// `systemd-run --quiet --pipe --wait --collect --service-type=oneshot` in a
// unit named "bee-nvidia-recover-<unix-nanoseconds>"; otherwise it is invoked
// directly.
func runNvidiaRecover(args ...string) (string, error) {
	argv := make([]string, 0, 9+len(args))
	if _, lookErr := exec.LookPath("systemd-run"); lookErr == nil {
		argv = append(argv,
			"systemd-run",
			"--quiet",
			"--pipe",
			"--wait",
			"--collect",
			"--service-type=oneshot",
			"--unit", fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano()),
		)
	}
	argv = append(argv, nvidiaRecoverHelper)
	argv = append(argv, args...)

	out, err := exec.Command("sudo", argv...).CombinedOutput()
	return string(out), err
}
|
||||
|
||||
func resetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
|
||||
func restartNvidiaDrivers() (string, error) {
|
||||
out, err := runNvidiaRecover("restart-drivers")
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "NVIDIA drivers restarted.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
|
||||
"bee-audit",
|
||||
"bee-web",
|
||||
"bee-sshsetup",
|
||||
"nvidia-dcgm",
|
||||
"nvidia-fabricmanager",
|
||||
}
|
||||
|
||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||
@@ -53,7 +55,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
if err == nil {
|
||||
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
||||
hasIPv4 := false
|
||||
missingIPv4 := false
|
||||
for _, iface := range interfaces {
|
||||
outcome := "no_offer"
|
||||
if len(iface.IPv4) > 0 {
|
||||
@@ -61,8 +62,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
hasIPv4 = true
|
||||
} else if strings.EqualFold(iface.State, "DOWN") {
|
||||
outcome = "link_down"
|
||||
} else {
|
||||
missingIPv4 = true
|
||||
}
|
||||
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
||||
Name: iface.Name,
|
||||
@@ -71,17 +70,9 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
Outcome: outcome,
|
||||
})
|
||||
}
|
||||
switch {
|
||||
case hasIPv4 && !missingIPv4:
|
||||
if hasIPv4 {
|
||||
health.NetworkStatus = "OK"
|
||||
case hasIPv4:
|
||||
health.NetworkStatus = "PARTIAL"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_partial",
|
||||
Severity: "warning",
|
||||
Description: "At least one interface did not obtain IPv4 connectivity.",
|
||||
})
|
||||
default:
|
||||
} else {
|
||||
health.NetworkStatus = "FAILED"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_failed",
|
||||
@@ -171,25 +162,28 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||
return ToolStatus{Name: display}
|
||||
}
|
||||
|
||||
// collectToRAMHealth checks whether the LiveCD ISO has been copied to RAM.
|
||||
// Status values: "ok" = in RAM, "warning" = toram not active (no copy attempted),
|
||||
// "failed" = toram was requested but medium is not in RAM (copy failed or in progress).
|
||||
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
|
||||
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
|
||||
// incomplete RAM copy exists but runtime still depends on the boot medium,
|
||||
// "failed" = toram was requested but medium is not in RAM.
|
||||
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||
inRAM := s.IsLiveMediaInRAM()
|
||||
active := toramActive()
|
||||
switch {
|
||||
case inRAM:
|
||||
health.ToRAMStatus = "ok"
|
||||
case active:
|
||||
// toram was requested but medium is not yet/no longer in RAM
|
||||
health.ToRAMStatus = "failed"
|
||||
state := s.LiveMediaRAMState()
|
||||
health.ToRAMStatus = state.Status
|
||||
switch state.Status {
|
||||
case "ok":
|
||||
return
|
||||
case "failed":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_failed",
|
||||
Severity: "warning",
|
||||
Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
|
||||
Description: state.Message,
|
||||
})
|
||||
case "partial":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_partial",
|
||||
Severity: "warning",
|
||||
Description: state.Message,
|
||||
})
|
||||
default:
|
||||
health.ToRAMStatus = "warning"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,13 +205,13 @@ func findUSBExportMount() string {
|
||||
|
||||
// fs types that are expected on USB export drives
|
||||
exportFSTypes := map[string]bool{
|
||||
"vfat": true,
|
||||
"exfat": true,
|
||||
"ext2": true,
|
||||
"ext3": true,
|
||||
"ext4": true,
|
||||
"ntfs": true,
|
||||
"ntfs3": true,
|
||||
"vfat": true,
|
||||
"exfat": true,
|
||||
"ext2": true,
|
||||
"ext3": true,
|
||||
"ext4": true,
|
||||
"ntfs": true,
|
||||
"ntfs3": true,
|
||||
"fuseblk": true,
|
||||
}
|
||||
|
||||
|
||||
@@ -20,6 +20,54 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// Estimated wall-clock durations, in seconds, for each SAT/validate test.
// The values are derived from real production logs in _benchmark/_v8/.
//
// Rule: whenever the commands, timeout parameters, or number of sub-jobs in
// the corresponding Run*Pack function change, re-measure the wall-clock
// duration from actual task logs and update the matching constant below.
//
// Measurements the constants are based on:
//   - SATEstimatedCPUValidateSec: xFusion v8.6, 62 s
//   - SATEstimatedMemoryValidateSec: xFusion v8.6, 68 s
//   - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22, 77-87 s/GPU
//   - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22, 444-448 s/GPU
//   - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22, 347-348 s/GPU
//   - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6, 346-351 s/GPU
//   - SATEstimatedNvidiaPulseTestSec: xFusion v8.6, 4926 s / 8 GPU (all simultaneous)
//   - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22, 210-384 s / 8 GPU (all simultaneous)
//   - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22, 2664-2688 s / 8 GPU (all simultaneous)
//
// The four "s/GPU" figures were measured per GPU and must be re-measured
// after the switch to running all GPUs simultaneously.
const (
	// CPU validate: stress-ng 60 s plus lscpu/sensors overhead.
	SATEstimatedCPUValidateSec = 65
	// CPU stress: stress-ng 1800 s (the stress mode default).
	SATEstimatedCPUStressSec = 1800

	// Memory validate: memtester 256 MB, 1 pass.
	SATEstimatedMemoryValidateSec = 70
	// Memory stress: memtester 512 MB, 1 pass. Extrapolated from the validate
	// timing, assuming duration is linear in size.
	SATEstimatedMemoryStressSec = 140

	// NVIDIA dcgmi diag level 2 (medium), all GPUs simultaneously.
	SATEstimatedNvidiaGPUValidateSec = 85
	// NVIDIA dcgmi diag level 3 (targeted stress), all GPUs simultaneously.
	SATEstimatedNvidiaGPUStressSec = 450

	// NVIDIA dcgmi targeted_stress: 300 s plus overhead, all GPUs simultaneously.
	SATEstimatedNvidiaTargetedStressSec = 350
	// NVIDIA dcgmi targeted_power: 300 s plus overhead, all GPUs simultaneously.
	SATEstimatedNvidiaTargetedPowerSec = 350

	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per GPU).
	SATEstimatedNvidiaPulseTestSec = 5000

	// NCCL all_reduce_perf, all GPUs simultaneously.
	SATEstimatedNvidiaInterconnectSec = 300
	// nvbandwidth, all GPUs simultaneously. The tool runs all built-in tests
	// without a user-configurable time limit, so it determines the duration itself.
	SATEstimatedNvidiaBandwidthSec = 2700
)
|
||||
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
@@ -134,9 +182,16 @@ func (s *System) DetectGPUVendor() string {
|
||||
return "amd"
|
||||
}
|
||||
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||
text := strings.ToLower(string(raw))
|
||||
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||
return "amd"
|
||||
// Only match AMD GPU device classes [0300]=VGA, [0302]=3D controller, [0380]=Display.
|
||||
// AMD CPUs also appear in lspci as "Advanced Micro Devices" (Root Complex, IOMMU, etc.)
|
||||
// so matching vendor alone causes false positives on AMD CPU servers without GPUs.
|
||||
for _, line := range strings.Split(strings.ToLower(string(raw)), "\n") {
|
||||
if !strings.Contains(line, "advanced micro devices") && !strings.Contains(line, "amd/ati") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(line, "[0300]") || strings.Contains(line, "[0302]") || strings.Contains(line, "[0380]") {
|
||||
return "amd"
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
@@ -356,22 +411,17 @@ func normalizeNvidiaBusID(v string) string {
|
||||
}
|
||||
|
||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
||||
if len(raw) == 0 && err == nil {
|
||||
raw = []byte("GPU reset completed.\n")
|
||||
}
|
||||
return string(raw), err
|
||||
return resetNvidiaGPU(index)
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
gpuCount := len(selected)
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
@@ -380,7 +430,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
@@ -393,11 +443,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
||||
profCmd []string
|
||||
profEnv []string
|
||||
)
|
||||
if staggerSec > 0 && len(selected) > 1 {
|
||||
if len(selected) > 1 {
|
||||
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||
// of CUDA_VISIBLE_DEVICES.
|
||||
stagger := staggerSec
|
||||
if stagger < 0 {
|
||||
stagger = 0
|
||||
}
|
||||
profCmd = []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||
"--stagger-seconds", strconv.Itoa(stagger),
|
||||
"--devices", joinIndexList(selected),
|
||||
}
|
||||
} else {
|
||||
@@ -426,6 +484,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -443,6 +508,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -460,6 +532,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
@@ -552,9 +631,19 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
|
||||
if passes <= 0 {
|
||||
passes = 1
|
||||
}
|
||||
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||||
// intentionally conservative enough for healthy systems while avoiding the
|
||||
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||||
timeoutSec := sizeMB*passes*20/100 + 60
|
||||
if timeoutSec < 180 {
|
||||
timeoutSec = 180
|
||||
}
|
||||
if timeoutSec > 900 {
|
||||
timeoutSec = 900
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -42,27 +43,56 @@ type GPUStressMetric struct {
|
||||
|
||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||
type FanStressRow struct {
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64
|
||||
SysPowerSource string
|
||||
SysPowerMode string
|
||||
}
|
||||
|
||||
// cachedPowerReading is one remembered system power sample together with the
// metadata that was supplied when it was stored.
type cachedPowerReading struct {
	Value     float64   // power value of the stored sample
	Source    string    // source label passed in with the sample
	Mode      string    // mode label passed in with the sample
	Reason    string    // reason string passed in with the sample
	UpdatedAt time.Time // time at which the sample was stored
}
|
||||
|
||||
// fanObservationState is the fan-maximum record that is kept in memory and
// serialised to JSON on disk.
type fanObservationState struct {
	// MaxRPM maps a whitespace-trimmed fan name to the maximum RPM recorded for it.
	MaxRPM map[string]float64 `json:"max_rpm"`
}
|
||||
|
||||
// fanPeakCandidate tracks a fan reading that is above the recorded maximum but
// has not yet been accepted as the new maximum.
type fanPeakCandidate struct {
	FirstSeen time.Time // when a reading above the recorded maximum was first seen
	RPM       float64   // highest RPM seen for this fan since FirstSeen
}
|
||||
|
||||
// Package-level sampling state.
//
// systemPowerCacheMu is held while systemPowerCache is read and replaced.
// fanObservationMu is held by the functions that read or modify
// fanObservation, fanObservationInit and fanPeakCandidates.
// fanObservationInit records that the one-time load of fanObservation has
// already been attempted; fanPeakCandidates is keyed by fan name.
var (
	systemPowerCacheMu sync.Mutex
	systemPowerCache   cachedPowerReading
	fanObservationMu   sync.Mutex
	fanObservation     fanObservationState
	fanObservationInit bool
	fanPeakCandidates  = make(map[string]fanPeakCandidate)
)
|
||||
|
||||
// systemPowerHoldTTL is the maximum age of a cached power reading that
// effectiveSystemPowerReading will still take into account.
const systemPowerHoldTTL = 15 * time.Second

// fanObservationStatePath is the JSON file in which observed fan maxima are
// persisted. It is a variable rather than a constant; the tests point it at a
// temporary directory.
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"

// fanObservationMinPeakHold is how long a reading above the recorded maximum
// must have been pending before updateFanObservation accepts a new maximum.
const fanObservationMinPeakHold = time.Second
|
||||
|
||||
// normalizeObservedFanMaxRPM rounds a positive RPM value up to the next
// multiple of 1000. Zero and negative inputs yield 0.
func normalizeObservedFanMaxRPM(rpm float64) float64 {
	const step = 1000.0
	switch {
	case rpm <= 0:
		return 0
	default:
		return math.Ceil(rpm/step) * step
	}
}
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
@@ -253,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
|
||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||
row.Fans, _ = sampleFanSpeeds()
|
||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||
row.SysPowerW = sampleSystemPower()
|
||||
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
|
||||
return row
|
||||
}
|
||||
|
||||
@@ -310,11 +340,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err == nil {
|
||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
}
|
||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||
if len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
if err != nil {
|
||||
@@ -323,6 +355,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
return nil, sensorsErr
|
||||
}
|
||||
|
||||
func loadFanObservationLocked() {
|
||||
if fanObservationInit {
|
||||
return
|
||||
}
|
||||
fanObservationInit = true
|
||||
fanObservation.MaxRPM = make(map[string]float64)
|
||||
raw, err := os.ReadFile(fanObservationStatePath)
|
||||
if err != nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
var persisted fanObservationState
|
||||
if json.Unmarshal(raw, &persisted) != nil {
|
||||
return
|
||||
}
|
||||
for name, rpm := range persisted.MaxRPM {
|
||||
name = strings.TrimSpace(name)
|
||||
if name == "" || rpm <= 0 {
|
||||
continue
|
||||
}
|
||||
fanObservation.MaxRPM[name] = rpm
|
||||
}
|
||||
}
|
||||
|
||||
func saveFanObservationLocked() {
|
||||
if len(fanObservation.MaxRPM) == 0 {
|
||||
return
|
||||
}
|
||||
dir := filepath.Dir(fanObservationStatePath)
|
||||
if dir == "" || dir == "." {
|
||||
dir = "/var/log/bee-sat"
|
||||
}
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return
|
||||
}
|
||||
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||
}
|
||||
|
||||
func updateFanObservation(fans []FanReading, now time.Time) {
|
||||
if len(fans) == 0 {
|
||||
return
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
changed := false
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
currentMax := fanObservation.MaxRPM[name]
|
||||
if fan.RPM <= currentMax {
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if cand, ok := fanPeakCandidates[name]; ok {
|
||||
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
|
||||
newMax := math.Max(cand.RPM, fan.RPM)
|
||||
if newMax > currentMax {
|
||||
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
|
||||
changed = true
|
||||
}
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if fan.RPM > cand.RPM {
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
|
||||
}
|
||||
continue
|
||||
}
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
|
||||
}
|
||||
if changed {
|
||||
saveFanObservationLocked()
|
||||
}
|
||||
}
|
||||
|
||||
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
var samples []float64
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
maxRPM := fanObservation.MaxRPM[name]
|
||||
if maxRPM <= 0 {
|
||||
continue
|
||||
}
|
||||
pct := fan.RPM / maxRPM * 100.0
|
||||
if pct > 100 {
|
||||
pct = 100
|
||||
}
|
||||
if pct < 0 {
|
||||
pct = 0
|
||||
}
|
||||
samples = append(samples, pct)
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
//
|
||||
@@ -428,12 +573,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||
|
||||
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||
// Returns the average duty cycle across all exposed PWM controls.
|
||||
func sampleFanDutyCyclePct() (float64, bool) {
|
||||
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return 0, false
|
||||
fans, fanErr := sampleFanSpeeds()
|
||||
if fanErr != nil {
|
||||
return 0, false, false
|
||||
}
|
||||
return sampleFanDutyCyclePctFromFans(fans)
|
||||
}
|
||||
return parseFanDutyCyclePctSensorsJSON(out)
|
||||
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||
return pct, ok, false
|
||||
}
|
||||
|
||||
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false, false
|
||||
}
|
||||
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||
return pct, true, true
|
||||
}
|
||||
return 0, false, false
|
||||
}
|
||||
|
||||
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||
@@ -608,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
// sampleSystemPowerResolved reads system power via the global autotune source,
|
||||
// falling back to the historical heuristic before autotune or when degraded.
|
||||
func sampleSystemPowerResolved() (float64, string, string) {
|
||||
now := time.Now()
|
||||
current := 0.0
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err == nil {
|
||||
current = parseDCMIPowerReading(string(out))
|
||||
}
|
||||
current, decision, err := SampleSystemPowerResolved("")
|
||||
systemPowerCacheMu.Lock()
|
||||
defer systemPowerCacheMu.Unlock()
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||
if err != nil {
|
||||
current = 0
|
||||
}
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
|
||||
systemPowerCache = updated
|
||||
return value
|
||||
return value, updated.Source, updated.Mode
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||
@@ -643,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
|
||||
if current > 0 {
|
||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
|
||||
return current, cache
|
||||
}
|
||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldPath := fanObservationStatePath
|
||||
oldState := fanObservation
|
||||
oldInit := fanObservationInit
|
||||
oldCandidates := fanPeakCandidates
|
||||
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
t.Cleanup(func() {
|
||||
fanObservationStatePath = oldPath
|
||||
fanObservation = oldState
|
||||
fanObservationInit = oldInit
|
||||
fanPeakCandidates = oldCandidates
|
||||
})
|
||||
|
||||
start := time.Unix(100, 0)
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||
t.Fatalf("single-sample spike should not establish observed max")
|
||||
}
|
||||
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||
|
||||
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("got=%v want ~43.3", got)
|
||||
}
|
||||
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDCMIPowerReading(t *testing.T) {
|
||||
raw := `
|
||||
Instantaneous power reading: 512 Watts
|
||||
@@ -64,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
now := time.Now()
|
||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
|
||||
if got != 480 {
|
||||
t.Fatalf("got=%v want cached 480", got)
|
||||
}
|
||||
@@ -72,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
|
||||
if got != 530 {
|
||||
t.Fatalf("got=%v want 530", got)
|
||||
}
|
||||
@@ -81,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
}
|
||||
|
||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
|
||||
if got != 0 {
|
||||
t.Fatalf("expired cache returned %v want 0", got)
|
||||
}
|
||||
|
||||
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||
if len(env) != 2 {
|
||||
|
||||
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
|
||||
}
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
if name == "bee-nvidia" && action == ServiceRestart {
|
||||
return restartNvidiaDrivers()
|
||||
}
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||
|
||||
@@ -25,6 +25,9 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sensor"}, File: "ipmitool-sensor.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "list"}, File: "ipmitool-sel.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "time", "get"}, File: "ipmitool-sel-time.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
|
||||
@@ -9,6 +9,17 @@ type LiveBootSource struct {
|
||||
Device string `json:"device,omitempty"`
|
||||
}
|
||||
|
||||
// LiveMediaRAMState embeds LiveBootSource and adds state/status strings, a set
// of boolean flags and an optional message, serialised under the JSON keys in
// the field tags. State and Status are always emitted; the other added fields
// are omitted when empty or false.
//
// NOTE(review): presumably this describes whether the live boot media has been
// copied into RAM ("toram") — confirm against the code that populates it.
type LiveMediaRAMState struct {
	LiveBootSource
	State        string `json:"state"`
	Status       string `json:"status"`
	ToramActive  bool   `json:"toram_active,omitempty"`
	CopyPresent  bool   `json:"copy_present,omitempty"`
	CopyComplete bool   `json:"copy_complete,omitempty"`
	CanStartCopy bool   `json:"can_start_copy,omitempty"`
	Message      string `json:"message,omitempty"`
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
||||
package schema
|
||||
|
||||
import "encoding/json"
|
||||
|
||||
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
||||
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
||||
type HardwareIngestRequest struct {
|
||||
@@ -15,17 +17,17 @@ type HardwareIngestRequest struct {
|
||||
}
|
||||
|
||||
type RuntimeHealth struct {
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
|
||||
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
|
||||
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||
@@ -64,8 +66,10 @@ type HardwareSnapshot struct {
|
||||
Storage []HardwareStorage `json:"storage,omitempty"`
|
||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
PlatformConfig *json.RawMessage `json:"platform_config,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
@@ -122,7 +126,7 @@ type HardwareCPU struct {
|
||||
type HardwareMemory struct {
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Location *string `json:"-"` // internal: used for DIMM telemetry matching only
|
||||
Present *bool `json:"present,omitempty"`
|
||||
SizeMB *int `json:"size_mb,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
@@ -143,30 +147,33 @@ type HardwareMemory struct {
|
||||
|
||||
type HardwareStorage struct {
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
SizeGB *int `json:"size_gb,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Interface *string `json:"interface,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
SizeGB *int `json:"size_gb,omitempty"`
|
||||
LogicalBlockSizeBytes *int64 `json:"logical_block_size_bytes,omitempty"`
|
||||
PhysicalBlockSizeBytes *int64 `json:"physical_block_size_bytes,omitempty"`
|
||||
MetadataBytesPerBlock *int64 `json:"metadata_bytes_per_block,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Interface *string `json:"interface,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
type HardwarePCIeDevice struct {
|
||||
@@ -211,6 +218,7 @@ type HardwarePCIeDevice struct {
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
MacAddresses []string `json:"mac_addresses,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
IOMMUGroup *int `json:"iommu_group,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
@@ -256,15 +264,13 @@ type HardwareSensors struct {
|
||||
}
|
||||
|
||||
// HardwareFanSensor is one fan sensor reading in the hardware snapshot.
// Name is always emitted; RPM and Status are optional and omitted when nil.
type HardwareFanSensor struct {
	Name   string  `json:"name"`
	RPM    *int    `json:"rpm,omitempty"`
	Status *string `json:"status,omitempty"`
}
|
||||
|
||||
type HardwarePowerSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
VoltageV *float64 `json:"voltage_v,omitempty"`
|
||||
CurrentA *float64 `json:"current_a,omitempty"`
|
||||
PowerW *float64 `json:"power_w,omitempty"`
|
||||
@@ -273,7 +279,6 @@ type HardwarePowerSensor struct {
|
||||
|
||||
type HardwareTemperatureSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Celsius *float64 `json:"celsius,omitempty"`
|
||||
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
||||
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
||||
@@ -281,11 +286,10 @@ type HardwareTemperatureSensor struct {
|
||||
}
|
||||
|
||||
// HardwareOtherSensor is a generic sensor reading in the hardware snapshot.
// Name is always emitted; Value, Unit and Status are optional and omitted
// when nil.
type HardwareOtherSensor struct {
	Name   string   `json:"name"`
	Value  *float64 `json:"value,omitempty"`
	Unit   *string  `json:"unit,omitempty"`
	Status *string  `json:"status,omitempty"`
}
|
||||
|
||||
type HardwareEventLog struct {
|
||||
|
||||
@@ -44,3 +44,57 @@ func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
|
||||
t.Fatalf("missing event_logs payload: %s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
|
||||
powerOnHours := int64(12450)
|
||||
writtenBytes := int64(9876543210)
|
||||
readBytes := int64(1234567890)
|
||||
lifeRemainingPct := 91.0
|
||||
logicalBlockSizeBytes := int64(512)
|
||||
physicalBlockSizeBytes := int64(4096)
|
||||
metadataBytesPerBlock := int64(8)
|
||||
|
||||
payload := HardwareIngestRequest{
|
||||
CollectedAt: "2026-03-15T15:00:00Z",
|
||||
Hardware: HardwareSnapshot{
|
||||
Board: HardwareBoard{SerialNumber: "SRV-001"},
|
||||
Storage: []HardwareStorage{
|
||||
{
|
||||
SerialNumber: stringPtr("DISK-001"),
|
||||
Model: stringPtr("TestDisk"),
|
||||
LogicalBlockSizeBytes: &logicalBlockSizeBytes,
|
||||
PhysicalBlockSizeBytes: &physicalBlockSizeBytes,
|
||||
MetadataBytesPerBlock: &metadataBytesPerBlock,
|
||||
PowerOnHours: &powerOnHours,
|
||||
WrittenBytes: &writtenBytes,
|
||||
ReadBytes: &readBytes,
|
||||
LifeRemainingPct: &lifeRemainingPct,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
data, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
text := string(data)
|
||||
for _, needle := range []string{
|
||||
`"storage":[{`,
|
||||
`"logical_block_size_bytes":512`,
|
||||
`"physical_block_size_bytes":4096`,
|
||||
`"metadata_bytes_per_block":8`,
|
||||
`"power_on_hours":12450`,
|
||||
`"written_bytes":9876543210`,
|
||||
`"read_bytes":1234567890`,
|
||||
`"life_remaining_pct":91`,
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("missing %q in payload: %s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// stringPtr returns a pointer to a fresh copy of v, for filling optional
// *string fields in test fixtures.
func stringPtr(v string) *string {
	p := new(string)
	*p = v
	return p
}
|
||||
|
||||
@@ -110,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
||||
|
||||
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
|
||||
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
|
||||
"nvidia-bandwidth", "nvidia-stress":
|
||||
return true
|
||||
@@ -125,9 +125,11 @@ func defaultTaskPriority(target string, params taskParams) int {
|
||||
return taskPriorityInstall
|
||||
case "install-to-ram":
|
||||
return taskPriorityInstallToRAM
|
||||
case "nvme-format":
|
||||
return taskPriorityInstall
|
||||
case "audit":
|
||||
return taskPriorityAudit
|
||||
case "nvidia-benchmark":
|
||||
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||
return taskPriorityBenchmark
|
||||
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
||||
return taskPriorityBurn
|
||||
@@ -526,14 +528,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||
ParallelGPUs bool `json:"parallel_gpus"`
|
||||
Loader string `json:"loader"`
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||
ParallelGPUs bool `json:"parallel_gpus"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
PlatformComponents []string `json:"platform_components"`
|
||||
@@ -549,14 +551,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
StaggerGPUStart: body.StaggerGPUStart,
|
||||
ParallelGPUs: body.ParallelGPUs,
|
||||
Loader: body.Loader,
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
StaggerGPUStart: body.StaggerGPUStart,
|
||||
ParallelGPUs: body.ParallelGPUs,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
@@ -573,131 +575,208 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
RunNCCL *bool `json:"run_nccl"`
|
||||
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||
RampUp *bool `json:"ramp_up"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
runNCCL := true
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
parallelGPUs := false
|
||||
if body.ParallelGPUs != nil {
|
||||
parallelGPUs = *body.ParallelGPUs
|
||||
}
|
||||
rampUp := false
|
||||
if body.RampUp != nil {
|
||||
rampUp = *body.RampUp
|
||||
}
|
||||
// Build a descriptive base name that includes profile and mode so the task
|
||||
// list is self-explanatory without opening individual task detail pages.
|
||||
profile := strings.TrimSpace(body.Profile)
|
||||
if profile == "" {
|
||||
profile = "standard"
|
||||
}
|
||||
name := taskDisplayName(target, "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
// Append profile tag.
|
||||
name = fmt.Sprintf("%s · %s", name, profile)
|
||||
|
||||
if target == "nvidia-bench-power" && parallelGPUs {
|
||||
writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
|
||||
return
|
||||
}
|
||||
|
||||
if rampUp && len(body.GPUIndices) > 1 {
|
||||
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
|
||||
// in Phase 2 (one additional GPU per step). A single task with all
|
||||
// selected GPUs is sufficient — spawning N tasks with growing subsets
|
||||
// would repeat all earlier steps redundantly.
|
||||
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
if len(resolved) < 2 {
|
||||
// Fall through to normal single-task path.
|
||||
rampUp = false
|
||||
} else {
|
||||
now := time.Now()
|
||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
|
||||
t := &Task{
|
||||
ID: newJobID("bee-bench-nvidia"),
|
||||
Name: taskName,
|
||||
Target: target,
|
||||
Priority: defaultTaskPriority(target, taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: now,
|
||||
params: taskParams{
|
||||
GPUIndices: append([]int(nil), resolved...),
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: true,
|
||||
RampTotal: len(resolved),
|
||||
RampRunID: rampRunID,
|
||||
DisplayName: taskName,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeTaskRunResponse(w, []*Task{t})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// For non-ramp tasks append mode tag.
|
||||
if parallelGPUs {
|
||||
name = fmt.Sprintf("%s · parallel", name)
|
||||
} else {
|
||||
name = fmt.Sprintf("%s · sequential", name)
|
||||
}
|
||||
|
||||
params := taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: parallelGPUs,
|
||||
DisplayName: body.DisplayName,
|
||||
}
|
||||
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
BenchmarkKind string `json:"benchmark_kind"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
profile := strings.TrimSpace(body.Profile)
|
||||
if profile == "" {
|
||||
profile = "standard"
|
||||
}
|
||||
benchmarkKind := strings.TrimSpace(body.BenchmarkKind)
|
||||
if benchmarkKind == "" {
|
||||
benchmarkKind = "power-fit"
|
||||
}
|
||||
now := time.Now()
|
||||
taskName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, benchmarkKind)
|
||||
t := &Task{
|
||||
ID: newJobID("bee-bench-autotune"),
|
||||
Name: taskName,
|
||||
Target: "nvidia-bench-autotune",
|
||||
Priority: defaultTaskPriority("nvidia-bench-autotune", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: now,
|
||||
params: taskParams{
|
||||
BenchmarkProfile: profile,
|
||||
BenchmarkKind: benchmarkKind,
|
||||
SizeMB: body.SizeMB,
|
||||
DisplayName: taskName,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeTaskRunResponse(w, []*Task{t})
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
RunNCCL *bool `json:"run_nccl"`
|
||||
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||
RampUp *bool `json:"ramp_up"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
runNCCL := true
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
parallelGPUs := false
|
||||
if body.ParallelGPUs != nil {
|
||||
parallelGPUs = *body.ParallelGPUs
|
||||
}
|
||||
rampUp := false
|
||||
if body.RampUp != nil {
|
||||
rampUp = *body.RampUp
|
||||
}
|
||||
// Build a descriptive base name that includes profile and mode so the task
|
||||
// list is self-explanatory without opening individual task detail pages.
|
||||
profile := strings.TrimSpace(body.Profile)
|
||||
if profile == "" {
|
||||
profile = "standard"
|
||||
}
|
||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
// Append profile tag.
|
||||
name = fmt.Sprintf("%s · %s", name, profile)
|
||||
|
||||
if rampUp && len(body.GPUIndices) > 1 {
|
||||
// Ramp-up mode: resolve GPU list, then create one task per prefix
|
||||
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
|
||||
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
if len(resolved) < 2 {
|
||||
// Fall through to normal single-task path.
|
||||
rampUp = false
|
||||
} else {
|
||||
now := time.Now()
|
||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||
var allTasks []*Task
|
||||
for step := 1; step <= len(resolved); step++ {
|
||||
subset := resolved[:step]
|
||||
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
|
||||
t := &Task{
|
||||
ID: newJobID("benchmark-nvidia"),
|
||||
Name: stepName,
|
||||
Target: "nvidia-benchmark",
|
||||
Priority: defaultTaskPriority("nvidia-benchmark", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: now,
|
||||
params: taskParams{
|
||||
GPUIndices: append([]int(nil), subset...),
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL && step == len(resolved),
|
||||
ParallelGPUs: true,
|
||||
RampStep: step,
|
||||
RampTotal: len(resolved),
|
||||
RampRunID: rampRunID,
|
||||
DisplayName: stepName,
|
||||
},
|
||||
}
|
||||
allTasks = append(allTasks, t)
|
||||
}
|
||||
for _, t := range allTasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, allTasks)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// For non-ramp tasks append mode tag.
|
||||
if parallelGPUs {
|
||||
name = fmt.Sprintf("%s · parallel", name)
|
||||
} else {
|
||||
name = fmt.Sprintf("%s · sequential", name)
|
||||
}
|
||||
|
||||
params := taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: parallelGPUs,
|
||||
DisplayName: body.DisplayName,
|
||||
}
|
||||
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", defaultTaskPriority("nvidia-benchmark", params), time.Now(), params, name, h.opts.App, "benchmark-nvidia")
|
||||
cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
if os.IsNotExist(err) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
writeJSON(w, map[string]any{
|
||||
"configured": false,
|
||||
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||
})
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
writeJSON(w, map[string]any{
|
||||
"configured": true,
|
||||
"config": cfg,
|
||||
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -729,12 +808,14 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
if t.job == nil || !t.job.abort() {
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "not_running"})
|
||||
return
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborting"})
|
||||
return
|
||||
}
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborted"})
|
||||
@@ -959,6 +1040,81 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxStatus(w http.ResponseWriter, _ *http.Request) {
|
||||
state, err := app.ReadBlackboxState(filepath.Join(h.opts.ExportDir, "blackbox-state.json"))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
writeJSON(w, app.BlackboxState{Status: "disabled", Targets: []app.BlackboxTargetStatus{}})
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if state.Targets == nil {
|
||||
state.Targets = []app.BlackboxTargetStatus{}
|
||||
}
|
||||
writeJSON(w, state)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxEnable(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var target platform.RemovableTarget
|
||||
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || strings.TrimSpace(target.Device) == "" {
|
||||
writeError(w, http.StatusBadRequest, "device is required")
|
||||
return
|
||||
}
|
||||
targets, err := h.opts.App.ListRemovableTargets()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
allowed := false
|
||||
for _, candidate := range targets {
|
||||
if candidate.Device == target.Device {
|
||||
target = candidate
|
||||
allowed = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "device not in removable target list")
|
||||
return
|
||||
}
|
||||
marker, err := app.EnableBlackboxTarget(target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"status": "ok",
|
||||
"message": "Black-box marker written.",
|
||||
"enrollment_id": marker.EnrollmentID,
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxDisable(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if err := app.DisableBlackboxTarget(req.Device, req.EnrollmentID); err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
writeError(w, http.StatusNotFound, "black-box target not found")
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": "Black-box marker removed."})
|
||||
}
|
||||
|
||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||
@@ -1072,18 +1228,55 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
status := h.opts.App.LiveBootSource()
|
||||
status := h.currentRAMStatus()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(status)
|
||||
}
|
||||
|
||||
type ramStatusResponse struct {
|
||||
platform.LiveMediaRAMState
|
||||
InstallTaskActive bool `json:"install_task_active,omitempty"`
|
||||
CopyTaskActive bool `json:"copy_task_active,omitempty"`
|
||||
CanStartTask bool `json:"can_start_task,omitempty"`
|
||||
BlockedReason string `json:"blocked_reason,omitempty"`
|
||||
}
|
||||
|
||||
func (h *handler) currentRAMStatus() ramStatusResponse {
|
||||
state := h.opts.App.LiveMediaRAMState()
|
||||
resp := ramStatusResponse{LiveMediaRAMState: state}
|
||||
if globalQueue.hasActiveTarget("install") {
|
||||
resp.InstallTaskActive = true
|
||||
resp.BlockedReason = "install to disk is already running"
|
||||
return resp
|
||||
}
|
||||
if globalQueue.hasActiveTarget("install-to-ram") {
|
||||
resp.CopyTaskActive = true
|
||||
resp.BlockedReason = "install to RAM task is already pending or running"
|
||||
return resp
|
||||
}
|
||||
if state.InRAM {
|
||||
resp.BlockedReason = "system is already running from RAM"
|
||||
return resp
|
||||
}
|
||||
resp.CanStartTask = state.CanStartCopy
|
||||
if !resp.CanStartTask && resp.BlockedReason == "" {
|
||||
resp.BlockedReason = state.Message
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
if globalQueue.hasActiveTarget("install") {
|
||||
writeError(w, http.StatusConflict, "install to disk is already running")
|
||||
status := h.currentRAMStatus()
|
||||
if !status.CanStartTask {
|
||||
msg := strings.TrimSpace(status.BlockedReason)
|
||||
if msg == "" {
|
||||
msg = "install to RAM is not available"
|
||||
}
|
||||
writeError(w, http.StatusConflict, msg)
|
||||
return
|
||||
}
|
||||
t := &Task{
|
||||
@@ -1099,12 +1292,28 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
_ = json.NewEncoder(w).Encode(map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISystemReboot(w http.ResponseWriter, r *http.Request) {
|
||||
if err := exec.Command("systemctl", "reboot").Start(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "reboot failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "rebooting"})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISystemShutdown(w http.ResponseWriter, r *http.Request) {
|
||||
if err := exec.Command("systemctl", "poweroff").Start(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "shutdown failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "shutting down"})
|
||||
}
|
||||
|
||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
var standardTools = []string{
|
||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||
"mstflint", "qrencode",
|
||||
"mstflint", "saa",
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -1481,6 +1690,61 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
|
||||
writeJSON(w, map[string]string{"status": "rolled back"})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||
}
|
||||
|
||||
// ── Hardware summary / component detail ──────────────────────────────────────
|
||||
|
||||
// handleAPIHardwareSummary returns the hardware summary card HTML fragment for
|
||||
// htmx polling (hx-get="/api/hardware-summary" hx-swap="outerHTML").
|
||||
func (h *handler) handleAPIHardwareSummary(w http.ResponseWriter, _ *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
fmt.Fprint(w, renderHardwareSummaryCard(h.opts))
|
||||
}
|
||||
|
||||
// handleAPIComponentDetail returns an HTML fragment describing the current and
|
||||
// historical status for one component type (cpu, memory, storage, gpu, psu).
|
||||
func (h *handler) handleAPIComponentDetail(w http.ResponseWriter, r *http.Request) {
|
||||
compType := r.PathValue("type")
|
||||
var exact, prefixes []string
|
||||
var title string
|
||||
switch compType {
|
||||
case "cpu":
|
||||
title = "CPU"
|
||||
exact = []string{"cpu:all"}
|
||||
case "memory":
|
||||
title = "Memory"
|
||||
exact = []string{"memory:all"}
|
||||
prefixes = []string{"memory:"}
|
||||
case "storage":
|
||||
title = "Storage"
|
||||
exact = []string{"storage:all"}
|
||||
prefixes = []string{"storage:"}
|
||||
case "gpu":
|
||||
title = "GPU"
|
||||
prefixes = []string{"pcie:gpu:"}
|
||||
case "psu":
|
||||
title = "PSU"
|
||||
prefixes = []string{"psu:"}
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
|
||||
var records []app.ComponentStatusRecord
|
||||
if h.opts.App != nil && h.opts.App.StatusDB != nil {
|
||||
all := h.opts.App.StatusDB.All()
|
||||
records = matchedRecords(all, exact, prefixes)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
fmt.Fprint(w, renderComponentDetail(title, records))
|
||||
}
|
||||
|
||||
func (h *handler) rollbackPendingNetworkChange() error {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
|
||||
@@ -3,6 +3,8 @@ package webui
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
@@ -44,6 +46,66 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
|
||||
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||
|
||||
h.handleAPIBlackboxStatus(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
var state app.BlackboxState
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &state); err != nil {
|
||||
t.Fatalf("decode state: %v", err)
|
||||
}
|
||||
if state.Status != "disabled" {
|
||||
t.Fatalf("status=%q want disabled", state.Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
||||
exportDir := t.TempDir()
|
||||
statePath := filepath.Join(exportDir, "blackbox-state.json")
|
||||
if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write state: %v", err)
|
||||
}
|
||||
h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||
|
||||
h.handleAPIBlackboxStatus(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
|
||||
t.Fatalf("body=%s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVMeFormatModes(t *testing.T) {
|
||||
raw := `
|
||||
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||
lbaf 2 : ms:0 lbads:12 rp:0
|
||||
`
|
||||
modes := parseNVMeFormatModes(raw)
|
||||
if len(modes) != 3 {
|
||||
t.Fatalf("modes=%#v want 3 modes", modes)
|
||||
}
|
||||
if modes[0].Mode != 0 || modes[0].DataBytes != 512 || modes[0].MetadataBytes != 0 || !modes[0].InUse {
|
||||
t.Fatalf("mode 0=%#v", modes[0])
|
||||
}
|
||||
if modes[1].Label != "MODE 1 (512+8)" {
|
||||
t.Fatalf("mode 1 label=%q", modes[1].Label)
|
||||
}
|
||||
if modes[2].DataBytes != 4096 || modes[2].MetadataBytes != 0 {
|
||||
t.Fatalf("mode 2=%#v", modes[2])
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
@@ -64,7 +126,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
@@ -78,8 +140,8 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-benchmark" {
|
||||
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
|
||||
if task.Target != "nvidia-bench-perf" {
|
||||
t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
|
||||
}
|
||||
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||
@@ -113,7 +175,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
@@ -147,6 +209,88 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
|
||||
// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-power" {
|
||||
t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
if task.params.RampTotal != 3 {
|
||||
t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-autotune" {
|
||||
t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
|
||||
}
|
||||
if task.params.BenchmarkKind != "power-fit" {
|
||||
t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
@@ -202,7 +346,8 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
|
||||
defaultTaskPriority("cpu", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{StressMode: true}),
|
||||
defaultTaskPriority("nvidia-stress", taskParams{}),
|
||||
defaultTaskPriority("nvidia-benchmark", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-perf", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-power", taskParams{}),
|
||||
}
|
||||
want := []int{
|
||||
taskPriorityInstallToRAM,
|
||||
@@ -211,13 +356,14 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
|
||||
taskPriorityValidateStress,
|
||||
taskPriorityBurn,
|
||||
taskPriorityBenchmark,
|
||||
taskPriorityBenchmark,
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5]) {
|
||||
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
|
||||
t.Fatalf("priority order=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
||||
return out
|
||||
}
|
||||
|
||||
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||
// power charts where the filled area of each PSU shows its individual
|
||||
// contribution and the total height equals the combined draw.
|
||||
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range datasets {
|
||||
if len(datasets[i]) == 0 {
|
||||
datasets[i] = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
// Build cumulative sums per time point.
|
||||
cumulative := make([][]float64, len(datasets)+1)
|
||||
for i := range cumulative {
|
||||
cumulative[i] = make([]float64, pointCount)
|
||||
}
|
||||
for i, ds := range datasets {
|
||||
for j, v := range ds {
|
||||
cumulative[i+1][j] = cumulative[i][j] + v
|
||||
}
|
||||
}
|
||||
|
||||
// Scale is based on the total (top cumulative row).
|
||||
total := cumulative[len(cumulative)-1]
|
||||
yMin := floatPtr(0)
|
||||
if yMax == nil {
|
||||
yMax = autoMax120(total)
|
||||
}
|
||||
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||
|
||||
legendItems := make([]metricChartSeries, len(datasets))
|
||||
for i, name := range names {
|
||||
color := metricChartPalette[i%len(metricChartPalette)]
|
||||
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||
}
|
||||
|
||||
// Stats label from totals.
|
||||
statsLabel := chartStatsLabel([][]float64{total})
|
||||
|
||||
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scale)
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
writeSingleAxisY(&b, layout, scale)
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
|
||||
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
// Draw border polylines on top.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
|
||||
writeLegend(&b, layout, legendItems)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
// writeStackedArea draws a filled polygon between two cumulative value arrays
|
||||
// (baseline and top), using the given color at 55% opacity.
|
||||
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
|
||||
n := len(top)
|
||||
if n == 0 {
|
||||
return
|
||||
}
|
||||
if len(baseline) < n {
|
||||
baseline = make([]float64, n)
|
||||
}
|
||||
|
||||
// Forward path along top values, then backward along baseline values.
|
||||
var points strings.Builder
|
||||
for i := 0; i < n; i++ {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
if i > 0 {
|
||||
points.WriteByte(' ')
|
||||
}
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
for i := n - 1; i >= 0; i-- {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
points.WriteByte(' ')
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
|
||||
}
|
||||
|
||||
// writeSVGOpen writes the opening <svg> tag, terminated by a newline, using
// width and height both for the element size and for the viewBox extent.
func writeSVGOpen(b *strings.Builder, width, height int) {
	const openTag = `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">` + "\n"
	b.WriteString(fmt.Sprintf(openTag, width, height, width, height))
}
|
||||
|
||||
76
audit/internal/webui/health_poller.go
Normal file
76
audit/internal/webui/health_poller.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/collector"
|
||||
)
|
||||
|
||||
const (
	// healthPollInterval is the period of the health poller's ticker.
	healthPollInterval = 60 * time.Second
	// psuIPMITimeout bounds a single `ipmitool sdr` invocation made for the PSU poll.
	psuIPMITimeout = 15 * time.Second
)
|
||||
|
||||
// healthPoller runs periodic health checks for hardware components that do not
|
||||
// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
|
||||
type healthPoller struct {
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
|
||||
return &healthPoller{statusDB: statusDB}
|
||||
}
|
||||
|
||||
func (p *healthPoller) start() {
|
||||
goRecoverLoop("health poller", 5*time.Second, p.run)
|
||||
}
|
||||
|
||||
func (p *healthPoller) run() {
|
||||
ticker := time.NewTicker(healthPollInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
p.pollPSU()
|
||||
}
|
||||
}
|
||||
|
||||
func (p *healthPoller) pollPSU() {
|
||||
if p.statusDB == nil {
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "sdr")
|
||||
var out bytes.Buffer
|
||||
cmd.Stdout = &out
|
||||
if err := cmd.Run(); err != nil {
|
||||
// IPMI not available or not a server — skip silently.
|
||||
slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
|
||||
return
|
||||
}
|
||||
|
||||
slots := collector.PSUSlotsFromSDR(out.String())
|
||||
if len(slots) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
const source = "watchdog:psu"
|
||||
for slot, psu := range slots {
|
||||
key := "psu:" + slot
|
||||
status := psu.Status
|
||||
if status == "" {
|
||||
status = "Unknown"
|
||||
}
|
||||
detail := ""
|
||||
switch status {
|
||||
case "Critical":
|
||||
detail = "PSU sensor reported non-OK state"
|
||||
case "Warning":
|
||||
detail = "PSU sensor in warning state"
|
||||
}
|
||||
p.statusDB.Record(key, source, status, detail)
|
||||
}
|
||||
}
|
||||
280
audit/internal/webui/huawei_elabel.go
Normal file
280
audit/internal/webui/huawei_elabel.go
Normal file
@@ -0,0 +1,280 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type (
	// huaweiField is one elabel entry as returned by the read endpoint.
	huaweiField struct {
		Name     string `json:"name"`
		Key      string `json:"key"`
		Value    string `json:"value"`
		ReadOnly bool   `json:"read_only,omitempty"`
	}

	// huaweiChange is one requested elabel modification: the field key and
	// the new value to store.
	huaweiChange struct {
		Key   string `json:"key"`
		Value string `json:"value"`
	}

	// huaweiFieldDef describes how one elabel field is addressed in the OEM
	// IPMI raw commands. The field order matters to positional literals.
	huaweiFieldDef struct {
		Name    string
		Key     string
		FruID   byte
		TypeID  byte
		FieldID byte
		Special string // "chassis-type" | "guid"
	}
)
|
||||
|
||||
var huaweiElabelDefs = []huaweiFieldDef{
|
||||
{"Device Name", "DeviceName", 0x00, 0x06, 0x01, ""},
|
||||
{"Device Serial Number", "DeviceSerialNumber", 0x00, 0x06, 0x03, ""},
|
||||
{"Product Name", "ProductName", 0x00, 0x03, 0x01, ""},
|
||||
{"Product Serial Number", "ProductSerialNumber", 0x00, 0x03, 0x04, ""},
|
||||
{"Product Asset Tag", "ProductAssetTag", 0x00, 0x03, 0x05, ""},
|
||||
{"Product Manufacturer", "ProductManufacturer", 0x00, 0x03, 0x00, ""},
|
||||
{"Mainboard Manufacturer", "MainboardManufacturer", 0x00, 0x02, 0x01, ""},
|
||||
{"Board Product Name", "BoardProductName", 0x00, 0x02, 0x02, ""},
|
||||
{"Chassis Part Number", "ChassisPartnumber", 0x00, 0x01, 0x01, ""},
|
||||
{"Chassis Type", "ChassisType", 0x00, 0x01, 0x00, "chassis-type"},
|
||||
{"IO Chassis Serial", "IOChassisSerialNumber", 0x01, 0x03, 0x04, ""},
|
||||
{"IO Chassis Asset Tag", "IOChassisAssetTag", 0x01, 0x03, 0x05, ""},
|
||||
{"GUID", "GUID", 0x00, 0x00, 0x00, "guid"},
|
||||
}
|
||||
|
||||
// huaweiGetRaw reads a string elabel field via OEM IPMI raw command.
|
||||
// Protocol: ipmitool raw 0x30 0x90 0x05 <fru_id> <type_id> <field_id> 0x00 0x30
|
||||
// Response: <length_byte> <ascii_byte1> ... (null-terminated)
|
||||
func huaweiGetRaw(ctx context.Context, def huaweiFieldDef) (string, error) {
|
||||
if def.Special == "guid" {
|
||||
return huaweiGetGUID(ctx)
|
||||
}
|
||||
args := []string{
|
||||
"0x30", "0x90", "0x05",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
"0x00", "0x30",
|
||||
}
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", append([]string{"raw"}, args...)...).CombinedOutput()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return huaweiParseStringResponse(strings.TrimSpace(string(out)), def.Special), nil
|
||||
}
|
||||
|
||||
// huaweiParseStringResponse turns the whitespace-separated hex bytes printed by
// the OEM IPMI read command into a string.
// Format: <length_byte> <byte1> <byte2> ...
//
// The leading length byte is skipped, not interpreted. For the "chassis-type"
// special the first payload byte is returned formatted as "0x%02x". Otherwise
// payload bytes are decoded up to (not including) the first zero byte or the
// first token that is not a valid 8-bit hex value. Fewer than two tokens, or
// an unparsable chassis type, yield the empty string.
func huaweiParseStringResponse(hexOut, special string) string {
	tokens := strings.Fields(hexOut)
	if len(tokens) < 2 {
		return ""
	}
	payload := tokens[1:]

	if special == "chassis-type" {
		// Response: <length=1> <type_byte>
		code, err := strconv.ParseUint(payload[0], 16, 8)
		if err != nil {
			return ""
		}
		return fmt.Sprintf("0x%02x", code)
	}

	decoded := make([]byte, 0, len(payload))
	for _, tok := range payload {
		v, err := strconv.ParseUint(tok, 16, 8)
		if err != nil || v == 0 {
			break
		}
		decoded = append(decoded, byte(v))
	}
	return string(decoded)
}
|
||||
|
||||
// huaweiGetGUID reads the system GUID via standard IPMI Get System GUID (0x06 0x08).
//
// The 16 response bytes are reversed (iBMC returns them in reversed order) and
// formatted as a canonical UUID in 4-2-2-2-6 byte groups. A response that does
// not consist of exactly 16 tokens yields an empty string and no error. When
// ipmitool fails, the returned error wraps the exec error and carries whatever
// ipmitool printed, so the real cause is not reduced to a bare exit status.
func huaweiGetGUID(ctx context.Context) (string, error) {
	out, err := exec.CommandContext(ctx, "ipmitool", "raw", "0x06", "0x08").CombinedOutput()
	if err != nil {
		if msg := strings.TrimSpace(string(out)); msg != "" {
			return "", fmt.Errorf("get system guid: %w (output: %s)", err, msg)
		}
		return "", fmt.Errorf("get system guid: %w", err)
	}
	parts := strings.Fields(strings.TrimSpace(string(out)))
	if len(parts) != 16 {
		return "", nil
	}

	// Re-reverse the byte order to get the canonical UUID layout.
	var octets [16]string
	for i, p := range parts {
		octets[15-i] = p
	}
	groups := []string{
		strings.Join(octets[0:4], ""),
		strings.Join(octets[4:6], ""),
		strings.Join(octets[6:8], ""),
		strings.Join(octets[8:10], ""),
		strings.Join(octets[10:16], ""),
	}
	return strings.Join(groups, "-"), nil
}
|
||||
|
||||
// huaweiChunks splits a value into 19-byte chunks for the OEM IPMI SET protocol.
// Key byte: bit7=1 means more chunks follow; bits 0-6 = offset into string.
//
// Each returned chunk is a list of "0x%02x" arguments: the key byte, the chunk
// length, then the chunk's bytes. Values longer than 63 bytes are truncated to
// 63 before splitting. An empty value produces the single chunk
// {"0x00", "0x01", "0x00"}.
func huaweiChunks(value string) [][]string {
	const (
		maxLen    = 63
		chunkSize = 19
	)
	if value == "" {
		return [][]string{{"0x00", "0x01", "0x00"}}
	}
	if len(value) > maxLen {
		value = value[:maxLen]
	}

	total := len(value)
	chunks := make([][]string, 0, (total+chunkSize-1)/chunkSize)
	for start := 0; start < total; start += chunkSize {
		stop := start + chunkSize
		if stop > total {
			stop = total
		}
		header := byte(start)
		if stop < total {
			// More data follows this chunk.
			header |= 0x80
		}
		piece := value[start:stop]
		args := make([]string, 0, len(piece)+2)
		args = append(args, fmt.Sprintf("0x%02x", header), fmt.Sprintf("0x%02x", len(piece)))
		for k := 0; k < len(piece); k++ {
			args = append(args, fmt.Sprintf("0x%02x", piece[k]))
		}
		chunks = append(chunks, args)
	}
	return chunks
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelRead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var fields []huaweiField
|
||||
for _, def := range huaweiElabelDefs {
|
||||
val, err := huaweiGetRaw(ctx, def)
|
||||
if err != nil {
|
||||
// First field failure likely means no Huawei BMC — abort with error.
|
||||
if len(fields) == 0 {
|
||||
msg := strings.TrimSpace(err.Error())
|
||||
writeError(w, http.StatusInternalServerError, "huawei elabel not available: "+msg)
|
||||
return
|
||||
}
|
||||
val = ""
|
||||
}
|
||||
fields = append(fields, huaweiField{
|
||||
Name: def.Name,
|
||||
Key: def.Key,
|
||||
Value: val,
|
||||
ReadOnly: def.Special == "guid" || def.Special == "chassis-type",
|
||||
})
|
||||
}
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []huaweiChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
for _, c := range req.Changes {
|
||||
def, ok := defByKey[c.Key]
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnprocessableEntity, "unknown field key: "+c.Key)
|
||||
return
|
||||
}
|
||||
if def.Special == "guid" || def.Special == "chassis-type" {
|
||||
writeError(w, http.StatusUnprocessableEntity, "field is read-only: "+c.Key)
|
||||
return
|
||||
}
|
||||
if len(c.Value) > 63 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value too long (max 63 chars): "+c.Key)
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch < 0x20 || ch > 0x7E {
|
||||
writeError(w, http.StatusUnprocessableEntity, "non-printable character in value for: "+c.Key)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("huawei-elabel-write"),
|
||||
Name: fmt.Sprintf("Huawei Elabel Write (%d field(s))", len(req.Changes)),
|
||||
Target: "huawei-elabel-write",
|
||||
Priority: defaultTaskPriority("huawei-elabel-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{HuaweiElabelChanges: req.Changes},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runHuaweiElabelWriteTask(ctx context.Context, j *jobState, p taskParams) error {
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
// Enable device name effective flag before writing.
|
||||
enableCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x21", "0x04", "0x01")
|
||||
if out, err := enableCmd.CombinedOutput(); err != nil {
|
||||
j.append("Warning: enable flag: " + strings.TrimSpace(string(out)))
|
||||
}
|
||||
|
||||
for _, c := range p.HuaweiElabelChanges {
|
||||
def := defByKey[c.Key]
|
||||
setPrefix := []string{
|
||||
"0x30", "0x90", "0x04",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
}
|
||||
|
||||
chunks := huaweiChunks(c.Value)
|
||||
j.append(fmt.Sprintf("Setting %s = %q (%d chunk(s))", c.Key, c.Value, len(chunks)))
|
||||
|
||||
for _, chunk := range chunks {
|
||||
args := append([]string{"raw"}, setPrefix...)
|
||||
args = append(args, chunk...)
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", args...)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("set %s: %w", c.Key, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Commit after each field.
|
||||
commitCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x06", "0x00", "0xAA")
|
||||
if out, err := commitCmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("commit after %s: %w (output: %s)", c.Key, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
j.append("Committed " + c.Key)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
204
audit/internal/webui/ipmi_fru.go
Normal file
204
audit/internal/webui/ipmi_fru.go
Normal file
@@ -0,0 +1,204 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
type (
	// fruField is one "name : value" line of `ipmitool fru print` output,
	// annotated with the area letter and field index used for editing.
	fruField struct {
		Name     string `json:"name"`
		Value    string `json:"value"`
		Editable bool   `json:"editable"`
		Area     string `json:"area,omitempty"`
		Index    int    `json:"index,omitempty"`
	}

	// fruChange is one requested FRU modification. Area and Index address the
	// field for `ipmitool fru edit`; when Area is empty the write handler
	// resolves both from Name.
	fruChange struct {
		Area  string `json:"area"`
		Index int    `json:"index"`
		Name  string `json:"name"`
		Value string `json:"value"`
	}
)
|
||||
|
||||
// fruEditableFields maps a field's display name to the area letter and field
// index passed to `ipmitool fru edit`. Several names may point at the same
// field: both the vendor-document spelling and ipmitool's abbreviated
// spelling are listed.
var fruEditableFields = map[string]struct {
	Area  string
	Index int
}{
	// Chassis area ("c").
	"Chassis Part Number":   {Area: "c", Index: 0},
	"Chassis Serial Number": {Area: "c", Index: 1},
	"Chassis Serial":        {Area: "c", Index: 1},
	"Chassis Extra":         {Area: "c", Index: 2},

	// Board area ("b").
	"Board Manufacturer":  {Area: "b", Index: 0},
	"Board Mfg":           {Area: "b", Index: 0},
	"Board Product Name":  {Area: "b", Index: 1},
	"Board Product":       {Area: "b", Index: 1},
	"Board Serial Number": {Area: "b", Index: 2},
	"Board Serial":        {Area: "b", Index: 2},
	"Board Part Number":   {Area: "b", Index: 3},

	// Product area ("p").
	"Product Manufacturer":  {Area: "p", Index: 0},
	"Product Name":          {Area: "p", Index: 1},
	"Product Part Number":   {Area: "p", Index: 2},
	"Product Version":       {Area: "p", Index: 3},
	"Product Serial Number": {Area: "p", Index: 4},
	"Product Serial":        {Area: "p", Index: 4},
}
|
||||
|
||||
func parseFRUOutput(output string) []fruField {
|
||||
var fields []fruField
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
// Lines look like: " Field Name : value"
|
||||
trimmed := strings.TrimLeft(line, " \t")
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
colon := strings.Index(trimmed, " : ")
|
||||
if colon < 0 {
|
||||
// try ": " with no leading space before colon
|
||||
colon = strings.Index(trimmed, ": ")
|
||||
if colon < 0 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(trimmed[:colon])
|
||||
value := strings.TrimSpace(trimmed[colon+2:])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
editable, area, idx := fruFieldMeta(name)
|
||||
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(trimmed[:colon])
|
||||
value := strings.TrimSpace(trimmed[colon+3:])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
editable, area, idx := fruFieldMeta(name)
|
||||
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
func fruFieldMeta(name string) (editable bool, area string, index int) {
|
||||
if e, ok := fruEditableFields[name]; ok {
|
||||
return true, e.Area, e.Index
|
||||
}
|
||||
// All fields are shown as editable; server will reject unknown fields.
|
||||
return true, "", 0
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIIPMIFRURead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, "ipmitool fru print: "+msg)
|
||||
return
|
||||
}
|
||||
|
||||
fields := parseFRUOutput(string(out))
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIIPMIFRUWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []fruChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
validAreas := map[string]bool{"c": true, "b": true, "p": true}
|
||||
for i, c := range req.Changes {
|
||||
if c.Area == "" {
|
||||
e, ok := fruEditableFields[c.Name]
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnprocessableEntity, "field not writable via ipmitool: "+c.Name)
|
||||
return
|
||||
}
|
||||
req.Changes[i].Area = e.Area
|
||||
req.Changes[i].Index = e.Index
|
||||
c = req.Changes[i]
|
||||
}
|
||||
if !validAreas[c.Area] {
|
||||
writeError(w, http.StatusUnprocessableEntity, "invalid area: "+c.Area)
|
||||
return
|
||||
}
|
||||
if c.Index < 0 || c.Index > 9 {
|
||||
writeError(w, http.StatusUnprocessableEntity, fmt.Sprintf("invalid index %d", c.Index))
|
||||
return
|
||||
}
|
||||
if len(c.Value) > 64 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value too long (max 64 chars)")
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch > unicode.MaxASCII || (ch < 0x20 && ch != 0) {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value contains non-printable characters")
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("ipmi-fru-write"),
|
||||
Name: fmt.Sprintf("IPMI FRU Write (%d field(s))", len(req.Changes)),
|
||||
Target: "ipmi-fru-write",
|
||||
Priority: defaultTaskPriority("ipmi-fru-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{FRUChanges: req.Changes},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runIPMIFRUWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
|
||||
// Backup current FRU state
|
||||
backupDir := filepath.Join(exportDir, "fru-backups")
|
||||
if err := os.MkdirAll(backupDir, 0755); err != nil {
|
||||
return fmt.Errorf("mkdir fru-backups: %w", err)
|
||||
}
|
||||
stamp := time.Now().Format("20060102150405")
|
||||
backupPath := filepath.Join(backupDir, "fru-"+stamp+".txt")
|
||||
|
||||
backupOut, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("backup fru print: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(backupPath, backupOut, 0644); err != nil {
|
||||
return fmt.Errorf("write backup: %w", err)
|
||||
}
|
||||
j.append("Backup saved to " + backupPath)
|
||||
|
||||
// Apply changes
|
||||
for _, c := range p.FRUChanges {
|
||||
j.append(fmt.Sprintf("Setting %s (%s %d) = %q", c.Name, c.Area, c.Index, c.Value))
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "edit", "0", "field", c.Area, fmt.Sprintf("%d", c.Index), c.Value)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("fru edit %s %d: %w", c.Area, c.Index, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -17,6 +20,25 @@ type jobState struct {
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
serialPrefix string
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logBuf *bufio.Writer
|
||||
}
|
||||
|
||||
// readTaskLogFile returns the contents of the task log at path. At most one
// byte beyond the 50 MB limit is read; a file that turns out to be larger than
// the limit is refused with an error instead of being returned.
func readTaskLogFile(path string) ([]byte, error) {
	const maxTaskLogBytes = 50 << 20

	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// Reading limit+1 bytes is what makes an oversized file detectable.
	content, err := io.ReadAll(io.LimitReader(file, maxTaskLogBytes+1))
	switch {
	case err != nil:
		return nil, err
	case len(content) > maxTaskLogBytes:
		return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
	}
	return content, nil
}
|
||||
|
||||
// abort cancels the job if it has a cancel function and is not yet done.
|
||||
@@ -31,13 +53,21 @@ func (j *jobState) abort() bool {
|
||||
}
|
||||
|
||||
func (j *jobState) append(line string) {
|
||||
j.appendWithOptions(line, true, true)
|
||||
}
|
||||
|
||||
func (j *jobState) appendFromLog(line string) {
|
||||
j.appendWithOptions(line, false, false)
|
||||
}
|
||||
|
||||
func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.lines = append(j.lines, line)
|
||||
if j.logPath != "" {
|
||||
appendJobLog(j.logPath, line)
|
||||
if persistLog && j.logPath != "" {
|
||||
j.writeLogLineLocked(line)
|
||||
}
|
||||
if j.serialPrefix != "" {
|
||||
if serialMirror && j.serialPrefix != "" {
|
||||
taskSerialWriteLine(j.serialPrefix + line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
@@ -48,6 +78,36 @@ func (j *jobState) append(line string) {
|
||||
}
|
||||
}
|
||||
|
||||
// writeLogLineLocked writes a line to the persistent log file, opening it lazily.
|
||||
// Must be called with j.mu held. Uses a buffered writer kept open for the task
|
||||
// lifetime — avoids thousands of open/close syscalls during high-frequency logs.
|
||||
func (j *jobState) writeLogLineLocked(line string) {
|
||||
if j.logFile == nil {
|
||||
f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
j.logFile = f
|
||||
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||
}
|
||||
_, _ = j.logBuf.WriteString(line + "\n")
|
||||
_ = j.logBuf.Flush()
|
||||
}
|
||||
|
||||
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||
func (j *jobState) closeLog() {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
if j.logBuf != nil {
|
||||
_ = j.logBuf.Flush()
|
||||
}
|
||||
if j.logFile != nil {
|
||||
_ = j.logFile.Close()
|
||||
j.logFile = nil
|
||||
j.logBuf = nil
|
||||
}
|
||||
}
|
||||
|
||||
func (j *jobState) finish(errMsg string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
@@ -119,7 +179,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||
if logPath == "" {
|
||||
return j
|
||||
}
|
||||
data, err := os.ReadFile(logPath)
|
||||
data, err := readTaskLogFile(logPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return j
|
||||
}
|
||||
|
||||
@@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() {
|
||||
w.mu.Lock()
|
||||
if w.window != nil {
|
||||
w.recordEvent(evt)
|
||||
} else {
|
||||
evtCopy := evt
|
||||
goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) })
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
@@ -162,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
case "gpu":
|
||||
key = "pcie:gpu:" + normalizeBDF(id)
|
||||
case "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
@@ -180,6 +185,54 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
}
|
||||
}
|
||||
|
||||
// flushImmediate writes a single kmsg event directly to the status DB without a SAT window.
|
||||
// Called when an error is detected outside of any SAT task (always-on watching).
|
||||
func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
const source = "watchdog:kmsg"
|
||||
detail := "kernel: " + truncate(evt.raw, 120)
|
||||
|
||||
var severity string
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
if p.Re.MatchString(evt.raw) {
|
||||
if p.Severity == "critical" {
|
||||
severity = "Critical"
|
||||
} else {
|
||||
severity = "Warning"
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if severity == "" {
|
||||
severity = "Warning"
|
||||
}
|
||||
|
||||
if len(evt.ids) == 0 {
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
w.statusDB.Record(key, source, severity, detail)
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu":
|
||||
key = "pcie:gpu:" + normalizeBDF(id)
|
||||
case "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
w.statusDB.Record(key, source, severity, detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
@@ -232,7 +285,7 @@ func truncate(s string, max int) string {
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||
func isSATTarget(target string) bool {
|
||||
switch target {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||
"platform-stress":
|
||||
|
||||
154
audit/internal/webui/layout.go
Normal file
154
audit/internal/webui/layout.go
Normal file
@@ -0,0 +1,154 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// layoutHead returns the opening markup of a page: the doctype, the <head>
// element containing the HTML-escaped title and the inline stylesheet, and the
// opening <body> tag. title is the only dynamic part; everything else is a
// fixed literal.
func layoutHead(title string) string {
|
||||
return `<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
<title>` + html.EscapeString(title) + `</title>
|
||||
<style>
|
||||
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
||||
*{box-sizing:border-box;margin:0;padding:0}
|
||||
dialog{margin:auto}
|
||||
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
||||
a{color:var(--accent);text-decoration:none}
|
||||
/* Sidebar */
|
||||
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
|
||||
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
||||
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
||||
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
||||
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
|
||||
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
|
||||
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
|
||||
.nav{flex:1}
|
||||
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
||||
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
||||
.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
|
||||
/* Content */
|
||||
.main{flex:1;display:flex;flex-direction:column;overflow:auto}
|
||||
.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
|
||||
.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
|
||||
.content{padding:24px;flex:1}
|
||||
/* Cards */
|
||||
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
|
||||
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
|
||||
.card-head-actions{justify-content:space-between}
|
||||
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
|
||||
.card-body{padding:16px}
|
||||
/* Buttons */
|
||||
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
|
||||
.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
|
||||
.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
|
||||
.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
|
||||
.btn-sm{padding:5px 10px;font-size:12px}
|
||||
/* Tables */
|
||||
table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
|
||||
th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
|
||||
td{padding:9px 14px;border-top:1px solid var(--border-lite)}
|
||||
tr:first-child td{border-top:0}
|
||||
tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
/* Status badges */
|
||||
.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
|
||||
.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||
/* Component chips — one small square per device */
|
||||
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
|
||||
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
|
||||
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||
/* Nav separator and tasks count badge */
|
||||
.nav-sep{height:1px;background:rgba(255,255,255,.12);margin:6px 0}
|
||||
.tasks-nav-count{background:var(--accent);color:#fff;border-radius:10px;padding:1px 7px;font-size:11px;font-weight:700;display:none;margin-left:auto}
|
||||
.tasks-nav-count.active{display:inline}
|
||||
/* Output terminal */
|
||||
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||
/* Forms */
|
||||
.form-row{margin-bottom:14px}
|
||||
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
|
||||
.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
|
||||
.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
|
||||
/* Grid */
|
||||
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
|
||||
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
|
||||
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
|
||||
/* iframe viewer */
|
||||
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
|
||||
/* Alerts */
|
||||
.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
|
||||
.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
|
||||
.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
`
|
||||
}
|
||||
|
||||
// layoutNav renders the sidebar: logo, version line, an optional NVIDIA GSP
// badge, the navigation links and the inline script that polls /api/tasks.
//
// active is the id of the entry to highlight. buildLabel is shown HTML-escaped
// as the version and falls back to "dev" when blank. The GSP badge is emitted
// only when /run/bee-nvidia-mode is readable and contains "gsp-off" or
// "gsp-stuck".
func layoutNav(active string, buildLabel string) string {
	type entry struct {
		id, label, href string
		sep             bool
	}
	entries := []entry{
		{id: "dashboard", label: "Dashboard", href: "/"},
		{id: "audit", label: "1. Audit", href: "/audit"},
		{id: "check", label: "2. Check", href: "/check"},
		{id: "load", label: "3. Load", href: "/load"},
		{id: "burn", label: "4. Burn", href: "/burn"},
		{id: "benchmark", label: "5. Benchmark", href: "/benchmark"},
		{sep: true},
		{id: "tasks", label: "Tasks", href: "/tasks"},
		{id: "tools", label: "Tools", href: "/tools"},
		{id: "settings", label: "Settings", href: "/settings"},
	}

	if strings.TrimSpace(buildLabel) == "" {
		buildLabel = "dev"
	}

	var sb strings.Builder
	sb.WriteString(`<aside class="sidebar">`)
	sb.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
	sb.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)

	// The mode file is optional; any read error just means "no badge".
	if data, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
		switch strings.TrimSpace(string(data)) {
		case "gsp-off":
			sb.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
		case "gsp-stuck":
			sb.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
		}
	}

	sb.WriteString(`<nav class="nav">`)
	for _, e := range entries {
		if e.sep {
			sb.WriteString(`<div class="nav-sep"></div>`)
			continue
		}
		class := "nav-item"
		if e.id == active {
			class += " active"
		}
		switch e.id {
		case "tasks":
			// The tasks link carries ids so the polling script can update it.
			fmt.Fprintf(&sb, `<a class="%s" href="%s" id="tasks-nav-item">%s<span class="tasks-nav-count" id="tasks-nav-count"></span></a>`, class, e.href, e.label)
		default:
			fmt.Fprintf(&sb, `<a class="%s" href="%s">%s</a>`, class, e.href, e.label)
		}
	}
	sb.WriteString(`</nav>`)

	sb.WriteString(`<script>`)
	sb.WriteString(`(function(){function u(){fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(d){var n=Array.isArray(d)?d.filter(function(t){return t.status==='pending'||t.status==='running';}).length:0;var c=document.getElementById('tasks-nav-count');var el=document.getElementById('tasks-nav-item');if(c){c.textContent=n>0?String(n):'';c.className='tasks-nav-count'+(n>0?' active':'');}if(el){el.style.color=n>0?'#f6c90e':'';}}).catch(function(){});}u();setInterval(u,5000);})();`)
	sb.WriteString(`</script>`)
	sb.WriteString(`</aside>`)
	return sb.String()
}
|
||||
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
|
||||
cpu_load_pct REAL,
|
||||
mem_load_pct REAL,
|
||||
power_w REAL,
|
||||
power_source TEXT,
|
||||
power_mode TEXT,
|
||||
power_reason TEXT,
|
||||
PRIMARY KEY (ts)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
|
||||
return err
|
||||
}
|
||||
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
|
||||
}
|
||||
|
||||
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
||||
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -161,14 +173,64 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||
// the overall shape of every chart.
|
||||
//
|
||||
// Called hourly by the metrics collector background goroutine.
|
||||
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
start := deleteOlderThan.Unix()
|
||||
end := downsampleBefore.Unix()
|
||||
if end <= start {
|
||||
return nil
|
||||
}
|
||||
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
_, err := m.db.Exec(`
|
||||
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||
AND ts NOT IN (
|
||||
SELECT MIN(ts) FROM `+table+`
|
||||
WHERE ts >= ? AND ts < ?
|
||||
GROUP BY ts / 60
|
||||
)`, start, end, start, end)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||
func (m *MetricsDB) Prune(before time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
cutTS := before.Unix()
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// LoadBetween returns samples in chronological order within the given time window.
|
||||
@@ -183,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
|
||||
start, end = end, start
|
||||
}
|
||||
return m.loadSamples(
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
start.Unix(), end.Unix(),
|
||||
)
|
||||
}
|
||||
@@ -199,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
type sysRow struct {
|
||||
ts int64
|
||||
cpu, mem, pwr float64
|
||||
powerSource string
|
||||
powerMode string
|
||||
powerReason string
|
||||
}
|
||||
var sysRows []sysRow
|
||||
for rows.Next() {
|
||||
var r sysRow
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
|
||||
continue
|
||||
}
|
||||
sysRows = append(sysRows, r)
|
||||
@@ -313,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
s := platform.LiveMetricSample{
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
PowerSource: r.powerSource,
|
||||
PowerMode: r.powerMode,
|
||||
PowerReason: r.powerReason,
|
||||
}
|
||||
for _, idx := range gpuIndices {
|
||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||
|
||||
368
audit/internal/webui/nvme_format.go
Normal file
368
audit/internal/webui/nvme_format.go
Normal file
@@ -0,0 +1,368 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// nvmeFormatMode is one LBA format of an NVMe namespace in the shape returned
// to the web UI.
type nvmeFormatMode struct {
	// Mode is the LBA format index.
	Mode int `json:"mode"`
	// DataBytes is the data size of one block in bytes.
	DataBytes int64 `json:"data_bytes"`
	// MetadataBytes is the per-block metadata size in bytes.
	MetadataBytes int64 `json:"metadata_bytes"`
	// InUse marks the format the namespace currently uses.
	InUse bool `json:"in_use"`
	// Label is the display text for this mode.
	Label string `json:"label"`
}
|
||||
|
||||
type nvmeFormatDisk struct {
|
||||
Device string `json:"device"`
|
||||
Model string `json:"model,omitempty"`
|
||||
Serial string `json:"serial,omitempty"`
|
||||
Size string `json:"size,omitempty"`
|
||||
CurrentMode int `json:"current_mode"`
|
||||
CurrentFormat string `json:"current_format"`
|
||||
Modes []nvmeFormatMode `json:"modes"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// nvmeListJSON holds the fields of a JSON device listing that this file reads;
// any other keys in the input are ignored by encoding/json.
type nvmeListJSON struct {
	// Devices has one element per listed device.
	Devices []struct {
		// DevicePath is the device node path.
		DevicePath string `json:"DevicePath"`
		// ModelNumber is the reported model string.
		ModelNumber string `json:"ModelNumber"`
		// SerialNumber is the reported serial string.
		SerialNumber string `json:"SerialNumber"`
		// PhysicalSize is the reported size as an integer.
		PhysicalSize int64 `json:"PhysicalSize"`
	} `json:"Devices"`
}
|
||||
|
||||
// Patterns used to validate device paths and to parse LBA format lines.
var (
	// nvmeFormatDeviceRE accepts only whole-namespace paths such as
	// /dev/nvme0n1 (no partitions, no extra characters).
	nvmeFormatDeviceRE = regexp.MustCompile(`^/dev/nvme[0-9]+n[0-9]+$`)

	// nvmeLBAFCompactLineRE captures index, ms and lbads from lines of the
	// form "lbaf N : ms:M lbads:D ...".
	nvmeLBAFCompactLineRE = regexp.MustCompile(`(?im)^\s*lbaf\s+(\d+)\s*:\s*ms:(\d+)\s+lbads:(\d+).*$`)

	// nvmeLBAFVerboseLineRE captures index, metadata size and data size from
	// lines of the form "LBA Format N : Metadata Size: M bytes - Data Size: D bytes ...".
	nvmeLBAFVerboseLineRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+(\d+)\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*$`)
)

// Command execution settings, kept as variables so they can be replaced.
var (
	// nvmeCommandContext builds the external commands run by this file.
	nvmeCommandContext = exec.CommandContext

	// nvmeListFormatsTimeout bounds one full listing of disks and formats.
	nvmeListFormatsTimeout = 20 * time.Second
)
|
||||
|
||||
func listNVMeFormatDisks(ctx context.Context) ([]nvmeFormatDisk, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, nvmeListFormatsTimeout)
|
||||
defer cancel()
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "list", "-o", "json").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var root nvmeListJSON
|
||||
if err := json.Unmarshal(out, &root); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
disks := make([]nvmeFormatDisk, 0, len(root.Devices))
|
||||
seen := map[string]struct{}{}
|
||||
for _, dev := range root.Devices {
|
||||
path := strings.TrimSpace(dev.DevicePath)
|
||||
if !nvmeFormatDeviceRE.MatchString(path) {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[path]; ok {
|
||||
continue
|
||||
}
|
||||
seen[path] = struct{}{}
|
||||
disk := nvmeFormatDisk{
|
||||
Device: path,
|
||||
Model: strings.TrimSpace(dev.ModelNumber),
|
||||
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||
Size: formatNVMeBytes(dev.PhysicalSize),
|
||||
CurrentMode: -1,
|
||||
}
|
||||
modes, parseErr := readNVMeFormatModes(ctx, path)
|
||||
if parseErr != nil {
|
||||
disk.Error = parseErr.Error()
|
||||
}
|
||||
disk.Modes = modes
|
||||
for _, mode := range modes {
|
||||
if mode.InUse {
|
||||
disk.CurrentMode = mode.Mode
|
||||
disk.CurrentFormat = formatNVMeBlock(mode.DataBytes, mode.MetadataBytes)
|
||||
break
|
||||
}
|
||||
}
|
||||
disks = append(disks, disk)
|
||||
}
|
||||
sort.Slice(disks, func(i, j int) bool { return disks[i].Device < disks[j].Device })
|
||||
return disks, nil
|
||||
}
|
||||
|
||||
func readNVMeFormatModes(ctx context.Context, device string) ([]nvmeFormatMode, error) {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return nil, fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "id-ns", device, "-H").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
return nil, fmt.Errorf("%s", msg)
|
||||
}
|
||||
modes := parseNVMeFormatModes(string(out))
|
||||
if len(modes) == 0 {
|
||||
return nil, fmt.Errorf("no LBA format modes found")
|
||||
}
|
||||
return modes, nil
|
||||
}
|
||||
|
||||
func parseNVMeFormatModes(raw string) []nvmeFormatMode {
|
||||
byMode := map[int]nvmeFormatMode{}
|
||||
for _, m := range nvmeLBAFCompactLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
lbads, errLBADS := strconv.Atoi(m[3])
|
||||
if errMode != nil || errMS != nil || errLBADS != nil || lbads < 0 || lbads >= 63 {
|
||||
continue
|
||||
}
|
||||
data := int64(1) << lbads
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
for _, m := range nvmeLBAFVerboseLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
data, errData := strconv.ParseInt(m[3], 10, 64)
|
||||
if errMode != nil || errMS != nil || errData != nil || data <= 0 {
|
||||
continue
|
||||
}
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
modes := make([]nvmeFormatMode, 0, len(byMode))
|
||||
for _, mode := range byMode {
|
||||
modes = append(modes, mode)
|
||||
}
|
||||
sort.Slice(modes, func(i, j int) bool { return modes[i].Mode < modes[j].Mode })
|
||||
return modes
|
||||
}
|
||||
|
||||
func runNVMeFormatTask(ctx context.Context, j *jobState, device string, lbaf int) error {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
modes, err := readNVMeFormatModes(ctx, device)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var selected nvmeFormatMode
|
||||
found := false
|
||||
for _, mode := range modes {
|
||||
if mode.Mode == lbaf {
|
||||
selected = mode
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return fmt.Errorf("MODE %d is not available on %s", lbaf, device)
|
||||
}
|
||||
ms := 0
|
||||
if selected.MetadataBytes > 0 {
|
||||
ms = 1
|
||||
}
|
||||
j.append(fmt.Sprintf("Formatting %s to %s with --lbaf=%d --ms=%d --force", device, formatNVMeBlock(selected.DataBytes, selected.MetadataBytes), selected.Mode, ms))
|
||||
cmd := nvmeCommandContext(ctx, "nvme", "format", device, fmt.Sprintf("--lbaf=%d", selected.Mode), fmt.Sprintf("--ms=%d", ms), "--force")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormats(w http.ResponseWriter, r *http.Request) {
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, disks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormatRun(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
LBAF int `json:"lbaf"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if !nvmeFormatDeviceRE.MatchString(req.Device) {
|
||||
writeError(w, http.StatusBadRequest, "invalid NVMe device")
|
||||
return
|
||||
}
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
var label string
|
||||
allowed := false
|
||||
for _, disk := range disks {
|
||||
if disk.Device != req.Device {
|
||||
continue
|
||||
}
|
||||
for _, mode := range disk.Modes {
|
||||
if mode.Mode == req.LBAF {
|
||||
allowed = true
|
||||
label = mode.Label
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "LBA format mode is not available for this device")
|
||||
return
|
||||
}
|
||||
name := fmt.Sprintf("NVMe Format %s to %s", filepath.Base(req.Device), label)
|
||||
t := &Task{
|
||||
ID: newJobID("nvme-format"),
|
||||
Name: name,
|
||||
Target: "nvme-format",
|
||||
Priority: defaultTaskPriority("nvme-format", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Device: req.Device,
|
||||
LBAF: req.LBAF,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
|
||||
// formatNVMeBlock renders a block layout as "<data>+<metadata>" with both
// sizes in decimal, e.g. "4096+0".
func formatNVMeBlock(dataBytes, metadataBytes int64) string {
	buf := make([]byte, 0, 24)
	buf = strconv.AppendInt(buf, dataBytes, 10)
	buf = append(buf, '+')
	buf = strconv.AppendInt(buf, metadataBytes, 10)
	return string(buf)
}
|
||||
|
||||
// formatNVMeBytes renders a byte count with decimal (power-of-1000) units and
// one fractional digit, e.g. "1.9 TB". Values below 1000 are printed as whole
// bytes ("512 B"); zero and negative values yield the empty string.
//
// Rounding to one decimal can push a value to 1000.0 of its unit (999999 bytes
// is 999.999 KB); in that case the result carries into the next unit and reads
// "1.0 MB" rather than "1000.0 KB". The largest unit, PB, has nothing to carry
// into and is printed as is.
func formatNVMeBytes(n int64) string {
	if n <= 0 {
		return ""
	}
	if n < 1000 {
		return strconv.FormatInt(n, 10) + " B"
	}
	units := []string{"KB", "MB", "GB", "TB", "PB"}
	v := float64(n) / 1000
	unit := 0
	for v >= 1000 && unit < len(units)-1 {
		v /= 1000
		unit++
	}
	text := strconv.FormatFloat(v, 'f', 1, 64)
	// v is below 1000 here unless the unit is already PB, so "1000.0" can only
	// be the product of rounding up.
	if text == "1000.0" && unit < len(units)-1 {
		text = "1.0"
		unit++
	}
	return text + " " + units[unit]
}
|
||||
|
||||
func renderNVMeFormatInline() string {
|
||||
return `<div id="nvme-format-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVMe disks...</div>
|
||||
<div id="nvme-format-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<script>
|
||||
function nvmeFormatEsc(s) {
|
||||
return String(s == null ? '' : s).replace(/[&<>"']/g, function(c) {
|
||||
return {'&':'&','<':'<','>':'>','"':'"',"'":'''}[c];
|
||||
});
|
||||
}
|
||||
function loadNVMeFormats() {
|
||||
var status = document.getElementById('nvme-format-status');
|
||||
var table = document.getElementById('nvme-format-table');
|
||||
status.textContent = 'Loading NVMe disks...';
|
||||
status.style.color = 'var(--muted)';
|
||||
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
|
||||
fetch('/api/tools/nvme-formats').then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(disks) {
|
||||
window._nvmeFormatDisks = Array.isArray(disks) ? disks : [];
|
||||
if (!window._nvmeFormatDisks.length) {
|
||||
status.textContent = 'No NVMe disks found.';
|
||||
table.innerHTML = '';
|
||||
return;
|
||||
}
|
||||
status.textContent = window._nvmeFormatDisks.length + ' NVMe disk(s) found.';
|
||||
var rows = window._nvmeFormatDisks.map(function(d, idx) {
|
||||
var current = d.current_format ? (d.current_format + ' / MODE ' + d.current_mode) : 'unknown';
|
||||
var detail = [d.model || '', d.serial || '', d.size || ''].filter(Boolean).join(' | ');
|
||||
var options = (d.modes || []).map(function(m) {
|
||||
return '<option value="' + m.mode + '"' + (m.in_use ? ' selected' : '') + '>' + nvmeFormatEsc(m.label) + '</option>';
|
||||
}).join('');
|
||||
var disabled = options ? '' : ' disabled';
|
||||
var err = d.error ? '<div style="font-size:12px;color:var(--crit-fg,#9f3a38);margin-top:4px">' + nvmeFormatEsc(d.error) + '</div>' : '';
|
||||
return '<tr>'
|
||||
+ '<td style="font-family:monospace;white-space:nowrap">' + nvmeFormatEsc(d.device) + (detail ? '<div style="font-family:inherit;font-size:12px;color:var(--muted)">' + nvmeFormatEsc(detail) + '</div>' : '') + '</td>'
|
||||
+ '<td style="white-space:nowrap">' + nvmeFormatEsc(current) + err + '</td>'
|
||||
+ '<td style="white-space:nowrap"><select id="nvme-format-select-' + idx + '"' + disabled + '>' + options + '</select></td>'
|
||||
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-primary" onclick="nvmeFormatRun(' + idx + ', this)"' + disabled + '>Apply</button><div class="nvme-format-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div></td>'
|
||||
+ '</tr>';
|
||||
}).join('');
|
||||
table.innerHTML = '<table><tr><th>Disk</th><th>Current block / mode</th><th>New mode</th><th>Action</th></tr>' + rows + '</table>';
|
||||
}).catch(function(e) {
|
||||
status.textContent = 'Error loading NVMe disks: ' + e.message;
|
||||
status.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
table.innerHTML = '';
|
||||
});
|
||||
}
|
||||
function nvmeWaitTaskDone(taskID, rowMsg) {
|
||||
var timer = setInterval(function() {
|
||||
fetch('/api/tasks').then(function(r) { return r.json(); }).then(function(tasks) {
|
||||
var task = (tasks || []).find(function(t) { return t.id === taskID; });
|
||||
if (!task) return;
|
||||
if (task.status === 'done' || task.status === 'failed' || task.status === 'cancelled') {
|
||||
clearInterval(timer);
|
||||
rowMsg.textContent = 'Task ' + taskID + ': ' + task.status + (task.error ? ' - ' + task.error : '');
|
||||
rowMsg.style.color = task.status === 'done' ? 'var(--ok,green)' : 'var(--crit-fg,#9f3a38)';
|
||||
loadNVMeFormats();
|
||||
}
|
||||
}).catch(function(){});
|
||||
}, 1500);
|
||||
}
|
||||
function nvmeFormatRun(idx, btn) {
|
||||
var disk = (window._nvmeFormatDisks || [])[idx];
|
||||
var select = document.getElementById('nvme-format-select-' + idx);
|
||||
var row = btn.closest('td');
|
||||
var rowMsg = row.querySelector('.nvme-format-row-msg');
|
||||
if (!disk || !select) return;
|
||||
var lbaf = parseInt(select.value, 10);
|
||||
var mode = (disk.modes || []).find(function(m) { return m.mode === lbaf; });
|
||||
if (!mode) return;
|
||||
if (!window.confirm('Format ' + disk.device + ' to ' + mode.label + '? This erases data on the namespace.')) return;
|
||||
btn.disabled = true;
|
||||
rowMsg.style.color = 'var(--muted)';
|
||||
rowMsg.textContent = 'Queued...';
|
||||
fetch('/api/tools/nvme-format/run', {
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body:JSON.stringify({device: disk.device, lbaf: lbaf})
|
||||
}).then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(d) {
|
||||
rowMsg.textContent = 'Task ' + d.task_id + ' queued.';
|
||||
nvmeWaitTaskDone(d.task_id, rowMsg);
|
||||
}).catch(function(e) {
|
||||
rowMsg.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
rowMsg.textContent = 'Error: ' + e.message;
|
||||
}).finally(function() {
|
||||
btn.disabled = false;
|
||||
});
|
||||
}
|
||||
loadNVMeFormats();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNVMeFormatCard() string {
|
||||
return `<div class="card"><div class="card-head">NVMe Block Format <button class="btn btn-sm btn-secondary" onclick="loadNVMeFormats()" style="margin-left:auto">↻ Refresh</button></div><div class="card-body">` +
|
||||
`<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Lists NVMe namespaces and changes their LBA format through a queued task.</p>` +
|
||||
renderNVMeFormatInline() + `</div></div>`
|
||||
}
|
||||
617
audit/internal/webui/page_benchmark.go
Normal file
617
audit/internal/webui/page_benchmark.go
Normal file
@@ -0,0 +1,617 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// benchmarkHistoryRun is one saved performance benchmark result reduced to
// what the results table needs.
type benchmarkHistoryRun struct {
	// generatedAt is the result's generation timestamp; runs are sorted on it.
	generatedAt time.Time
	// displayTime is generatedAt in local time, formatted "2006-01-02 15:04:05".
	displayTime string
	// gpuScores maps GPU index to that GPU's composite score.
	gpuScores map[int]float64
	// gpuStatuses maps GPU index to that GPU's status string.
	gpuStatuses map[int]string
	// overallStatus is the run-level status; an empty value is rendered as "OK".
	overallStatus string
}
|
||||
|
||||
func renderBenchmark(opts HandlerOptions) string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="grid2">
|
||||
<div class="card">
|
||||
<div class="card-head">Benchmark Setup</div>
|
||||
<div class="card-body">
|
||||
<div class="form-row">
|
||||
<label>Profile</label>
|
||||
<select id="benchmark-profile">
|
||||
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||||
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||||
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<label>GPU Selection</label>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
|
||||
</div>
|
||||
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
</div>
|
||||
<label class="benchmark-cb-row">
|
||||
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Sequential — one GPU at a time</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row" id="benchmark-parallel-label">
|
||||
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Parallel — all selected GPUs simultaneously</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row" id="benchmark-ramp-label">
|
||||
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
|
||||
</label>
|
||||
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
|
||||
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
|
||||
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
|
||||
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
|
||||
</div>
|
||||
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||||
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
||||
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Method Split</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||||
<table>
|
||||
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||||
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||||
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||||
</table>
|
||||
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||
|
||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
|
||||
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
let benchmarkES = null;
|
||||
function benchmarkTaskIDs(payload) {
|
||||
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||
if (payload && payload.task_id) return [payload.task_id];
|
||||
return [];
|
||||
}
|
||||
function benchmarkSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function benchmarkMode() {
|
||||
const el = document.querySelector('input[name="benchmark-mode"]:checked');
|
||||
return el ? el.value : 'sequential';
|
||||
}
|
||||
function benchmarkUpdateSelectionNote() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const perfBtn = document.getElementById('benchmark-run-performance-btn');
|
||||
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
|
||||
const note = document.getElementById('benchmark-selection-note');
|
||||
if (!selected.length) {
|
||||
perfBtn.disabled = true;
|
||||
fitBtn.disabled = true;
|
||||
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||||
return;
|
||||
}
|
||||
perfBtn.disabled = false;
|
||||
fitBtn.disabled = false;
|
||||
const mode = benchmarkMode();
|
||||
if (mode === 'ramp-up') {
|
||||
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
|
||||
} else if (mode === 'parallel') {
|
||||
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
|
||||
} else {
|
||||
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
|
||||
}
|
||||
}
|
||||
function benchmarkRenderGPUList(gpus) {
|
||||
const root = document.getElementById('benchmark-gpu-list');
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="benchmark-gpu-row">'
|
||||
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
benchmarkApplyMultiGPUState(gpus.length);
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkApplyMultiGPUState(gpuCount) {
|
||||
var multiValues = ['parallel', 'ramp-up'];
|
||||
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
|
||||
radios.forEach(function(el) {
|
||||
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||||
if (gpuCount < 2 && isMulti) {
|
||||
el.disabled = true;
|
||||
if (el.checked) {
|
||||
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
|
||||
if (seq) seq.checked = true;
|
||||
}
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '0.4';
|
||||
} else {
|
||||
el.disabled = false;
|
||||
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '';
|
||||
}
|
||||
});
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkLoadGPUs() {
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
status.textContent = '';
|
||||
fetch('/api/gpu/nvidia').then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
}).then(function(gpus) {
|
||||
benchmarkRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
});
|
||||
}
|
||||
function benchmarkSelectAll() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkSelectNone() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function runNvidiaBenchmark(kind) {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
if (!selected.length) {
|
||||
status.textContent = 'Select at least one GPU.';
|
||||
return;
|
||||
}
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
const mode = benchmarkMode();
|
||||
const rampUp = mode === 'ramp-up' && selected.length > 1;
|
||||
const parallelGPUs = mode === 'parallel' && kind === 'performance';
|
||||
if (kind === 'power-fit' && mode === 'parallel') {
|
||||
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
|
||||
return;
|
||||
}
|
||||
const body = {
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
gpu_indices: selected,
|
||||
run_nccl: kind === 'performance' && selected.length > 1,
|
||||
parallel_gpus: parallelGPUs,
|
||||
ramp_up: rampUp,
|
||||
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
|
||||
};
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
|
||||
status.textContent = 'Queueing...';
|
||||
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
|
||||
fetch(endpoint, {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(body)
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
const taskIds = benchmarkTaskIDs(d);
|
||||
if (!taskIds.length) throw new Error('No benchmark task was queued.');
|
||||
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
|
||||
const streamNext = function(idx, failures) {
|
||||
if (idx >= taskIds.length) {
|
||||
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
|
||||
return;
|
||||
}
|
||||
const taskId = taskIds[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
|
||||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
if (e.data) failures += 1;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
const isLast = (idx + 1 >= taskIds.length);
|
||||
streamNext(idx + 1, failures);
|
||||
if (isLast) { benchmarkRefreshResults(); }
|
||||
});
|
||||
benchmarkES.onerror = function() {
|
||||
if (benchmarkES) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
streamNext(idx + 1, failures + 1);
|
||||
};
|
||||
};
|
||||
streamNext(0, 0);
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
});
|
||||
}
|
||||
function benchmarkRenderAutotuneStatus(payload) {
|
||||
const el = document.getElementById('benchmark-autotune-status');
|
||||
if (!el) return;
|
||||
if (!payload || !payload.configured || !payload.config) {
|
||||
el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
|
||||
return;
|
||||
}
|
||||
const cfg = payload.config || {};
|
||||
const decision = payload.decision || {};
|
||||
const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
|
||||
const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
|
||||
const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
|
||||
const mode = decision.mode ? (' · mode ' + decision.mode) : '';
|
||||
el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
|
||||
}
|
||||
function loadBenchmarkAutotuneStatus() {
|
||||
fetch('/api/bee-bench/nvidia/autotune/status')
|
||||
.then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
})
|
||||
.then(function(body) { benchmarkRenderAutotuneStatus(body); })
|
||||
.catch(function(err) {
|
||||
const el = document.getElementById('benchmark-autotune-status');
|
||||
if (el) el.textContent = 'Autotune status error: ' + err.message;
|
||||
});
|
||||
}
|
||||
function runBenchmarkAutotune() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
|
||||
term.textContent = 'Enqueuing benchmark autotune...\n';
|
||||
status.textContent = 'Queueing autotune...';
|
||||
fetch('/api/bee-bench/nvidia/autotune/run', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify({
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
|
||||
gpu_indices: selected
|
||||
})
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
const taskIds = benchmarkTaskIDs(d);
|
||||
if (!taskIds.length) throw new Error('No autotune task was queued.');
|
||||
const taskId = taskIds[0];
|
||||
status.textContent = 'Autotune queued: ' + taskId;
|
||||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
if (benchmarkES) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
}
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
|
||||
loadBenchmarkAutotuneStatus();
|
||||
});
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Autotune error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
});
|
||||
}
|
||||
benchmarkLoadGPUs();
|
||||
loadBenchmarkAutotuneStatus();
|
||||
function benchmarkRefreshResults() {
|
||||
fetch('/api/benchmark/results')
|
||||
.then(function(r) { return r.text(); })
|
||||
.then(function(html) {
|
||||
const el = document.getElementById('benchmark-results-section');
|
||||
if (el) el.innerHTML = html;
|
||||
})
|
||||
.catch(function() {});
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCard(exportDir string) string {
|
||||
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||
perf := renderBenchmarkResultsCardFromRuns(
|
||||
"Perf Results",
|
||||
"Composite score by saved benchmark run and GPU.",
|
||||
"No saved performance benchmark runs yet.",
|
||||
maxIdx,
|
||||
runs,
|
||||
)
|
||||
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||
return perf + "\n" + power
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
||||
if len(runs) == 0 {
|
||||
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
|
||||
if strings.TrimSpace(description) != "" {
|
||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<div style="overflow-x:auto">`)
|
||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||||
for i := 0; i <= maxGPUIndex; i++ {
|
||||
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||||
}
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
overallColor := "var(--ok)"
|
||||
overallLabel := run.overallStatus
|
||||
if overallLabel == "" {
|
||||
overallLabel = "OK"
|
||||
}
|
||||
if overallLabel == "FAILED" {
|
||||
overallColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if overallLabel != "OK" {
|
||||
overallColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||||
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||||
score, ok := run.gpuScores[idx]
|
||||
if !ok {
|
||||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||
continue
|
||||
}
|
||||
gpuStatus := run.gpuStatuses[idx]
|
||||
scoreColor := ""
|
||||
switch gpuStatus {
|
||||
case "FAILED":
|
||||
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||||
case "WARNING", "PARTIAL":
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
case "", "OK":
|
||||
default:
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
}
|
||||
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||
}
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||||
baseDir := app.DefaultBeeBenchPerfDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return -1, nil
|
||||
}
|
||||
sort.Strings(paths)
|
||||
return loadBenchmarkHistoryFromPaths(paths)
|
||||
}
|
||||
|
||||
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
|
||||
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||
maxGPUIndex := -1
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var result platform.NvidiaBenchmarkResult
|
||||
if err := json.Unmarshal(raw, &result); err != nil {
|
||||
continue
|
||||
}
|
||||
run := benchmarkHistoryRun{
|
||||
generatedAt: result.GeneratedAt,
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
gpuScores: make(map[int]float64),
|
||||
gpuStatuses: make(map[int]string),
|
||||
overallStatus: result.OverallStatus,
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||
if gpu.Index > maxGPUIndex {
|
||||
maxGPUIndex = gpu.Index
|
||||
}
|
||||
}
|
||||
runs = append(runs, run)
|
||||
}
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
return maxGPUIndex, runs
|
||||
}
|
||||
|
||||
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
baseDir := app.DefaultBeeBenchPowerDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||
}
|
||||
sort.Strings(paths)
|
||||
|
||||
type powerRun struct {
|
||||
generatedAt time.Time
|
||||
displayTime string
|
||||
result platform.NvidiaPowerBenchResult
|
||||
}
|
||||
var runs []powerRun
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var r platform.NvidiaPowerBenchResult
|
||||
if err := json.Unmarshal(raw, &r); err != nil {
|
||||
continue
|
||||
}
|
||||
runs = append(runs, powerRun{
|
||||
generatedAt: r.GeneratedAt,
|
||||
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
result: r,
|
||||
})
|
||||
}
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||
|
||||
latest := runs[0].result
|
||||
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||
if latest.Hostname != "" {
|
||||
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||
}
|
||||
if latest.OverallStatus != "" {
|
||||
statusColor := "var(--ok)"
|
||||
if latest.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||
}
|
||||
b.WriteString(`</p>`)
|
||||
|
||||
if len(latest.GPUs) > 0 {
|
||||
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for _, gpu := range latest.GPUs {
|
||||
finalLimitW := gpu.StablePowerLimitW
|
||||
if finalLimitW <= 0 {
|
||||
finalLimitW = gpu.AppliedPowerLimitW
|
||||
}
|
||||
derated := gpu.Derated ||
|
||||
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||
rowStyle := ""
|
||||
finalStyle := ""
|
||||
if derated {
|
||||
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||
}
|
||||
statusLabel := gpu.Status
|
||||
if statusLabel == "" {
|
||||
statusLabel = "OK"
|
||||
}
|
||||
statusColor := "var(--ok)"
|
||||
if statusLabel == "FAILED" {
|
||||
statusColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if statusLabel != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
nominalStr := "-"
|
||||
if gpu.DefaultPowerLimitW > 0 {
|
||||
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||
}
|
||||
singleStr := "-"
|
||||
if gpu.AppliedPowerLimitW > 0 {
|
||||
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||
}
|
||||
multiStr := "-"
|
||||
if gpu.StablePowerLimitW > 0 {
|
||||
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||
}
|
||||
p95Str := "-"
|
||||
if gpu.MaxObservedPowerW > 0 {
|
||||
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||
}
|
||||
b.WriteString(`<tr` + rowStyle + `>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div>`)
|
||||
}
|
||||
|
||||
if len(runs) > 1 {
|
||||
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
statusColor := "var(--ok)"
|
||||
if run.result.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></details>`)
|
||||
}
|
||||
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// renderSpeed and renderEndurance are legacy wrappers; canonical page is 5. Benchmark at /benchmark.
|
||||
func renderSpeed(opts HandlerOptions) string { return renderBenchmark(opts) }
|
||||
func renderEndurance(opts HandlerOptions) string { return renderBenchmark(opts) }
|
||||
383
audit/internal/webui/page_burn.go
Normal file
383
audit/internal/webui/page_burn.go
Normal file
@@ -0,0 +1,383 @@
|
||||
package webui
|
||||
|
||||
// renderBurn returns the static HTML, CSS and JavaScript for the Burn page.
//
// The markup offers a burn profile preset, an NVIDIA GPU selection list with a
// sequential / parallel / ramp-up mode switch, and two cards of stress recipes.
// The embedded script loads GPUs from /api/gpu/nvidia, tool availability from
// /api/gpu/tools, enqueues tasks by POSTing to /api/sat/<target>/run and
// streams their output from /api/tasks/<id>/stream into the on-page terminal.
//
// Values received from the API (GPU names, error messages) are HTML-escaped
// with burnEsc before they are written through innerHTML.
func renderBurn() string {
	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn runs sustained GPU compute and CPU/memory stress recipes. DCGM targeted diagnostics (<code>targeted_stress</code>, <code>targeted_power</code>, <code>pulse_test</code>) and NCCL/NVBandwidth are on the <a href="/load">3. Load</a> page. For performance benchmarks, see <a href="/benchmark">5. Benchmark</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">Burn Profile</div>
  <div class="card-body burn-profile-body">
    <div class="burn-profile-col">
      <div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
      <label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
      <label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
      <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
    </div>
    <div class="burn-profile-col burn-profile-action">
      <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
      <p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
    </div>
    <div class="burn-profile-col burn-profile-action">
      <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
      <p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
    </div>
  </div>
  <div class="card-body" style="padding-top:0;display:flex;justify-content:center">
    <span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
  </div>
</div>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">NVIDIA GPU Selection</div>
  <div class="card-body">
    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
    <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
    </div>
    <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
    </div>
    <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
    <div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
      <label class="cb-row">
        <input type="radio" name="burn-nvidia-mode" value="sequential" checked>
        <span>Sequential — selected GPUs one at a time</span>
      </label>
      <label class="cb-row" id="burn-parallel-label">
        <input type="radio" name="burn-nvidia-mode" value="parallel">
        <span>Parallel — all selected GPUs simultaneously</span>
      </label>
      <label class="cb-row" id="burn-ramp-label">
        <input type="radio" name="burn-nvidia-mode" value="ramp-up">
        <span>Ramp-up — add one GPU at a time</span>
      </label>
    </div>
  </div>
</div>

<div class="burn-section">Core Burn Paths</div>
<div class="grid2 burn-grid" style="margin-bottom:16px">
  <div class="card burn-card">
    <div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
    <div class="card-body burn-card-body">
      <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
      <label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
      <label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
      <label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
      <label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
    </div>
  </div>

  <div class="card burn-card">
    <div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
    <div class="card-body burn-card-body">
      <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
      <label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
      <label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
      <label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
    </div>
  </div>
</div>

<div id="bi-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Output <span id="bi-title"></span></div>
  <div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
</div>

<style>
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
.burn-profile-col { min-width:0; }
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
.burn-grid { align-items:stretch; }
.burn-card { height:100%; display:flex; flex-direction:column; }
.burn-card-body { flex:1; display:flex; flex-direction:column; }
.card-head-actions { justify-content:space-between; }
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
</style>

<script>
let biES = null;
// burnEsc HTML-escapes a value before it is concatenated into innerHTML.
function burnEsc(v) {
  return String(v == null ? '' : v).replace(/[&<>"']/g, function(c) {
    return {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];
  });
}
// burnTaskIDs normalises an enqueue response to a list of task IDs.
function burnTaskIDs(payload) {
  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
  if (payload && payload.task_id) return [payload.task_id];
  return [];
}
function burnProfile() {
  const selected = document.querySelector('input[name="burn-profile"]:checked');
  return selected ? selected.value : 'smoke';
}
function burnSelectedGPUIndices() {
  return Array.from(document.querySelectorAll('.burn-gpu-checkbox'))
    .filter(function(el) { return el.checked && !el.disabled; })
    .map(function(el) { return parseInt(el.value, 10); })
    .filter(function(v) { return !Number.isNaN(v); })
    .sort(function(a, b) { return a - b; });
}
function burnNvidiaMode() {
  const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
  return el ? el.value : 'sequential';
}
// Disables the multi-GPU modes when fewer than two GPUs were detected and
// falls back to sequential if one of them was selected.
function burnApplyMultiGPUState(gpuCount) {
  var multiValues = ['parallel', 'ramp-up'];
  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
  radios.forEach(function(el) {
    var isMulti = multiValues.indexOf(el.value) >= 0;
    if (gpuCount < 2 && isMulti) {
      el.disabled = true;
      if (el.checked) {
        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
        if (seq) seq.checked = true;
      }
      var label = el.closest('label');
      if (label) label.style.opacity = '0.4';
    } else {
      el.disabled = false;
      var label = el.closest('label');
      if (label) label.style.opacity = '';
    }
  });
}
function burnUpdateSelectionNote() {
  const note = document.getElementById('burn-selection-note');
  const selected = burnSelectedGPUIndices();
  if (!selected.length) {
    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
    return;
  }
  note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
}
function burnRenderGPUList(gpus) {
  const root = document.getElementById('burn-gpu-list');
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    burnUpdateSelectionNote();
    return;
  }
  // API-supplied fields are escaped: they end up in innerHTML.
  root.innerHTML = gpus.map(function(gpu) {
    const mem = gpu.memory_mb > 0 ? ' · ' + burnEsc(gpu.memory_mb) + ' MiB' : '';
    return '<label class="burn-gpu-row">'
      + '<input class="burn-gpu-checkbox" type="checkbox" value="' + burnEsc(gpu.index) + '" checked onchange="burnUpdateSelectionNote()">'
      + '<span><strong>GPU ' + burnEsc(gpu.index) + '</strong> — ' + burnEsc(gpu.name) + mem + '</span>'
      + '</label>';
  }).join('');
  burnApplyMultiGPUState(gpus.length);
  burnUpdateSelectionNote();
}
function burnSelectAll() {
  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
  burnUpdateSelectionNote();
}
function burnSelectNone() {
  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
  burnUpdateSelectionNote();
}
function burnLoadGPUs() {
  fetch('/api/gpu/nvidia').then(function(r) {
    return r.json().then(function(body) {
      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
      return body;
    });
  }).then(function(gpus) {
    burnRenderGPUList(gpus);
  }).catch(function(err) {
    document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + burnEsc(err.message) + '</p>';
    burnUpdateSelectionNote();
  });
}
// POSTs one task to /api/sat/<target>/run. When useSelectedNvidia is set the
// selected GPU indices and the chosen multi-GPU mode are added to the body.
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
  const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
  if (useSelectedNvidia) {
    const selected = burnSelectedGPUIndices();
    if (!selected.length) {
      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
    }
    body.gpu_indices = selected;
    const bMode = burnNvidiaMode();
    if (bMode === 'ramp-up' && selected.length > 1) {
      body.stagger_gpu_start = true;
    } else if (bMode === 'parallel' && selected.length > 1) {
      body.parallel_gpus = true;
    }
  }
  return fetch('/api/sat/' + target + '/run', {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(body)
  }).then(function(r) {
    return r.json().then(function(payload) {
      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
      return payload;
    });
  });
}
function streamTask(taskId, label) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const term = document.getElementById('bi-terminal');
  term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
  biES = new EventSource('/api/tasks/' + taskId + '/stream');
  biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
  biES.addEventListener('done', function(e) {
    biES.close();
    biES = null;
    term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
    term.scrollTop = term.scrollHeight;
  });
}
function streamBurnTask(taskId, label, resetTerminal) {
  return streamBurnTaskSet([taskId], label, resetTerminal);
}
// Streams the given tasks one after another into the terminal. The returned
// promise always resolves, with {ok, error} summarising how many failed.
function streamBurnTaskSet(taskIds, label, resetTerminal) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const term = document.getElementById('bi-terminal');
  if (resetTerminal) {
    term.textContent = '';
  }
  if (!Array.isArray(taskIds) || !taskIds.length) {
    term.textContent += 'ERROR: no tasks queued.\n';
    return Promise.resolve({ok:false, error:'no tasks queued'});
  }
  const streamNext = function(idx, failures) {
    if (idx >= taskIds.length) {
      return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
    }
    const taskId = taskIds[idx];
    term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
    return new Promise(function(resolve) {
      biES = new EventSource('/api/tasks/' + taskId + '/stream');
      biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
      biES.addEventListener('done', function(e) {
        biES.close();
        biES = null;
        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
        term.scrollTop = term.scrollHeight;
        resolve(failures + (e.data ? 1 : 0));
      });
      biES.onerror = function() {
        if (biES) {
          biES.close();
          biES = null;
        }
        term.textContent += '\nERROR: stream disconnected.\n';
        term.scrollTop = term.scrollHeight;
        resolve(failures + 1);
      };
    }).then(function(nextFailures) {
      return streamNext(idx + 1, nextFailures);
    });
  };
  return streamNext(0, 0);
}
// Enqueues and streams every checked, enabled task in order. Errors are
// reported once, at the end of the chain, and the returned promise resolves.
function runBurnTaskSet(tasks, statusElId) {
  const enabled = tasks.filter(function(t) {
    const el = document.getElementById(t.id);
    return el && el.checked && !el.disabled;
  });
  const status = statusElId ? document.getElementById(statusElId) : null;
  if (status) status.textContent = '';
  if (!enabled.length) {
    if (status) status.textContent = 'No tasks selected.';
    return;
  }
  const term = document.getElementById('bi-terminal');
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
  term.textContent = '';
  const runNext = function(idx) {
    if (idx >= enabled.length) {
      if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
      return Promise.resolve();
    }
    const t = enabled[idx];
    term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
    if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
    return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
      .then(function(d) {
        return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
      })
      .then(function() {
        return runNext(idx + 1);
      });
  };
  // A single handler on the outer chain: a per-level handler that re-rejects
  // would print the same error once for every task already processed.
  return runNext(0).catch(function(err) {
    if (status) status.textContent = 'Error: ' + err.message;
    document.getElementById('bi-output').style.display = 'block';
    term.textContent += 'ERROR: ' + err.message + '\n';
  });
}
function runPlatformStress() {
  const comps = [];
  const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
  const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
  const hasChecked = function(ids) {
    return ids.some(function(id) {
      const el = document.getElementById(id);
      return el && el.checked && !el.disabled;
    });
  };
  if (hasChecked(computeIDs)) comps.push('cpu');
  if (hasChecked(gpuIDs)) comps.push('gpu');
  const status = document.getElementById('burn-all-status');
  if (!comps.length) {
    if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
    return;
  }
  enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', {platform_components: comps}, false).then(function(d) {
    const ids = burnTaskIDs(d);
    if (!ids.length) throw new Error('no tasks queued');
    streamTask(ids[0], 'Platform Thermal Cycling');
  }).catch(function(err) {
    // Surface enqueue failures instead of dropping them silently.
    if (status) status.textContent = 'Error: ' + err.message;
  });
}
function runAllBurnTasks() {
  const status = document.getElementById('burn-all-status');
  const all = [
    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
    {id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
    {id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
    {id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
    {id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
    {id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
  ];
  status.textContent = 'Enqueuing...';
  runBurnTaskSet(all, 'burn-all-status');
}
// Enable the GPU recipe checkboxes whose tool is reported available; for the
// others show the reason next to the (still disabled) checkbox.
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
  const map = {
    'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
    'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
    'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
    'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
  };
  tools.forEach(function(t) {
    const spec = map[t.id];
    if (!spec) return;
    const cb = document.getElementById(spec.cb);
    const note = document.getElementById(spec.note);
    if (!cb) return;
    if (t.available) {
      cb.disabled = false;
    } else if (note) {
      note.textContent = '— ' + spec.reason;
    }
  });
}).catch(function() {});
burnLoadGPUs();
</script>`
}
|
||||
644
audit/internal/webui/page_export_tools.go
Normal file
644
audit/internal/webui/page_export_tools.go
Normal file
@@ -0,0 +1,644 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func renderExport(exportDir string) string {
|
||||
entries, _ := listExportFiles(exportDir)
|
||||
var rows strings.Builder
|
||||
for _, e := range entries {
|
||||
rows.WriteString(fmt.Sprintf(`<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
|
||||
url.QueryEscape(e), html.EscapeString(e)))
|
||||
}
|
||||
if len(entries) == 0 {
|
||||
rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
|
||||
}
|
||||
return `<div class="grid2">
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||
</div></div>
|
||||
</div>
|
||||
|
||||
` + renderUSBExportCard()
|
||||
}
|
||||
|
||||
// listExportFiles returns the paths of all non-directory entries found under
// exportDir, relative to exportDir and sorted lexically.
//
// Leading and trailing whitespace in exportDir is ignored. Not-exist errors
// (for example a missing export directory) are tolerated and yield whatever
// was collected up to that point; any other walk error is returned together
// with a nil list.
func listExportFiles(exportDir string) ([]string, error) {
	// Trim once and use the same root for both the walk and filepath.Rel.
	// Walking the trimmed path while computing Rel against the raw argument
	// made Rel fail whenever exportDir carried surrounding whitespace.
	root := strings.TrimSpace(exportDir)
	var entries []string
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		rel, err := filepath.Rel(root, path)
		if err != nil {
			return err
		}
		entries = append(entries, rel)
		return nil
	})
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}
	sort.Strings(entries)
	return entries, nil
}
|
||||
|
||||
func renderSupportBundleInline() string {
|
||||
return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
|
||||
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
|
||||
<script>
|
||||
window.supportBundleDownload = function() {
|
||||
var btn = document.getElementById('support-bundle-btn');
|
||||
var status = document.getElementById('support-bundle-status');
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Building...';
|
||||
status.textContent = 'Collecting logs and export data\u2026';
|
||||
status.style.color = 'var(--muted)';
|
||||
var filename = 'bee-support.tar.gz';
|
||||
fetch('/export/support.tar.gz')
|
||||
.then(function(r) {
|
||||
if (!r.ok) throw new Error('HTTP ' + r.status);
|
||||
var cd = r.headers.get('Content-Disposition') || '';
|
||||
var m = cd.match(/filename="?([^";]+)"?/);
|
||||
if (m) filename = m[1];
|
||||
return r.blob();
|
||||
})
|
||||
.then(function(blob) {
|
||||
var url = URL.createObjectURL(blob);
|
||||
var a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = filename;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
status.textContent = 'Download started.';
|
||||
status.style.color = 'var(--ok-fg)';
|
||||
})
|
||||
.catch(function(e) {
|
||||
status.textContent = 'Error: ' + e.message;
|
||||
status.style.color = 'var(--crit-fg)';
|
||||
})
|
||||
.finally(function() {
|
||||
btn.disabled = false;
|
||||
btn.textContent = '\u2195 Download Support Bundle';
|
||||
});
|
||||
};
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderUSBExportCard() string {
|
||||
return `<div class="card" style="margin-top:16px">
|
||||
<div class="card-head">USB Black-Box
|
||||
<button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
</div>
|
||||
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||
</div>`
|
||||
}
|
||||
|
||||
// renderUSBExportInline returns the USB black-box panel: a description,
// status placeholders and the script that drives them.
//
// The script loads /api/export/usb and /api/blackbox/status, renders one table
// row per USB target with an Enable or Disable button, and POSTs to
// /api/blackbox/enable or /api/blackbox/disable when a button is clicked. It
// exposes blackboxRefresh, blackboxEnable and blackboxDisable on window.
//
// Every device-supplied field (device path, filesystem, size, label, model,
// status, last error) is HTML-escaped before it is written through innerHTML.
func renderUSBExportInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
<div id="usb-targets" style="margin-top:12px"></div>
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
<script>
(function(){
  // esc HTML-escapes a value before it is concatenated into innerHTML.
  function esc(v) {
    return String(v == null ? '' : v).replace(/[&<>"']/g, function(c) {
      return {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];
    });
  }
  function blackboxRefresh() {
    document.getElementById('usb-status').textContent = 'Scanning...';
    document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
    document.getElementById('usb-targets').innerHTML = '';
    document.getElementById('usb-msg').textContent = '';
    Promise.all([
      fetch('/api/export/usb').then(r=>r.json()),
      fetch('/api/blackbox/status').then(r=>r.json())
    ]).then(function(values) {
      const targets = Array.isArray(values[0]) ? values[0] : [];
      const state = values[1] || {};
      const active = Array.isArray(state.targets) ? state.targets : [];
      // Kept on window so the inline onclick handlers can look rows up by index.
      window._usbTargets = targets;
      window._blackboxTargets = active;
      const st = document.getElementById('usb-status');
      const ct = document.getElementById('usb-targets');
      const summary = document.getElementById('blackbox-summary');
      if (state.boot_folder) {
        summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
      } else {
        summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
      }
      if (!targets || targets.length === 0) {
        st.textContent = 'No removable USB devices found.';
      } else {
        st.textContent = targets.length + ' device(s) found:';
      }
      const byDevice = {};
      active.forEach(function(item) { byDevice[item.device] = item; });
      ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
        targets.map((t, idx) => {
          const dev = t.device || '';
          const label = t.label || '';
          const model = t.model || '';
          const enrolled = byDevice[dev];
          const status = enrolled ? (enrolled.status + (enrolled.flush_period ? ', flush ' + enrolled.flush_period : '')) : 'not enrolled';
          const detail = enrolled && enrolled.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+esc(enrolled.last_error)+'</div>') : '';
          return '<tr>' +
            '<td style="font-family:monospace">'+esc(dev)+'</td>' +
            '<td>'+esc(t.fs_type)+'</td>' +
            '<td>'+esc(t.size)+'</td>' +
            '<td>'+esc(label)+'</td>' +
            '<td style="font-size:12px;color:var(--muted)">'+esc(model)+'</td>' +
            '<td style="font-size:12px">'+esc(status)+detail+'</td>' +
            '<td style="white-space:nowrap">' +
            (enrolled
              ? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
              : '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
            '<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
            '</td></tr>';
        }).join('') + '</table>';
    }).catch(e => {
      document.getElementById('usb-status').textContent = 'Error: ' + e;
    });
  }
  window.blackboxEnable = function(targetIndex, btn) {
    const target = (window._usbTargets || [])[targetIndex];
    if (!target) {
      const msg = document.getElementById('usb-msg');
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: USB target not found. Refresh and try again.';
      return;
    }
    const msg = document.getElementById('usb-msg');
    const row = btn ? btn.closest('td') : null;
    const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
    const originalText = btn ? btn.textContent : '';
    if (btn) {
      btn.disabled = true;
      btn.textContent = 'Enabling...';
    }
    if (rowMsg) {
      rowMsg.style.color = 'var(--muted)';
      rowMsg.textContent = 'Working...';
    }
    msg.style.color = 'var(--muted)';
    msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
    fetch('/api/blackbox/enable', {
      method: 'POST',
      headers: {'Content-Type':'application/json'},
      body: JSON.stringify(target)
    }).then(async r => {
      const d = await r.json();
      if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
      return d;
    }).then(d => {
      msg.style.color = 'var(--ok,green)';
      msg.textContent = d.message || 'Done.';
      if (rowMsg) {
        rowMsg.style.color = 'var(--ok,green)';
        rowMsg.textContent = d.message || 'Done.';
      }
    }).catch(e => {
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: '+e;
      if (rowMsg) {
        rowMsg.style.color = 'var(--err,red)';
        rowMsg.textContent = 'Error: ' + e;
      }
    }).finally(() => {
      if (btn) {
        btn.disabled = false;
        btn.textContent = originalText;
      }
      setTimeout(blackboxRefresh, 300);
    });
  };
  window.blackboxDisable = function(targetIndex, btn) {
    const target = (window._usbTargets || [])[targetIndex];
    const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
    if (!target || !active) {
      const msg = document.getElementById('usb-msg');
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: black-box target not found. Refresh and try again.';
      return;
    }
    const msg = document.getElementById('usb-msg');
    const row = btn ? btn.closest('td') : null;
    const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
    const originalText = btn ? btn.textContent : '';
    if (btn) {
      btn.disabled = true;
      btn.textContent = 'Disabling...';
    }
    if (rowMsg) {
      rowMsg.style.color = 'var(--muted)';
      rowMsg.textContent = 'Working...';
    }
    msg.style.color = 'var(--muted)';
    msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
    fetch('/api/blackbox/disable', {
      method:'POST',
      headers:{'Content-Type':'application/json'},
      body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
    }).then(async r => {
      const d = await r.json();
      if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
      return d;
    }).then(d => {
      msg.style.color = 'var(--ok,green)';
      msg.textContent = d.message || 'Done.';
      if (rowMsg) {
        rowMsg.style.color = 'var(--ok,green)';
        rowMsg.textContent = d.message || 'Done.';
      }
    }).catch(e => {
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: '+e;
      if (rowMsg) {
        rowMsg.style.color = 'var(--err,red)';
        rowMsg.textContent = 'Error: ' + e;
      }
    }).finally(() => {
      if (btn) {
        btn.disabled = false;
        btn.textContent = originalText;
      }
      setTimeout(blackboxRefresh, 300);
    });
  };
  window.blackboxRefresh = blackboxRefresh;
  blackboxRefresh();
})();
</script>`
}
|
||||
|
||||
// renderNvidiaSelfHealInline returns the HTML + JavaScript fragment for the
// NVIDIA self-heal panel. The fragment contains a "Restart GPU Drivers"
// button, a refresh button, a status line, a per-GPU table with a
// "Reset GPU" action, and a terminal-style output area.
//
// The embedded script talks to three endpoints: POST /api/services/action,
// POST /api/gpu/nvidia-reset and GET /api/gpu/nvidia-status. Every value that
// comes back from the status endpoint is HTML-escaped before it is placed
// into innerHTML, and the GPU index is coerced to a number before it is
// spliced into the inline onclick handler.
func renderNvidiaSelfHealInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
</div>
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
</div>
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
// Escape a value for interpolation into HTML text or a double-quoted attribute.
function nvidiaSelfHealEsc(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
}
// Show the output panel with a label, a done/failed marker and the raw output text.
function nvidiaSelfHealShowResult(label, status, output) {
  var out = document.getElementById('nvidia-self-heal-out');
  var term = document.getElementById('nvidia-self-heal-terminal');
  var statusEl = document.getElementById('nvidia-self-heal-out-status');
  var labelEl = document.getElementById('nvidia-self-heal-out-label');
  out.style.display = 'block';
  labelEl.textContent = label;
  term.textContent = output || '(no output)';
  term.scrollTop = term.scrollHeight;
  if (status === 'ok') {
    statusEl.textContent = '✓ done';
    statusEl.style.color = 'var(--ok-fg, #2c662d)';
  } else {
    statusEl.textContent = '✗ failed';
    statusEl.style.color = 'var(--crit-fg, #9f3a38)';
  }
}
function nvidiaRestartDrivers() {
  var btn = document.getElementById('nvidia-restart-btn');
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Restarting...';
  nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
  fetch('/api/services/action', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({name:'bee-nvidia', action:'restart'})
  }).then(r=>r.json()).then(d => {
    nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(function() {
      // loadServices belongs to the hosting page; skip it when this fragment
      // is embedded somewhere that does not define it so the GPU refresh still runs.
      if (typeof loadServices === 'function') loadServices();
      loadNvidiaSelfHeal();
    }, 800);
  }).catch(e => {
    nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
function nvidiaResetGPU(index, btn) {
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Resetting...';
  nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
  fetch('/api/gpu/nvidia-reset', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({index:index})
  }).then(r=>r.json()).then(d => {
    // Surface d.error as well, matching the restart handler above.
    nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(loadNvidiaSelfHeal, 1000);
  }).catch(e => {
    nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
function loadNvidiaSelfHeal() {
  var status = document.getElementById('nvidia-self-heal-status');
  var table = document.getElementById('nvidia-self-heal-table');
  status.textContent = 'Loading NVIDIA GPU status...';
  status.style.color = 'var(--muted)';
  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
  fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
    if (!Array.isArray(gpus) || gpus.length === 0) {
      status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
      table.innerHTML = '';
      return;
    }
    status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
    const rows = gpus.map(g => {
      const serial = g.serial || '';
      const bdf = g.bdf || '';
      const id = serial || bdf || ('gpu-' + g.index);
      const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
      const details = [];
      if (serial) details.push('serial ' + serial);
      if (bdf) details.push('bdf ' + bdf);
      if (g.parse_failure && g.raw_line) details.push(g.raw_line);
      // Only a numeric value may reach the inline onclick handler.
      const idx = Number(g.index);
      return '<tr>'
        + '<td style="white-space:nowrap">' + nvidiaSelfHealEsc(g.index) + '</td>'
        + '<td>' + nvidiaSelfHealEsc(g.name || 'unknown') + '</td>'
        + '<td style="font-family:monospace">' + nvidiaSelfHealEsc(id) + '</td>'
        + '<td><span class="badge ' + badge + '">' + nvidiaSelfHealEsc(g.status || 'UNKNOWN') + '</span>'
        + (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.map(nvidiaSelfHealEsc).join(' | ') + '</div>' : '')
        + '</td>'
        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + idx + ', this)">Reset GPU</button></td>'
        + '</tr>';
    }).join('');
    table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
  }).catch(e => {
    status.textContent = 'Error loading NVIDIA GPU status: ' + e;
    status.style.color = 'var(--crit-fg, #9f3a38)';
    table.innerHTML = '';
  });
}
loadNvidiaSelfHeal();
</script>`
}
|
||||
|
||||
func renderTools() string {
|
||||
return renderNVMeFormatCard() + `
|
||||
|
||||
` + renderFRUEditorCard() + `
|
||||
|
||||
` + renderRAIDMgmtCard()
|
||||
}
|
||||
|
||||
// renderFRUEditorCard returns the "FRU / Elabel" card: markup, styles and the
// script that reads hardware identity fields from every configured source
// (IPMI FRU, Huawei iBMC, SAA DMI), renders them in one table and lets the
// user edit writable fields inline.
//
// Script behaviour:
//   - fruAllRead fetches every source in parallel, reports per-source
//     availability and builds the table. All API-provided values are escaped
//     by esc() before they are placed into innerHTML or attributes.
//   - fruUniSave posts a single change to the source's write URL, then polls
//     /api/tasks until the returned task is done, failed or cancelled. The
//     Save/Cancel buttons are re-enabled in every terminal state so the same
//     field can be edited and saved again.
func renderFRUEditorCard() string {
	return `<div class="card"><div class="card-head card-head-actions">FRU / Elabel<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="fruAllRead()">Read All</button></div></div><div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits hardware identity fields from all available sources. Each field shows its source method.</p>
<div id="fru-all-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
<div id="fru-src-status" style="display:none;margin-bottom:10px"></div>
<div id="fru-all-table"></div>
</div></div>
<style>
.fru-chip{display:inline-block;font-size:10px;font-weight:600;letter-spacing:.02em;padding:1px 6px;border-radius:3px;vertical-align:middle;white-space:nowrap;margin-right:8px;flex-shrink:0}
.fru-chip-ipmi{background:#e8e8e8;color:#555}
.fru-chip-huawei{background:#fff0e6;color:#b83}
.fru-chip-saa{background:#e6f0ff;color:#557}
.fru-inp-wrap{display:flex;align-items:center;gap:0}
</style>
<script>
(function(){
var _actBtn='width:22px;height:22px;padding:0;font-size:13px;line-height:1;border:1px solid var(--line);border-radius:3px;background:var(--surface);cursor:pointer;vertical-align:middle;';
var _inp='width:100%;padding:3px 6px;border:1.5px solid #888;border-radius:3px;font-size:13px;font-family:monospace;background:var(--surface);color:var(--ink);';

// One entry per backend: where to read, where to write, and how a field maps
// to the data-* attributes that writeBody later turns into the request body.
var SOURCES = [
  {
    id: 'ipmi-fru',
    label: 'IPMI FRU',
    chipClass: 'fru-chip-ipmi',
    url: '/api/tools/ipmi-fru',
    writeUrl: '/api/tools/ipmi-fru/write',
    rowAttrs: function(f) {
      return 'data-source="ipmi-fru" data-area="'+esc(f.area||'')+'" data-index="'+esc(f.index||0)+'" data-name="'+esc(f.name)+'"';
    },
    writeBody: function(inp) {
      return JSON.stringify({changes:[{area:inp.dataset.area,index:parseInt(inp.dataset.index,10),name:inp.dataset.name,value:inp.value}]});
    },
    fieldName: function(f) { return f.name; },
    fieldValue: function(f) { return f.value||''; },
    readOnly: function(f) { return false; },
  },
  {
    id: 'huawei',
    label: 'Huawei iBMC',
    chipClass: 'fru-chip-huawei',
    url: '/api/tools/huawei-elabel',
    writeUrl: '/api/tools/huawei-elabel/write',
    rowAttrs: function(f) {
      return 'data-source="huawei" data-key="'+esc(f.key)+'"';
    },
    writeBody: function(inp) {
      return JSON.stringify({changes:[{key:inp.dataset.key,value:inp.value}]});
    },
    fieldName: function(f) { return f.name; },
    fieldValue: function(f) { return f.value||''; },
    readOnly: function(f) { return !!f.read_only; },
  },
  {
    id: 'saa-dmi',
    label: 'SAA DMI',
    chipClass: 'fru-chip-saa',
    url: '/api/tools/saa-dmi',
    writeUrl: '/api/tools/saa-dmi/write',
    rowAttrs: function(f) {
      return 'data-source="saa-dmi" data-shn="'+esc(f.shn)+'"';
    },
    writeBody: function(inp) {
      return JSON.stringify({changes:[{shn:inp.dataset.shn,value:inp.value}]});
    },
    fieldName: function(f) { return f.name; },
    fieldValue: function(f) { return f.value||''; },
    readOnly: function(f) { return false; },
  },
];

// Escape a value for HTML text or a double-quoted attribute.
function esc(s){return String(s==null?'':s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');}

// Render one line per source: field count, license hint, or failure reason.
function renderSrcStatus(perSource) {
  var bar = document.getElementById('fru-src-status');
  if (!perSource.length) { bar.style.display = 'none'; bar.innerHTML = ''; return; }
  var html = '';
  perSource.forEach(function(p) {
    var state, color;
    if (p.ok) {
      state = p.count + ' field(s) available';
      color = 'var(--ok-fg,green)';
    } else if (/not activated|product key|SFT-DCMS|SFT-OOB/i.test(p.reason)) {
      state = 'requires Supermicro license (SFT-OOB-LIC / SFT-DCMS-SINGLE) — activate on BMC';
      color = 'var(--crit-fg,#9f3a38)';
    } else {
      state = p.reason || 'unavailable';
      color = 'var(--muted)';
    }
    html += '<div style="display:flex;align-items:center;gap:8px;font-size:12px;margin:3px 0">'
      + '<span class="fru-chip '+p.src.chipClass+'">'+p.src.label+'</span>'
      + '<span style="color:'+color+'">'+esc(state)+'</span>'
      + '</div>';
  });
  bar.innerHTML = html;
  bar.style.display = '';
}

window.fruAllRead = function() {
  var status = document.getElementById('fru-all-status');
  var table = document.getElementById('fru-all-table');
  status.textContent = 'Reading…'; status.style.color = 'var(--muted)';
  table.innerHTML = '';

  var fetches = SOURCES.map(function(src) {
    return fetch(src.url, {cache:'no-store'})
      .then(function(r){ return r.json().then(function(d){ if(!r.ok) throw new Error(d.error||r.statusText); return d; }); });
  });

  // allSettled: one failing source must not hide the fields of the others.
  Promise.allSettled(fetches).then(function(results) {
    var rows = '';
    var totalFields = 0;
    var perSource = [];

    results.forEach(function(res, i) {
      var src = SOURCES[i];
      if (res.status === 'rejected' || !Array.isArray(res.value) || res.value.length === 0) {
        var reason = '';
        if (res.status === 'rejected' && res.reason) reason = res.reason.message;
        else reason = 'no editable fields returned';
        perSource.push({src:src, ok:false, count:0, reason:reason});
        return;
      }
      perSource.push({src:src, ok:true, count:res.value.length, reason:''});
      res.value.forEach(function(f) {
        var val = esc(src.fieldValue(f));
        var ro = src.readOnly(f);
        var attrs = ro ? '' : (' '+src.rowAttrs(f));
        rows += '<tr>'
          + '<td style="white-space:nowrap;padding-right:4px;vertical-align:middle">'
          + '<span class="fru-chip '+src.chipClass+'">'+src.label+'</span>'
          + '</td>'
          + '<td style="color:var(--muted);white-space:nowrap;padding-right:16px;vertical-align:middle;font-size:13px">'+esc(src.fieldName(f))+'</td>'
          + '<td style="vertical-align:middle">'
          + (ro
            ? '<span style="font-family:monospace;font-size:13px;color:var(--muted)">'+val+'</span>'
            : '<input class="fru-uni-inp" style="'+_inp+'" value="'+val+'" data-original="'+val+'"'+attrs+' oninput="fruUniChanged(this)">')
          + '</td>'
          + '<td class="fru-uni-act" style="display:none;white-space:nowrap;padding-left:6px;vertical-align:middle">'
          + '<button style="'+_actBtn+'color:var(--ok-fg,green);margin-right:3px" title="Save" onclick="fruUniSave(this)">✓</button>'
          + '<button style="'+_actBtn+'color:var(--crit-fg,#9f3a38)" title="Cancel" onclick="fruUniCancel(this)">✗</button>'
          + '<span class="fru-uni-msg" style="font-size:11px;margin-left:5px;color:var(--muted)"></span>'
          + '</td>'
          + '</tr>';
        totalFields++;
      });
    });

    renderSrcStatus(perSource);

    if (totalFields === 0) {
      status.textContent = 'No editable fields available — see per-source status below.';
      status.style.color = 'var(--crit-fg,#9f3a38)';
      table.innerHTML = '';
      return;
    }

    table.innerHTML = '<table style="width:100%;border-collapse:collapse">'+rows+'</table>';
    status.textContent = totalFields + ' field(s) loaded';
    status.style.color = 'var(--muted)';
  });
};

// Show the Save/Cancel cell only while the input differs from the stored value.
window.fruUniChanged = function(inp) {
  var row = inp.closest('tr');
  row.querySelector('.fru-uni-act').style.display = inp.value !== inp.dataset.original ? '' : 'none';
  row.querySelector('.fru-uni-msg').textContent = '';
};

window.fruUniCancel = function(btn) {
  var row = btn.closest('tr');
  var inp = row.querySelector('.fru-uni-inp');
  inp.value = inp.dataset.original;
  row.querySelector('.fru-uni-act').style.display = 'none';
  row.querySelector('.fru-uni-msg').textContent = '';
};

window.fruUniSave = function(btn) {
  var row = btn.closest('tr');
  var inp = row.querySelector('.fru-uni-inp');
  var msg = row.querySelector('.fru-uni-msg');
  var cancelBtn = row.querySelectorAll('.fru-uni-act button')[1];
  var src = SOURCES.find(function(s){ return s.id === inp.dataset.source; });
  if (!src) { msg.textContent = 'Unknown source'; msg.style.color='var(--crit-fg)'; return; }

  btn.disabled = true; cancelBtn.disabled = true;
  msg.textContent = '…'; msg.style.color = 'var(--muted)';

  fetch(src.writeUrl, {method:'POST', headers:{'Content-Type':'application/json'}, body:src.writeBody(inp)})
    .then(function(r){ return r.json().then(function(d){ if(!r.ok) throw new Error(d.error||r.statusText); return d; }); })
    .then(function(d) {
      // Without a task id the poll below could never match and would spin forever.
      if (!d || !d.task_id) throw new Error('missing task id');
      var poll = setInterval(function() {
        fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(tasks){
          var t = Array.isArray(tasks) ? tasks.find(function(x){return x.id===d.task_id;}) : null;
          if (!t) return;
          if (t.status==='done') {
            clearInterval(poll);
            inp.dataset.original = inp.value;
            row.querySelector('.fru-uni-act').style.display = 'none';
            msg.textContent = ''; msg.style.color = '';
            // Re-enable so a later edit of the same field can be saved again.
            btn.disabled = false; cancelBtn.disabled = false;
          } else if (t.status==='failed'||t.status==='cancelled') {
            clearInterval(poll);
            msg.textContent = t.error||t.status; msg.style.color = 'var(--crit-fg)';
            btn.disabled = false; cancelBtn.disabled = false;
          }
        }).catch(function(){ /* transient poll failure: try again on the next tick */ });
      }, 1500);
    })
    .catch(function(e) {
      msg.textContent = 'Error: '+e.message; msg.style.color = 'var(--crit-fg)';
      btn.disabled = false; cancelBtn.disabled = false;
    });
};
})();
</script>`
}
|
||||
|
||||
func renderExportIndex(exportDir string) (string, error) {
|
||||
entries, err := listExportFiles(exportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var body strings.Builder
|
||||
body.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
|
||||
body.WriteString(`<h1>Bee Export Files</h1><ul>`)
|
||||
for _, entry := range entries {
|
||||
body.WriteString(`<li><a href="/export/file?path=` + url.QueryEscape(entry) + `">` + html.EscapeString(entry) + `</a></li>`)
|
||||
}
|
||||
if len(entries) == 0 {
|
||||
body.WriteString(`<li>No export files found.</li>`)
|
||||
}
|
||||
body.WriteString(`</ul></body></html>`)
|
||||
return body.String(), nil
|
||||
}
|
||||
314
audit/internal/webui/page_install_tasks.go
Normal file
314
audit/internal/webui/page_install_tasks.go
Normal file
@@ -0,0 +1,314 @@
|
||||
package webui
|
||||
|
||||
// renderInstallInline returns the HTML + JavaScript fragment for the
// "install live system to disk" workflow: a warning banner, a selectable
// disk table, a type-the-device-name confirmation step and a progress
// terminal fed by a server-sent event stream.
//
// Script behaviour:
//   - installRefreshDisks loads GET /api/install/disks and rebuilds the
//     table. Disk fields are escaped with installEsc before reaching
//     innerHTML, and load failures are shown in the (re-revealed) loading
//     line.
//   - installSelectDisk / installCheckConfirm gate the start button until the
//     typed text equals the selected device path.
//   - installStart posts the device to /api/install/run and, given a task
//     id, installStreamLog follows /api/tasks/<id>/stream until the "done"
//     event; an empty payload on that event is treated as success.
func renderInstallInline() string {
	return `
<div class="alert alert-warn" style="margin-bottom:16px">
<strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
disk and write the live system onto it. All existing data on the target disk will be lost.
This operation cannot be undone.
</div>
<div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
<div id="install-disk-section" style="display:none">
<div class="card" style="margin-bottom:0">
<table id="install-disk-table">
<thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
<tbody id="install-disk-tbody"></tbody>
</table>
</div>
<div style="margin-top:12px">
<button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
</div>
</div>
<div id="install-confirm-section" style="display:none;margin-top:20px">
<div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
<div class="form-row" style="max-width:360px">
<label>Type the device name to confirm (e.g. /dev/sda)</label>
<input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
</div>
<button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
<button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
</div>
<div id="install-progress-section" style="display:none;margin-top:20px">
<div class="card-head" style="margin-bottom:8px">Installation Progress</div>
<div id="install-terminal" class="terminal" style="max-height:500px"></div>
<div id="install-status" style="margin-top:12px;font-size:13px"></div>
</div>

<style>
#install-disk-tbody tr{cursor:pointer}
#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
</style>

<script>
var _installSelected = null;

// Escape a value for HTML text or a double-quoted attribute.
function installEsc(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
}

function installRefreshDisks() {
  var loading = document.getElementById('install-loading');
  // Reset any failure text left over from a previous attempt.
  loading.textContent = 'Loading disk list…';
  loading.style.display = '';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-confirm-section').style.display = 'none';
  _installSelected = null;
  fetch('/api/install/disks').then(function(r){
    return r.json().then(function(j){
      if (!r.ok) throw new Error((j && j.error) || r.statusText);
      return j;
    });
  }).then(function(disks){
    loading.style.display = 'none';
    var tbody = document.getElementById('install-disk-tbody');
    tbody.innerHTML = '';
    if (!Array.isArray(disks) || disks.length === 0) {
      tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
    } else {
      disks.forEach(function(d) {
        var warnings = (d.warnings || []);
        var statusHtml;
        if (warnings.length === 0) {
          statusHtml = '<span class="badge badge-ok">OK</span>';
        } else {
          var hasSmall = warnings.some(function(w){ return w.indexOf('too small') >= 0; });
          statusHtml = warnings.map(function(w){
            var cls = hasSmall ? 'badge-err' : 'badge-warn';
            return '<span class="badge ' + cls + '" title="' + installEsc(w) + '">' +
              installEsc(w.length > 40 ? w.substring(0,38)+'…' : w) + '</span>';
          }).join(' ');
        }
        var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
          ? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
        var tr = document.createElement('tr');
        tr.dataset.device = d.device;
        tr.dataset.model = d.model || 'Unknown';
        tr.dataset.size = d.size;
        tr.dataset.warnings = JSON.stringify(warnings);
        tr.innerHTML =
          '<td><input type="radio" name="install-disk" value="' + installEsc(d.device) + '"></td>' +
          '<td><code>' + installEsc(d.device) + '</code>' + mountedNote + '</td>' +
          '<td>' + installEsc(d.model || '—') + '</td>' +
          '<td>' + installEsc(d.size) + '</td>' +
          '<td>' + statusHtml + '</td>';
        tr.addEventListener('click', function(){ installSelectDisk(this); });
        tbody.appendChild(tr);
      });
    }
    document.getElementById('install-disk-section').style.display = '';
  }).catch(function(e){
    // The loading line may already be hidden if rendering threw; show it again
    // so the failure is actually visible.
    loading.textContent = 'Failed to load disk list: ' + e;
    loading.style.display = '';
  });
}

function installSelectDisk(tr) {
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  tr.classList.add('selected');
  var radio = tr.querySelector('input[type=radio]');
  if (radio) radio.checked = true;
  _installSelected = {
    device: tr.dataset.device,
    model: tr.dataset.model,
    size: tr.dataset.size,
    warnings: JSON.parse(tr.dataset.warnings || '[]')
  };
  var warnBox = document.getElementById('install-confirm-warn');
  var warnLines = '<strong>⚠ DANGER:</strong> ' + installEsc(_installSelected.device) +
    ' (' + installEsc(_installSelected.model) + ', ' + installEsc(_installSelected.size) + ')' +
    ' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
  if (_installSelected.warnings.length > 0) {
    warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + installEsc(w); }).join('<br>');
  }
  warnBox.innerHTML = warnLines;
  document.getElementById('install-confirm-input').value = '';
  document.getElementById('install-start-btn').disabled = true;
  document.getElementById('install-confirm-section').style.display = '';
  document.getElementById('install-progress-section').style.display = 'none';
}

function installDeselect() {
  _installSelected = null;
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
  document.getElementById('install-confirm-section').style.display = 'none';
}

// Enable the start button only when the typed text equals the selected device path.
function installCheckConfirm() {
  var val = document.getElementById('install-confirm-input').value.trim();
  var ok = _installSelected && val === _installSelected.device;
  document.getElementById('install-start-btn').disabled = !ok;
}

function installStart() {
  if (!_installSelected) return;
  document.getElementById('install-confirm-section').style.display = 'none';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-loading').style.display = 'none';
  var prog = document.getElementById('install-progress-section');
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  prog.style.display = '';
  term.textContent = '';
  status.textContent = 'Starting installation…';
  status.style.color = 'var(--muted)';

  fetch('/api/install/run', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({device: _installSelected.device})
  }).then(function(r){
    return r.json().then(function(j){
      if (!r.ok) throw new Error(j.error || r.statusText);
      return j;
    });
  }).then(function(j){
    if (!j.task_id) throw new Error('missing task id');
    installStreamLog(j.task_id);
  }).catch(function(e){
    status.textContent = 'Error: ' + e;
    status.style.color = 'var(--crit-fg)';
  });
}

function installStreamLog(taskId) {
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  var es = new EventSource('/api/tasks/' + encodeURIComponent(taskId) + '/stream');
  es.onmessage = function(e) {
    term.textContent += e.data + '\n';
    term.scrollTop = term.scrollHeight;
  };
  es.addEventListener('done', function(e) {
    es.close();
    // An empty payload on the done event means success; otherwise it carries the error text.
    if (!e.data) {
      status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
      var rebootBtn = document.createElement('button');
      rebootBtn.className = 'btn btn-primary btn-sm';
      rebootBtn.style.marginLeft = '12px';
      rebootBtn.textContent = 'Reboot now';
      rebootBtn.onclick = function(){
        fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
          body: JSON.stringify({name:'', action:'reboot'})});
      };
      status.appendChild(rebootBtn);
    } else {
      status.textContent = '✗ Installation failed: ' + e.data;
      status.style.color = 'var(--crit-fg)';
    }
  });
  es.onerror = function() {
    es.close();
    status.textContent = '✗ Stream disconnected.';
    status.style.color = 'var(--crit-fg)';
  };
}

installRefreshDisks();
</script>
`
}
|
||||
|
||||
func renderInstall() string {
|
||||
return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
|
||||
renderInstallInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
func renderTasks() string {
|
||||
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
||||
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
||||
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
|
||||
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
|
||||
</div>
|
||||
<script>
|
||||
var _taskRefreshTimer = null;
|
||||
var _tasksAll = [];
|
||||
var _taskPage = 1;
|
||||
var _taskPageSize = 50;
|
||||
|
||||
function loadTasks() {
|
||||
fetch('/api/tasks').then(r=>r.json()).then(tasks => {
|
||||
_tasksAll = Array.isArray(tasks) ? tasks : [];
|
||||
if (_tasksAll.length === 0) {
|
||||
_taskPage = 1;
|
||||
document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
|
||||
return;
|
||||
}
|
||||
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||
if (_taskPage > totalPages) _taskPage = totalPages;
|
||||
if (_taskPage < 1) _taskPage = 1;
|
||||
const start = (_taskPage - 1) * _taskPageSize;
|
||||
const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
|
||||
const rows = pageTasks.map(t => {
|
||||
const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
|
||||
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
|
||||
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
|
||||
let actions = '<a class="btn btn-sm btn-secondary" href="/tasks/'+encodeURIComponent(t.id)+'">Open</a>';
|
||||
if (t.status === 'running' || t.status === 'pending') {
|
||||
actions += ' <button class="btn btn-sm btn-danger" onclick="cancelTask(\''+t.id+'\')">Cancel</button>';
|
||||
}
|
||||
if (t.status === 'pending') {
|
||||
actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',1)" title="Increase priority">⇧</button>';
|
||||
actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',-1)" title="Decrease priority">⇩</button>';
|
||||
}
|
||||
return '<tr><td><a href="/tasks/'+encodeURIComponent(t.id)+'">'+escHtml(t.name)+'</a></td>' +
|
||||
'<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
|
||||
'<td style="font-size:12px;color:var(--muted)">'+fmtTime(t.created_at)+'</td>' +
|
||||
'<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
|
||||
'<td>'+t.priority+'</td>' +
|
||||
'<td>'+actions+'</td></tr>';
|
||||
}).join('');
|
||||
const showingFrom = start + 1;
|
||||
const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
|
||||
const pager =
|
||||
'<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
|
||||
'<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
|
||||
'<div style="display:flex;align-items:center;gap:8px">' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
|
||||
'<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
|
||||
'</div>' +
|
||||
'</div>';
|
||||
document.getElementById('tasks-table').innerHTML =
|
||||
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
|
||||
});
|
||||
}
|
||||
|
||||
function escHtml(s) { return (s||'').replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"'); }
|
||||
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
|
||||
function formatDurSec(sec) {
|
||||
sec = Math.max(0, Math.round(sec||0));
|
||||
if (sec < 60) return sec+'s';
|
||||
const m = Math.floor(sec/60), ss = sec%60;
|
||||
return m+'m '+ss+'s';
|
||||
}
|
||||
function setTaskPage(page) {
|
||||
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||
_taskPage = Math.min(totalPages, Math.max(1, page));
|
||||
loadTasks();
|
||||
}
|
||||
|
||||
function cancelTask(id) {
|
||||
fetch('/api/tasks/'+id+'/cancel',{method:'POST'}).then(()=>loadTasks());
|
||||
}
|
||||
function cancelAll() {
|
||||
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
||||
}
|
||||
function killWorkers() {
|
||||
if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
|
||||
fetch('/api/tasks/kill-workers',{method:'POST'})
|
||||
.then(r=>r.json())
|
||||
.then(d=>{
|
||||
loadTasks();
|
||||
var toast = document.getElementById('kill-toast');
|
||||
var parts = [];
|
||||
if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
|
||||
if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
|
||||
toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
|
||||
toast.style.display = '';
|
||||
setTimeout(()=>{ toast.style.display='none'; }, 5000);
|
||||
});
|
||||
}
|
||||
function setPriority(id, delta) {
|
||||
fetch('/api/tasks/'+id+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
|
||||
.then(()=>loadTasks());
|
||||
}
|
||||
|
||||
loadTasks();
|
||||
_taskRefreshTimer = setInterval(loadTasks, 2000);
|
||||
</script>`
|
||||
}
|
||||
238
audit/internal/webui/page_metrics.go
Normal file
238
audit/internal/webui/page_metrics.go
Normal file
@@ -0,0 +1,238 @@
|
||||
package webui
|
||||
|
||||
// renderMetrics returns the HTML and inline JavaScript for the live metrics
// page: server charts (load, CPU/ambient temperature, power, fan RPM) followed
// by a GPU section that is revealed once /api/metrics/latest reports GPUs.
//
// Every <img data-chart-refresh="1"> is re-fetched every 3 s with a
// cache-busting query string; the layout (fan card, GPU section, per-GPU
// overview cards) is re-synchronised every 5 s. GPU names come from
// /api/gpu/nvidia; a failed lookup is not cached, so it is retried on the next
// layout sync. All API-provided text is HTML-escaped before it is placed into
// markup.
func renderMetrics() string {
	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">Server — Load</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
  </div>
</div>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">Temperature — CPU</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
  </div>
</div>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">Temperature — Ambient Sensors</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
  </div>
</div>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">Server — Power</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
  </div>
</div>

<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
  <div class="card-head">Server — Fan RPM</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
  </div>
</div>

<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
  <div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
    <div>
      <div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
      <div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
    </div>
    <label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
      <input id="gpu-chart-toggle" type="checkbox">
      <span>One chart per GPU</span>
    </label>
  </div>

  <div id="gpu-metrics-by-metric">
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Compute Load</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
      </div>
    </div>
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Memory Load</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
      </div>
    </div>
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Core Clock</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
      </div>
    </div>
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Power</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
      </div>
    </div>
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Temperature</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
      </div>
    </div>
  </div>

  <div id="gpu-metrics-by-gpu" style="display:none"></div>
</section>

<script>
let gpuChartKey = '';
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
let metricsNvidiaGPUsPromise = null;

// Escapes text for safe use in HTML element content and double-quoted attributes.
function metricsEscHtml(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
}

// Resolves to the NVIDIA GPU list (always an array). A successful lookup is
// cached for the lifetime of the page; a failed one is not, so it is retried.
function loadMetricsNvidiaGPUs() {
  if (!metricsNvidiaGPUsPromise) {
    metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
      .then(function(r) {
        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
        return r.json();
      })
      .then(function(list) { return Array.isArray(list) ? list : []; })
      .catch(function() {
        // Drop the cached promise so the next layout sync retries the lookup.
        metricsNvidiaGPUsPromise = null;
        return [];
      });
  }
  return metricsNvidiaGPUsPromise;
}

// Builds an index -> name map from the GPU list, skipping malformed entries.
function metricsGPUNameMap(list) {
  const out = {};
  (list || []).forEach(function(gpu) {
    const idx = Number(gpu.index);
    if (!Number.isFinite(idx) || !gpu.name) return;
    out[idx] = gpu.name;
  });
  return out;
}

function metricsGPUDisplayLabel(idx, names) {
  const name = names && names[idx];
  return name ? ('GPU ' + idx + ' — ' + name) : ('GPU ' + idx);
}

// sessionStorage may be unavailable; fall back to the per-metric layout.
function loadGPUChartModePreference() {
  try {
    return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
  } catch (_) {
    return false;
  }
}

function saveGPUChartModePreference(perGPU) {
  try {
    sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
  } catch (_) {}
}

// Reloads one chart through an off-screen probe image so the visible <img>
// only switches source after the new SVG has loaded. Hidden charts and charts
// with a reload already in flight are skipped.
function refreshChartImage(el) {
  if (!el || el.dataset.loading === '1') return;
  if (el.offsetParent === null) return;
  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
  const nextSrc = baseSrc + '?t=' + Date.now();
  const probe = new Image();
  el.dataset.baseSrc = baseSrc;
  el.dataset.loading = '1';
  probe.onload = function() {
    el.src = nextSrc;
    el.dataset.loading = '0';
  };
  probe.onerror = function() {
    el.dataset.loading = '0';
  };
  probe.src = nextSrc;
}

function refreshCharts() {
  document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
}

// Returns the sorted, de-duplicated numeric GPU indices found in rows.
function gpuIndices(rows) {
  const seen = {};
  const out = [];
  (rows || []).forEach(function(row) {
    const idx = Number(row.index);
    if (!Number.isFinite(idx) || seen[idx]) return;
    seen[idx] = true;
    out.push(idx);
  });
  return out.sort(function(a, b) { return a - b; });
}

function renderGPUOverviewCards(indices, names) {
  const host = document.getElementById('gpu-metrics-by-gpu');
  if (!host) return;
  host.innerHTML = indices.map(function(idx) {
    // The label contains the API-provided GPU name, so escape it; idx is a finite number.
    const label = metricsEscHtml(metricsGPUDisplayLabel(idx, names));
    return '<div class="card" style="margin-bottom:16px">' +
      '<div class="card-head">' + label + ' — Overview</div>' +
      '<div class="card-body" style="padding:8px">' +
      '<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
      '</div></div>';
  }).join('');
}

function applyGPUChartMode() {
  const perMetric = document.getElementById('gpu-metrics-by-metric');
  const perGPU = document.getElementById('gpu-metrics-by-gpu');
  const toggle = document.getElementById('gpu-chart-toggle');
  const gpuModePerGPU = !!(toggle && toggle.checked);
  if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
  if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
}

function syncMetricsLayout(d) {
  d = d || {};
  const fanCard = document.getElementById('card-server-fans');
  if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
  const section = document.getElementById('gpu-metrics-section');
  const summary = document.getElementById('gpu-metrics-summary');
  const indices = gpuIndices(d.gpus);
  loadMetricsNvidiaGPUs().then(function(gpus) {
    const names = metricsGPUNameMap(gpus);
    if (section) section.style.display = indices.length > 0 ? '' : 'none';
    if (summary) {
      summary.textContent = indices.length > 0
        ? ('Detected GPUs: ' + indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); }).join(', '))
        : 'No GPUs detected in live metrics.';
    }
    // Rebuild the per-GPU cards only when the set of GPUs or their names changed.
    const nextKey = indices.join(',') + '|' + indices.map(function(idx) { return names[idx] || ''; }).join(',');
    if (nextKey !== gpuChartKey) {
      renderGPUOverviewCards(indices, names);
      gpuChartKey = nextKey;
    }
    applyGPUChartMode();
  });
}

function loadMetricsLayout() {
  fetch('/api/metrics/latest').then(function(r) { return r.json(); }).then(syncMetricsLayout).catch(function() {});
}

const gpuChartToggle = document.getElementById('gpu-chart-toggle');
if (gpuChartToggle) {
  gpuChartToggle.checked = loadGPUChartModePreference();
}
applyGPUChartMode();

if (gpuChartToggle) {
  gpuChartToggle.addEventListener('change', function() {
    saveGPUChartModePreference(!!gpuChartToggle.checked);
    applyGPUChartMode();
    refreshCharts();
  });
}

loadMetricsLayout();
setInterval(refreshCharts, 3000);
setInterval(loadMetricsLayout, 5000);
</script>`
}
|
||||
213
audit/internal/webui/page_network_services.go
Normal file
213
audit/internal/webui/page_network_services.go
Normal file
@@ -0,0 +1,213 @@
|
||||
package webui
|
||||
|
||||
import "html"
|
||||
|
||||
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
//
// The markup contains the pending-change banner, the interface table and the
// DHCP / static IPv4 forms, followed by the script that drives them. The
// script polls /api/network every 5 s and shows a rollback countdown after any
// change (toggle, DHCP, static) until the change is confirmed or rolled back.
//
// Interface names, states, addresses and the default route come from the API
// and are HTML-escaped before being inserted. Click handlers read their
// arguments from data-* attributes instead of splicing values into inline
// JavaScript.
func renderNetworkInline() string {
	return `<div id="net-pending" style="display:none" class="alert alert-warn">
  <strong>⚠ Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
  <button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
  <button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
</div>
<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div class="grid2" style="margin-top:16px">
  <div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
    <div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
    <button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
    <div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
  </div>
  <div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
    <div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
    <div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
    <div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
    <div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
    <div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
    <button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
    <div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
  </div>
</div>
<script>
var _netCountdownTimer = null;
var _netRefreshTimer = null;
const NET_ROLLBACK_SECS = 60;

// Escapes text for safe use in HTML element content and quoted attributes.
function netEscHtml(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}

// Fetches the interface list and re-renders the table and the pending banner.
function loadNetwork() {
  fetch('/api/network').then(r=>r.json()).then(d => {
    const rows = (d.interfaces||[]).map(function(i) {
      const name = netEscHtml(i.Name);
      const state = netEscHtml(i.State);
      const addrs = netEscHtml((i.IPv4||[]).join(', '));
      // Handler arguments travel in data-* attributes so the values are never parsed as JavaScript.
      return '<tr><td style="cursor:pointer" data-iface="'+name+'" onclick="selectIface(this.dataset.iface)" title="Use this interface in the forms below"><span style="text-decoration:underline">'+name+'</span></td>' +
        '<td style="cursor:pointer" data-iface="'+name+'" data-state="'+state+'" onclick="toggleIface(this.dataset.iface,this.dataset.state)" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+state+'</span></td>' +
        '<td>'+addrs+'</td></tr>';
    }).join('');
    document.getElementById('iface-table').innerHTML =
      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
      (d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+netEscHtml(d.default_route)+'</p>' : '');
    if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    else hideNetPending();
  }).catch(function() {});
}

// Copies the clicked interface name into both forms.
function selectIface(iface) {
  document.getElementById('dhcp-iface').value = iface;
  document.getElementById('st-iface').value = iface;
}

// currentState is accepted for the click handler's benefit; the request only sends the interface name.
function toggleIface(iface, currentState) {
  // Show the banner before the request: the change itself may interrupt the connection.
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
    .then(r=>r.json()).then(d => {
      if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
      loadNetwork();
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    }).catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}

function hideNetPending() {
  const el = document.getElementById('net-pending');
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  _netCountdownTimer = null;
  if (el) el.style.display = 'none';
}

// Shows the banner and (re)starts the countdown; a non-positive value hides it.
function showNetPending(secs) {
  if (!secs || secs < 1) { hideNetPending(); return; }
  const el = document.getElementById('net-pending');
  const counter = document.getElementById('net-countdown');
  if (!el || !counter) return;
  el.style.display = 'block';
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  let remaining = secs;
  counter.textContent = remaining;
  _netCountdownTimer = setInterval(function() {
    remaining--;
    counter.textContent = remaining;
    if (remaining <= 0) { hideNetPending(); loadNetwork(); }
  }, 1000);
}

function confirmNetChange() {
  hideNetPending();
  fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
}

function rollbackNetChange() {
  hideNetPending();
  fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
}

function runDHCP() {
  const iface = document.getElementById('dhcp-iface').value.trim();
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
    .then(r=>r.json()).then(d => {
      document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
      if (d.error) { hideNetPending(); return; }
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
      loadNetwork();
    }).catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}

function setStatic() {
  const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
    interface: document.getElementById('st-iface').value,
    address: document.getElementById('st-addr').value,
    prefix: document.getElementById('st-prefix').value,
    gateway: document.getElementById('st-gw').value,
    dns: dns,
  })}).then(r=>r.json()).then(d => {
    document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
    if (d.error) { hideNetPending(); return; }
    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    loadNetwork();
  }).catch(function() {
    setTimeout(loadNetwork, 1500);
  });
}

loadNetwork();
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
_netRefreshTimer = setInterval(loadNetwork, 5000);
</script>`
}
|
||||
|
||||
func renderNetwork() string {
|
||||
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
|
||||
renderNetworkInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
// renderServicesInline returns the services UI without a wrapping card: a
// note about bee-selfheal, a refresh button, the unit table and an output
// panel for start/stop/restart actions, followed by the script that drives
// them.
//
// The script loads /api/services once on page load and again 800 ms after
// every action. Unit names, states and status bodies come from the API and are
// HTML-escaped before insertion. Action buttons read the unit name from a
// data-svc attribute instead of an inline JavaScript string.
func renderServicesInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="svc-out" style="display:none;margin-top:12px">
  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
    <span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
    <span id="svc-out-status" style="font-size:12px"></span>
  </div>
  <div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
// Escapes text for safe use in HTML element content and quoted attributes.
function svcEscHtml(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}

// Fetches the unit list and rebuilds the table; shows an error row on failure.
function loadServices() {
  fetch('/api/services').then(r=>r.json()).then(svcs => {
    const rows = (Array.isArray(svcs) ? svcs : []).map(s => {
      const rawName = String(s.name||'');
      const name = svcEscHtml(rawName);
      const st = s.state||'unknown';
      const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
      // The element id is reduced to [a-z0-9-], so it is safe inside the inline handler below.
      const id = 'svc-body-'+rawName.replace(/[^a-z0-9]/g,'-');
      const body = svcEscHtml(s.body);
      return '<tr>' +
        '<td style="white-space:nowrap">'+name+'</td>' +
        '<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+svcEscHtml(st)+' ▾</span>' +
        '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
        '</td>' +
        '<td style="white-space:nowrap">' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-start" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'start\')">Start</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-stop" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'stop\')">Stop</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-restart" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'restart\')">Restart</button>' +
        '</td></tr>';
    }).join('');
    document.getElementById('svc-table').innerHTML =
      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
  }).catch(function() {
    document.getElementById('svc-table').innerHTML =
      '<p style="color:var(--crit-fg, #9f3a38);font-size:13px">Failed to load services.</p>';
  });
}

// Expands or collapses the status body of one unit.
function toggleBody(id) {
  const el = document.getElementById(id);
  if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
}

// Runs start/stop/restart for a unit and streams the result into the output panel.
function svcAction(btn, name, action) {
  var label = btn.textContent;
  btn.disabled = true;
  btn.textContent = '...';
  var out = document.getElementById('svc-out');
  var term = document.getElementById('svc-terminal');
  var statusEl = document.getElementById('svc-out-status');
  var labelEl = document.getElementById('svc-out-label');
  out.style.display = 'block';
  labelEl.textContent = action + ' ' + name;
  term.textContent = 'Running...';
  statusEl.textContent = '';
  statusEl.style.color = '';
  fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
    .then(r=>r.json()).then(d => {
      term.textContent = d.output || d.error || '(no output)';
      term.scrollTop = term.scrollHeight;
      if (d.status === 'ok') {
        statusEl.textContent = '✓ done';
        statusEl.style.color = 'var(--ok-fg, #2c662d)';
      } else {
        statusEl.textContent = '✗ failed';
        statusEl.style.color = 'var(--crit-fg, #9f3a38)';
      }
      btn.textContent = label;
      btn.disabled = false;
      // Give the unit a moment to change state before refreshing the table.
      setTimeout(loadServices, 800);
    }).catch(e => {
      term.textContent = 'Request failed: ' + e;
      statusEl.textContent = '✗ error';
      statusEl.style.color = 'var(--crit-fg, #9f3a38)';
      btn.textContent = label;
      btn.disabled = false;
    });
}

loadServices();
</script>`
}
|
||||
|
||||
func renderServices() string {
|
||||
return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
|
||||
renderServicesInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
115
audit/internal/webui/page_settings.go
Normal file
115
audit/internal/webui/page_settings.go
Normal file
@@ -0,0 +1,115 @@
|
||||
package webui
|
||||
|
||||
import "html"
|
||||
|
||||
func renderSettings(opts HandlerOptions) string {
|
||||
version := opts.BuildLabel
|
||||
if version == "" {
|
||||
version = "dev"
|
||||
}
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let kind = d.kind || 'unknown';
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let label = kind==='ram'?'RAM':kind==='usb'?'USB ('+source+')':kind==='cdrom'?'CD-ROM ('+source+')':kind==='disk'?'disk ('+source+')':source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.blocked_reason || d.message || 'Checking...';
|
||||
txt.style.color = (d.status==='ok'||d.in_ram)?'var(--ok,green)':d.status==='failed'?'var(--err,#b91c1c)':'var(--muted)';
|
||||
if (d.can_start_task) { btn.style.display=''; btn.disabled=false; } else { btn.style.display='none'; }
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK?'badge-ok':'badge-err')+'">'+(t.OK?'✓ '+t.Path:'✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML = '<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
});
|
||||
}
|
||||
checkTools();
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Build Info</div>
|
||||
<div class="card-body">
|
||||
<table style="width:auto">
|
||||
<tbody>
|
||||
<tr><td style="color:var(--muted);padding-right:24px">Version</td><td>` + html.EscapeString(version) + `</td></tr>
|
||||
<tr><td style="color:var(--muted);padding-right:24px">Title</td><td>` + html.EscapeString(opts.Title) + `</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Power</div>
|
||||
<div class="card-body">
|
||||
<div style="display:flex;gap:8px;align-items:center">
|
||||
<button class="btn btn-secondary btn-sm" onclick="systemPower('reboot')">Reboot</button>
|
||||
<button class="btn btn-secondary btn-sm" onclick="systemPower('shutdown')">Shutdown</button>
|
||||
<span id="power-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
function systemPower(action) {
|
||||
var label = action === 'reboot' ? 'reboot' : 'shut down';
|
||||
if (!confirm('Are you sure you want to ' + label + ' the server?')) return;
|
||||
var el = document.getElementById('power-status');
|
||||
if (el) el.textContent = action === 'reboot' ? 'Rebooting...' : 'Shutting down...';
|
||||
fetch('/api/system/' + action, {method: 'POST'})
|
||||
.then(function(r) { return r.json(); })
|
||||
.catch(function(e) { if (el) el.textContent = 'Error: ' + e.message; });
|
||||
}
|
||||
</script>
|
||||
|
||||
`
|
||||
}
|
||||
935
audit/internal/webui/page_validate.go
Normal file
935
audit/internal/webui/page_validate.go
Normal file
@@ -0,0 +1,935 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz).
const (
	pciVendorNvidia = 0x10de // NVIDIA
	pciVendorAMD    = 0x1002 // AMD
	pciVendorAspeed = 0x1a03 // ASPEED
)
|
||||
|
||||
// validateInventory holds the per-category device summaries shown on the
// validate pages, plus the GPU counts used for run-time estimates.
type validateInventory struct {
	// One-line summaries per device category, as produced by
	// loadValidateInventory.
	CPU, Memory, Storage string
	NVIDIA, AMD          string

	// Number of present GPUs per vendor found in the audit snapshot.
	NvidiaGPUCount, AMDGPUCount int
}
|
||||
|
||||
// validateFmtDur formats an estimated duration for display.
//
// Values below 120 seconds are shown as "~N s". Larger values are shown as
// "~N min", where N is (secs+29)/60 using integer division: the result is the
// nearest minute, with an exact half minute rounding down.
func validateFmtDur(secs int) string {
	const minuteThreshold = 120
	if secs >= minuteThreshold {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||
|
||||
func validateTotalValidateSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUValidateSec +
|
||||
platform.SATEstimatedMemoryValidateSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
if n > 0 {
|
||||
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func validateTotalStressSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUStressSec +
|
||||
platform.SATEstimatedMemoryStressSec +
|
||||
platform.SATEstimatedNvidiaPulseTestSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
if n > 0 {
|
||||
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func renderValidate(opts HandlerOptions) string {
|
||||
return renderValidateMode(opts, false)
|
||||
}
|
||||
|
||||
func renderValidateStress(opts HandlerOptions) string {
|
||||
return renderValidateMode(opts, true)
|
||||
}
|
||||
|
||||
func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||
gpuNote := ""
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
estStr := validateTotalStr
|
||||
if stressDefault {
|
||||
estStr = stressTotalStr
|
||||
}
|
||||
alert := `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>`
|
||||
if stressDefault {
|
||||
alert = `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Stress mode:</strong> Runs extended load tests — CPU stress-ng, memory passes, DCGM targeted diagnostics. Higher wear than Validate.</div>`
|
||||
}
|
||||
|
||||
stressOnlyCards := ""
|
||||
if stressDefault {
|
||||
stressOnlyCards = renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`,
|
||||
))
|
||||
}
|
||||
|
||||
satStressModeJS := "function satStressMode() { return false; }"
|
||||
if stressDefault {
|
||||
satStressModeJS = "function satStressMode() { return true; }"
|
||||
}
|
||||
|
||||
return alert + `
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Run All</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">est. ` + estStr + gpuNote + `</span>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Run All.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
</div>
|
||||
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||
)) +
|
||||
stressOnlyCards +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||
)) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
` + satStressModeJS + `
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
if (!satNvidiaGPUsPromise) {
|
||||
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||
.then(r => {
|
||||
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||
return r.json();
|
||||
})
|
||||
.then(list => Array.isArray(list) ? list : []);
|
||||
}
|
||||
return satNvidiaGPUsPromise;
|
||||
}
|
||||
function satSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||
return;
|
||||
}
|
||||
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
|
||||
}
|
||||
function satRenderGPUList(gpus) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (!root) return;
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
satUpdateGPUSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="sat-gpu-row">'
|
||||
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectAllGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectNoGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satLoadGPUs() {
|
||||
loadSatNvidiaGPUs().then(function(gpus) {
|
||||
satRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (root) {
|
||||
root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
}
|
||||
satUpdateGPUSelectionNote();
|
||||
});
|
||||
}
|
||||
function satGPUDisplayName(gpu) {
|
||||
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
||||
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
||||
return 'GPU ' + idx + ' — ' + name;
|
||||
}
|
||||
function satRequestBody(target, overrides) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
body.stress_mode = satStressMode();
|
||||
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||
if (overrides) {
|
||||
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||
}
|
||||
return body;
|
||||
}
|
||||
function enqueueSATTarget(target, overrides) {
|
||||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||
.then(r => r.json());
|
||||
}
|
||||
function streamSATTask(taskId, title, resetTerminal) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
if (resetTerminal) {
|
||||
term.textContent = '';
|
||||
}
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||
return new Promise(function(resolve) {
|
||||
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
satES.addEventListener('done', function(e) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
satES.onerror = function() {
|
||||
if (satES) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
}
|
||||
function selectedAMDValidateTargets() {
|
||||
const targets = [];
|
||||
const gpu = document.getElementById('sat-amd-target');
|
||||
const mem = document.getElementById('sat-amd-mem-target');
|
||||
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||
return targets;
|
||||
}
|
||||
function runSAT(target) {
|
||||
return runSATWithOverrides(target, null);
|
||||
}
|
||||
function runSATWithOverrides(target, overrides) {
|
||||
const title = (overrides && overrides.display_name) || target;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||
return enqueueSATTarget(target, overrides)
|
||||
.then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
const nvidiaPerGPUTargets = [];
|
||||
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
function satAllGPUIndicesForMulti() {
|
||||
return Promise.resolve(satSelectedGPUIndices());
|
||||
}
|
||||
function expandSATTarget(target) {
|
||||
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||
});
|
||||
}
|
||||
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||
return Promise.resolve([{target: target}]);
|
||||
}
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||
}
|
||||
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||
target: target,
|
||||
overrides: {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||
},
|
||||
label: satGPUDisplayName(gpu),
|
||||
})));
|
||||
}
|
||||
function runNvidiaFabricValidate(target) {
|
||||
satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||
});
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
if (!targets.length) return;
|
||||
if (targets.length === 1) return runSAT(targets[0]);
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— amd';
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Running AMD validate set one by one...\n';
|
||||
const labels = satLabels();
|
||||
const runNext = (idx) => {
|
||||
if (idx >= targets.length) return Promise.resolve();
|
||||
const target = targets[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||
return enqueueSATTarget(target)
|
||||
.then(d => {
|
||||
return streamSATTask(d.task_id, labels[target], false);
|
||||
}).then(function() {
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}
|
||||
function runAllSAT() {
|
||||
const cycles = 1;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
return !(btn && btn.disabled);
|
||||
});
|
||||
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||||
const expanded = [];
|
||||
for (let cycle = 0; cycle < cycles; cycle++) {
|
||||
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||||
}
|
||||
const total = expanded.length;
|
||||
let enqueued = 0;
|
||||
if (!total) {
|
||||
status.textContent = 'No tasks selected.';
|
||||
return;
|
||||
}
|
||||
const runNext = (idx) => {
|
||||
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||
const item = expanded[idx];
|
||||
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||
return enqueueSATTarget(item.target, item.overrides)
|
||||
.then(() => {
|
||||
enqueued++;
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}).catch(err => {
|
||||
status.textContent = 'Error: ' + err.message;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
<script>
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||
});
|
||||
satLoadGPUs();
|
||||
function disableSATAMDOptions(reason) {
|
||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||
const cb = document.getElementById(id);
|
||||
if (!cb) return;
|
||||
cb.disabled = true;
|
||||
cb.checked = false;
|
||||
cb.title = reason;
|
||||
});
|
||||
}
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true;
|
||||
btn.title = reason;
|
||||
btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||
const body = card.querySelector('.card-body');
|
||||
if (body) body.insertBefore(note, body.firstChild);
|
||||
}
|
||||
note.textContent = reason;
|
||||
}
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||||
unknown := "Audit snapshot not loaded."
|
||||
out := validateInventory{
|
||||
CPU: unknown,
|
||||
Memory: unknown,
|
||||
Storage: unknown,
|
||||
NVIDIA: unknown,
|
||||
AMD: unknown,
|
||||
}
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(data, &snap); err != nil {
|
||||
return out
|
||||
}
|
||||
|
||||
cpuCounts := map[string]int{}
|
||||
cpuTotal := 0
|
||||
for _, cpu := range snap.Hardware.CPUs {
|
||||
if cpu.Present != nil && !*cpu.Present {
|
||||
continue
|
||||
}
|
||||
cpuTotal++
|
||||
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
memCounts := map[string]int{}
|
||||
memTotal := 0
|
||||
for _, dimm := range snap.Hardware.Memory {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
memTotal++
|
||||
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
storageCounts := map[string]int{}
|
||||
storageTotal := 0
|
||||
for _, dev := range snap.Hardware.Storage {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
storageTotal++
|
||||
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
nvidiaCounts := map[string]int{}
|
||||
nvidiaTotal := 0
|
||||
amdCounts := map[string]int{}
|
||||
amdTotal := 0
|
||||
for _, dev := range snap.Hardware.PCIeDevices {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
if validateIsVendorGPU(dev, "nvidia") {
|
||||
nvidiaTotal++
|
||||
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
if validateIsVendorGPU(dev, "amd") {
|
||||
amdTotal++
|
||||
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
}
|
||||
|
||||
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
|
||||
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
|
||||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||
out.NvidiaGPUCount = nvidiaTotal
|
||||
out.AMDGPUCount = amdTotal
|
||||
return out
|
||||
}
|
||||
|
||||
// renderValidateCardBody renders the four stacked sections of a validate
// card: devices, description, commands and settings, in that order.
//
// The devices and settings sections use the muted text colour. All four
// arguments are inserted verbatim, so callers must pass HTML-safe strings.
func renderValidateCardBody(devices, description, commands, settings string) string {
	sections := []struct {
		muted   bool
		content string
	}{
		{true, devices},
		{false, description},
		{false, commands},
		{true, settings},
	}

	var b strings.Builder
	for _, sec := range sections {
		b.WriteString(`<div class="validate-card-section"><div style="font-size:13px`)
		if sec.muted {
			b.WriteString(`;color:var(--muted)`)
		}
		b.WriteString(`">`)
		b.WriteString(sec.content)
		b.WriteString(`</div></div>`)
	}
	return b.String()
}
|
||||
|
||||
// formatValidateDeviceSummary renders a one-line device summary.
//
// total is the number of devices, models maps a model name to its count, and
// unit is the singular noun ("CPU", "GPU", ...). Model names are HTML-escaped
// and listed in sorted order. The result is:
//
//	"0 <unit>s detected."                 when total is zero
//	"<count> x <model> <unit(s)>"         when there is exactly one model
//	"<total> <unit(s)>: <count> x <model>, ..." otherwise
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	if total == 0 {
		return "0 " + unit + "s detected."
	}

	// Map iteration order is random; sort the names for stable output.
	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	entries := make([]string, len(names))
	for i, name := range names {
		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
	}

	noun := unit
	if total != 1 {
		noun = unit + "s"
	}

	if len(entries) != 1 {
		return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
	}
	return entries[0] + " " + noun
}
|
||||
|
||||
// addValidateModel increments the counter for name in counts. The name is
// trimmed of surrounding whitespace first; an empty name is counted under
// "unknown". counts must be non-nil.
func addValidateModel(counts map[string]int, name string) {
	key := strings.TrimSpace(name)
	if key == "" {
		key = "unknown"
	}
	counts[key] = counts[key] + 1
}
|
||||
|
||||
// validateTrimPtr returns the string value points to with surrounding
// whitespace removed, or the empty string when value is nil.
func validateTrimPtr(value *string) string {
	var s string
	if value != nil {
		s = *value
	}
	return strings.TrimSpace(s)
}
|
||||
|
||||
// validateFirstNonEmpty returns the first argument that is non-empty after
// trimming surrounding whitespace, in its trimmed form. It returns the empty
// string when every argument is blank or no arguments are given.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
|
||||
|
||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == pciVendorAspeed {
|
||||
return false
|
||||
}
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller"
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorNvidia
|
||||
case "amd":
|
||||
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorAMD
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// renderCheck renders the non-destructive Check page (step 2).
|
||||
// Shows validate-mode tests only: CPU, Memory, Storage, NVIDIA L2, NCCL, NVBandwidth, AMD.
|
||||
// Stress-mode tests (targeted-stress, targeted-power, pulse) are on the Load page.
|
||||
func renderCheck(opts HandlerOptions) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
gpuNote := ""
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to <a href="/burn">4. Burn</a>.</div>
|
||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllCheckSAT()">Run All Checks</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">est. ` + validateTotalStr + gpuNote + `</span>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` (stress-ng 60 s).`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` (256 MB × 1 pass).`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds (NVMe: instant device query; SATA/SAS: short self-test).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
</div>
|
||||
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA check tasks.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks (DCGM Level 2).`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec)+` (Level 2, all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs AMD GPU inventory, MEM integrity, and MEM bandwidth checks.`,
|
||||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||
)) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satLabels() {
|
||||
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
if (!satNvidiaGPUsPromise) {
|
||||
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia').then(r => {
|
||||
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||
return r.json();
|
||||
}).then(list => Array.isArray(list) ? list : []);
|
||||
}
|
||||
return satNvidiaGPUsPromise;
|
||||
}
|
||||
function satSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||
.filter(el => el.checked && !el.disabled)
|
||||
.map(el => parseInt(el.value, 10))
|
||||
.filter(v => !Number.isNaN(v))
|
||||
.sort((a, b) => a - b);
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
const sel = satSelectedGPUIndices();
|
||||
note.textContent = sel.length
|
||||
? 'Selected GPUs: ' + sel.join(', ') + '. Multi-GPU tests will use all selected GPUs.'
|
||||
: 'Select at least one NVIDIA GPU to enable NVIDIA check tasks.';
|
||||
}
|
||||
function satRenderGPUList(gpus) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (!root) return;
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
satUpdateGPUSelectionNote(); return;
|
||||
}
|
||||
root.innerHTML = gpus.map(gpu => {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="sat-gpu-row"><input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()"><span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span></label>';
|
||||
}).join('');
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectAllGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = true; }); satUpdateGPUSelectionNote(); }
|
||||
function satSelectNoGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = false; }); satUpdateGPUSelectionNote(); }
|
||||
function satGPULoadInit() {
|
||||
loadSatNvidiaGPUs().then(satRenderGPUList).catch(err => {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (root) root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
satUpdateGPUSelectionNote();
|
||||
});
|
||||
}
|
||||
function satRequestBody(target, overrides) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Check ' + target);
|
||||
body.stress_mode = false;
|
||||
if (target === 'cpu') body.duration = 60;
|
||||
if (overrides) Object.keys(overrides).forEach(k => { body[k] = overrides[k]; });
|
||||
return body;
|
||||
}
|
||||
function enqueueSATTarget(target, overrides) {
|
||||
return fetch('/api/sat/' + target + '/run', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify(satRequestBody(target, overrides))}).then(r => r.json());
|
||||
}
|
||||
function streamSATTask(taskId, title, resetTerminal) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display = 'block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
if (resetTerminal) term.textContent = '';
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||
return new Promise(resolve => {
|
||||
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
satES.onmessage = e => { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
satES.addEventListener('done', e => {
|
||||
satES.close(); satES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
satES.onerror = () => {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
}
|
||||
function selectedAMDValidateTargets() {
|
||||
const targets = [];
|
||||
const gpu = document.getElementById('sat-amd-target');
|
||||
const mem = document.getElementById('sat-amd-mem-target');
|
||||
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||
return targets;
|
||||
}
|
||||
function runSAT(target) { return runSATWithOverrides(target, null); }
|
||||
function runSATWithOverrides(target, overrides) {
|
||||
const title = (overrides && overrides.display_name) || target;
|
||||
document.getElementById('sat-output').style.display = 'block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||
return enqueueSATTarget(target, overrides).then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
function runNvidiaFabricValidate(target) {
|
||||
const indices = satSelectedGPUIndices();
|
||||
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
const sel = satSelectedGPUIndices();
|
||||
if (!sel.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||
return runSATWithOverrides(target, {gpu_indices: sel, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
if (!targets.length) return;
|
||||
if (targets.length === 1) return runSAT(targets[0]);
|
||||
const term = document.getElementById('sat-terminal');
|
||||
document.getElementById('sat-output').style.display = 'block';
|
||||
document.getElementById('sat-title').textContent = '— amd';
|
||||
term.textContent = 'Running AMD check set...\n';
|
||||
const labels = satLabels();
|
||||
const runNext = idx => {
|
||||
if (idx >= targets.length) return Promise.resolve();
|
||||
const t = targets[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[t] + '\n';
|
||||
return enqueueSATTarget(t).then(d => streamSATTask(d.task_id, labels[t], false)).then(() => runNext(idx + 1));
|
||||
};
|
||||
return runNext(0);
|
||||
}
|
||||
function runAllCheckSAT() {
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const nvidiaIndices = satSelectedGPUIndices();
|
||||
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const baseTargets = ['cpu', 'memory', 'storage'];
|
||||
const amdTargets = selectedAMDValidateTargets();
|
||||
const expanded = [];
|
||||
baseTargets.forEach(t => expanded.push({target: t}));
|
||||
if (nvidiaIndices.length) {
|
||||
nvidiaAllTargets.forEach(t => {
|
||||
const btn = document.getElementById('sat-btn-' + t);
|
||||
if (!(btn && btn.disabled)) expanded.push({target: t, overrides: {gpu_indices: nvidiaIndices, display_name: satLabels()[t] || t}});
|
||||
});
|
||||
}
|
||||
amdTargets.forEach(t => expanded.push({target: t}));
|
||||
if (!expanded.length) { status.textContent = 'No tasks selected.'; return; }
|
||||
const total = expanded.length;
|
||||
const runNext = idx => {
|
||||
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||
const item = expanded[idx];
|
||||
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||
return enqueueSATTarget(item.target, item.overrides).then(() => runNext(idx + 1));
|
||||
};
|
||||
runNext(0).catch(err => { status.textContent = 'Error: ' + err.message; });
|
||||
}
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true; btn.title = reason; btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||
const body = card.querySelector('.card-body');
|
||||
if (body) body.insertBefore(note, body.firstChild);
|
||||
}
|
||||
note.textContent = reason;
|
||||
}
|
||||
}
|
||||
fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
|
||||
if (!gp.nvidia) ['nvidia','nvidia-interconnect','nvidia-bandwidth'].forEach(t => disableSATCard(t, 'No NVIDIA GPU detected'));
|
||||
if (!gp.amd) {
|
||||
disableSATCard('amd', 'No AMD GPU detected');
|
||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(id => {
|
||||
const cb = document.getElementById(id);
|
||||
if (cb) { cb.disabled = true; cb.checked = false; }
|
||||
});
|
||||
}
|
||||
});
|
||||
satGPULoadInit();
|
||||
</script>`
|
||||
}
|
||||
|
||||
// renderSATCard builds the HTML for one test card.
//
// id is appended to "sat-btn-" to form the Run button's element id, label is
// the card heading, runAction is placed in the button's onclick attribute,
// headerActions is optional extra markup appended after the Run button
// (skipped when blank), and body is the card content. All arguments are
// inserted verbatim, without HTML escaping.
func renderSATCard(id, label, runAction, headerActions, body string) string {
	buttons := fmt.Sprintf(`<button id="sat-btn-%s" class="btn btn-primary btn-sm" onclick="%s">Run</button>`, id, runAction)
	if extra := headerActions; strings.TrimSpace(extra) != "" {
		buttons += extra
	}

	var card strings.Builder
	card.WriteString(`<div class="card"><div class="card-head card-head-actions"><span>`)
	card.WriteString(label)
	card.WriteString(`</span><div class="card-head-buttons">`)
	card.WriteString(buttons)
	card.WriteString(`</div></div><div class="card-body validate-card-body">`)
	card.WriteString(body)
	card.WriteString(`</div></div>`)
	return card.String()
}
|
||||
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user